In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from keras.models import Sequential
from keras.layers import Dense
from keras.models import load_model
from keras.optimizers import Adam

import pickle

# Step 1
### Importing the data

In [2]:
# load all data
data0 = pd.read_table("./archive/ml-100k/u.data", delimiter="\t", names=["userId", "itemId", "rating", "timestamp"])
users0 = pd.read_table("./archive/ml-100k/u.user", delimiter="|", names=["userId", "age", "gender", "occupation", "zip"])
movies0 = pd.read_table("./archive/ml-100k/u.item", delimiter="|", names=["movieId", "title", "movieRelease", "videoRelease", "imbd", "unknown", "action", "adventure", "animation", "children", "comedy", "crime", "documentary", "drama", "fantasy", "noir", "horror", "musical", "mystery", "romance", "scifi", "thriller", "war", "western"])

# Step 2:
### Cleaning the data

In [3]:
def convertDates(series):
    monthDict = {"Jan" : 1, "Feb" : 2, "Mar" : 3, "Apr" : 4, "May" : 5, "Jun" : 6, "Jul" : 7, "Aug" : 8, "Sep" : 9, "Oct" : 10, "Nov" : 11, "Dec" : 12}
    dates = []
    for val in series:
#         print(val)
        if (type(val) != type(0.1)):
            date = val.split("-")
            numDate = int(date[0]) + 100 * monthDict[date[1]] + 10000 * int(date[2])
            dates.append(numDate)
        else:
            dates.append(-1)
    return dates

def convertZips(series):
    zips = []
    for val in series:
        try:
            zips.append(int(val))
        except:
            zips.append(-1)
    return zips




In [4]:
# clean data
data = data0.drop("timestamp", axis=1)
data.head()

Unnamed: 0,userId,itemId,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [5]:
# replace type of zip
users = users0.copy()
users.zip = convertZips(users.zip)
# replace gender for numbers
users.gender.replace({"M" : -1, "F" : 1}, inplace=True)
# replace occupations
users.drop(["occupation", "zip"], axis=1, inplace=True)

users.head()

Unnamed: 0,userId,age,gender
0,1,24,-1
1,2,53,1
2,3,23,-1
3,4,24,-1
4,5,33,1


In [6]:
# drop imbd and videoRelease and title
movies = movies0.drop(["videoRelease", "imbd", "title", "movieRelease"], axis=1)
# change release date
# movies.movieRelease = convertDates(movies.m|ovieRelease)

movies.head()

Unnamed: 0,movieId,unknown,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,noir,horror,musical,mystery,romance,scifi,thriller,war,western
0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [7]:
xUsers = users.to_numpy()[:, 1:]
xMovies = movies.to_numpy()[:, 1:]
xData = data.to_numpy()[:, 2]

rows = len(data)
cols = np.shape(xUsers)[1] + np.shape(xMovies)[1] + 1

X = np.zeros((rows, cols))

userIds = data.userId.values
movieIds = data.itemId.values
ratings = data.rating.values

for i in range(rows):
#     print(i)
    u = userIds[i]
    m = movieIds[i]
    X[i] = np.hstack((xUsers[u - 1, :], xMovies[m - 1, : ], ratings[i]))

DATA = pd.DataFrame(X, columns=["age", "gender", "unknown", "action", "adventure", "animation", "children", "comedy", "crime", "documentary", "drama", "fantasy", "noir", "horror", "musical", "mystery", "romance", "scifi", "thriller", "war", "western", "rating"])

X = DATA.to_numpy()
DATA.head()

Unnamed: 0,age,gender,unknown,action,adventure,animation,children,comedy,crime,documentary,...,noir,horror,musical,mystery,romance,scifi,thriller,war,western,rating
0,49.0,-1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
1,39.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0
2,25.0,-1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,28.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,2.0
4,47.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Step 3
### Splitting the Data

In [8]:
points = X[:, : -1]
targets = X[:, -1:]
# targets = OneHotEncoder().fit(targets).transform(targets).toarray()

xTrain, xTest, yTrain, yTest = train_test_split(points, targets, test_size=0.2)

In [9]:
yTrain

array([[4.],
       [5.],
       [4.],
       ...,
       [5.],
       [2.],
       [4.]])

# Step 4
### Training the Model

In [12]:

try:
    1 / 0
    model = load_model("myModel")
except:
    model = Sequential()
    model.add(Dense(16, input_dim=21, activation="relu"))
    model.add(Dense(32, activation="relu"))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(1, activation="sigmoid"))
    myOptimizer = Adam(learning_rate=0.001)
    model.compile(loss="binary_crossentropy", optimizer=myOptimizer, metrics=["accuracy"])
    model.fit(xTrain, yTrain, epochs=30, batch_size=50)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30

KeyboardInterrupt: 

In [None]:
# save model
model.save("myModel")

In [None]:
# predict
yPredict = model.predict(xTest)

accuracy_score(yTest, yPredict > 0.5)
# yPredict > 0.5

# Step 5
### Clustering users

In [None]:
filename = 'finalized_kmeans.sav'
try:
    maxModel = pickle.load(open(filename, "rb"))
    loaded = True
except:
    loaded = False

In [None]:
if not loaded:
    maxScore = 0
    maxModel = 0 # dummy value

    for i in range(2, 5):
        kmeans = KMeans(n_clusters = i)
        print(i)
        kmeans.fit(X[:, 0:4])
        score = silhouette_score(X[:, 0:4], kmeans.labels_, metric = 'euclidean')
        if score > maxScore:
            maxScore = score
            maxModel = kmeans

In [None]:
if not loaded:
    for i in range(5, 11):
        kmeans = KMeans(n_clusters = i)
        print(i)
        kmeans.fit(X[:, 0:4])
        score = silhouette_score(X[:, 0:4], kmeans.labels_, metric = 'euclidean')
        if score > maxScore:
            maxScore = score
            maxModel = kmeans

In [None]:
pickle.dump(maxModel, open(filename, 'wb'))

In [None]:
predictions = maxModel.predict(X[:, 0:4])
DATA["Cluster"] = predictions
DATA.head()

In [None]:
DATA.Cluster.value_counts()

In [None]:
movie = np.random.choice(movies.movieId)
user = np.random.choice(users.userId)

user = users[users["userId"] == user].drop("userId", axis=1)
movie = movies[movies["movieId"] == movie].drop("movieId", axis=1)

# create feature vector
feature = np.hstack((user, movie))
prediction = model.predict(feature)
prediction = prediction > 0.5
cluster = maxModel.predict(user)[0]

if prediction:
    print("Recommend to all users with cluster {}".format(cluster))