In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from keras.models import Sequential
from keras.layers import Dense
from keras.models import load_model
from keras.optimizers import Adam

import pickle

import warnings

# Load and clean data

In [22]:
def convertDates(series):
    monthDict = {"Jan" : 1, "Feb" : 2, "Mar" : 3, "Apr" : 4, "May" : 5, "Jun" : 6, "Jul" : 7, "Aug" : 8, "Sep" : 9, "Oct" : 10, "Nov" : 11, "Dec" : 12}
    dates = []
    for val in series:
#         print(val)
        if (type(val) != type(0.1)):
            date = val.split("-")
            numDate = int(date[0]) + 100 * monthDict[date[1]] + 10000 * int(date[2])
            dates.append(numDate)
        else:
            dates.append(-1)
    return dates

# load all data
data0 = pd.read_table("./archive/ml-100k/u.data", delimiter="\t", names=["userId", "itemId", "rating", "timestamp"])
users0 = pd.read_table("./archive/ml-100k/u.user", delimiter="|", names=["userId", "age", "gender", "occupation", "zip"])
movies0 = pd.read_table("./archive/ml-100k/u.item", delimiter="|", names=["movieId", "title", "movieRelease", "videoRelease", "imbd", "unknown", "action", "adventure", "animation", "children", "comedy", "crime", "documentary", "drama", "fantasy", "noir", "horror", "musical", "mystery", "romance", "scifi", "thriller", "war", "western"])

# create copies of data to clean
movies = movies0.drop(["videoRelease", "imbd", "movieRelease"], axis=1).dropna()
# movies.movieRelease = convertDates(movies.movieRelease)

In [23]:
movies.head()

Unnamed: 0,movieId,title,unknown,action,adventure,animation,children,comedy,crime,documentary,...,fantasy,noir,horror,musical,mystery,romance,scifi,thriller,war,western
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


# Cluster the movies

In [24]:
# create X
X = movies.to_numpy()[:, 2:]

maxK = 0
maxScore = 0
maxModel = 0

for k in range(10, 21):
    kmeans = KMeans(n_clusters = k)
    print(k)
    kmeans.fit(X)
    score = silhouette_score(X, kmeans.labels_, metric = 'euclidean')
    if score > maxScore:
        maxScore = score
        maxModel = kmeans
        maxK = k
        
movies["Cluster"] = maxModel.predict(X)
movies.head()

10
11
12
13
14
15
16
17
18
19
20


Unnamed: 0,movieId,title,unknown,action,adventure,animation,children,comedy,crime,documentary,...,noir,horror,musical,mystery,romance,scifi,thriller,war,western,Cluster
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,4
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,8
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,2
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,11
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,12


In [25]:
movies.Cluster.value_counts()

1     427
0     273
10    123
17     82
2      81
11     78
8      66
3      61
6      59
12     57
5      52
13     48
9      47
14     42
7      41
4      38
16     33
18     29
15     24
19     21
Name: Cluster, dtype: int64

# Generate reccomendations

In [79]:
# grab random user
user = np.random.choice(users0.userId)
# grab all movies ranked by that user
data = data0[data0.userId == user].drop(["userId", "timestamp"], axis=1)

# find cluster for each movie
clusters = []
for item in data.itemId:
    clusters.append(int(movies[movies["movieId"] == item].Cluster))

data["Clusters"] = clusters

# compute the averages for each cluster
warnings.filterwarnings('ignore')
averages = np.zeros((maxK))
for i in range(maxK):
    rated = data[data["Clusters"] == i].rating.to_numpy()
    if len(rated.flatten()) != 0:
        averages[i] = np.mean(rated)
    else:
        averages[i] = 0

In [83]:
favoredClusters = np.where(averages > 3.5)[0]
favoriteCluster = np.argmax(averages)
recommendations = movies[movies["Cluster"] == favoredClusters[0]].to_numpy()
favorites = movies[movies["Cluster"] == favoriteCluster]
for c in favoredClusters[1:]:
    likedMovies = movies[movies["Cluster"] == c].to_numpy()
    recommendations = np.vstack((recommendations, likedMovies))
    
recommendations = pd.DataFrame(recommendations, columns=movies.columns)
favorites[["movieId", "title"]].head(30)

Unnamed: 0,movieId,title
1,2,GoldenEye (1995)
20,21,Muppet Treasure Island (1996)
23,24,Rumble in the Bronx (1995)
28,29,Batman Forever (1995)
49,50,Star Wars (1977)
61,62,Stargate (1994)
81,82,Jurassic Park (1993)
100,101,Heavy Metal (1981)
109,110,Operation Dumbo Drop (1995)
116,117,"Rock, The (1996)"


In [86]:
recommendations[["movieId", "title"]].head(10)

Unnamed: 0,movieId,title
0,13,Mighty Aphrodite (1995)
1,25,"Birdcage, The (1996)"
2,26,"Brothers McMullen, The (1995)"
3,40,"To Wong Foo, Thanks for Everything! Julie Newm..."
4,41,Billy Madison (1995)
5,42,Clerks (1994)
6,67,Ace Ventura: Pet Detective (1994)
7,72,"Mask, The (1994)"
8,80,Hot Shots! Part Deux (1993)
9,85,"Ref, The (1994)"


In [82]:
np.argmax(averages)

8

In [80]:
averages

array([4.5       , 3.84615385, 3.25      , 4.        , 3.        ,
       0.        , 0.        , 4.        , 4.6       , 0.        ,
       4.        , 4.        , 4.33333333, 3.5       , 3.        ,
       0.        , 3.5       , 2.        , 0.        , 0.        ])