In [1]:
import pandas as pd

In [2]:
movies = pd.read_json("movies.json")
reviews = pd.read_json("reviews.json")

In [3]:
movies.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [4]:
reviews.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,1997-12-04 15:55:49
1,0,172,5,1997-12-04 15:55:49
2,0,133,1,1997-12-04 15:55:49
3,196,242,3,1997-12-04 15:55:49
4,186,302,3,1998-04-04 19:22:22


In [5]:
reviews = reviews.drop(['timestamp'], axis=1)

In [6]:
# Wrangle plan
# 1. Get the N movies that have the most reviews (reduces dimensionality and that's important when thinking about euclidean distance)
# 2. Use these top movies to cluster users based on their rating similarities
# 3. Make a recommendation based on what other users in the cluster have reviewed highly

In [None]:
# get counts of each unique movie
# Note this will take about 30 minutes
counts = []
for movie in range(1682):
    tally = 0
    for idx in range(len(reviews)):
        if reviews.iloc[idx]['item_id'] == movie:
            tally = tally + 1
    counts.append(tally)
    print("Movie " + str(movie) + " done!")

In [8]:
#Let's do this instead
pivotDF = reviews.pivot(index='user_id', columns='item_id', values='rating')
pivotDF.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [14]:
# Get the item_id with count
mostReviewed = 3
topN = reviews.groupby('item_id').size()

In [29]:
topN.head()

item_id
1    450
2    131
3     90
4    208
5     86
dtype: int64

In [None]:
# Get a dataframe with only topN movies included
topOnlyDF = reviews[reviews["item_id"].isin(topN["item_id"])]

In [None]:
topOnlyDF.head() # Need that pivot again

In [None]:
topOnlyDF = topOnlyDF.pivot(index='user_id', columns='item_id', values='rating')

In [None]:
# People who don't review anything aren't useful to us. Let's drop them.
topOnlyDF = topOnlyDF.dropna(axis=0, how='all')

In [None]:
# The people who are left might still have NaNs. Let's fill those with a number so we can cluster!
topOnlyDF = topOnlyDF.fillna(-1)

In [None]:
topOnlyDF.head()

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
agg = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
agg = agg.fit(topOnlyDF)
agg.labels_

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# We want something as close to 1 as possible
ss_avg = silhouette_score(topOnlyDF, agg.labels_)

In [None]:
ss_avg # Can we do better?

In [None]:
scores = {}
clusters = len(topOnlyDF)
for i in range(2, clusters):
    agg = AgglomerativeClustering(n_clusters=i, affinity='euclidean', linkage='ward')
    agg = agg.fit(topOnlyDF)
    ss_avg = silhouette_score(topOnlyDF, agg.labels_)
    scores[i] = ss_avg

In [None]:
import seaborn as sns
sns.scatterplot(x=scores.keys(), y=scores.values()) # Somewhere around 180

In [None]:
# choose a n_cluster value
agg = AgglomerativeClustering(n_clusters=180, affinity='euclidean', linkage='ward')
agg = agg.fit(topOnlyDF)

In [None]:
topOnlyDF["clusters"] = agg.labels_
topOnlyDF.reset_index(inplace=True)
topOnlyDF.head()

In [None]:
# Let's look at a specific user right now
user = 365
topOnlyDF[topOnlyDF['user_id'] == user]

In [None]:
# Now, let's get users in same cluster
cluster = 77
user_ids = topOnlyDF[topOnlyDF['clusters'] == cluster]
user_ids

In [None]:
reviews[reviews["user_id"] == 24]

In [None]:
u365 = reviews[reviews["user_id"] == 365]

In [None]:
u365[u365['item_id'] == 64]

In [None]:
movies[movies["item_id"] == 64]