In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import re
import seaborn as sns
from datetime import datetime

#Sklearn
from sklearn.metrics.pairwise import cosine_similarity

#Graphic library
from wordcloud import WordCloud

In [2]:
df_item = pd.read_csv('ML_items.csv')
df_inter = pd.read_csv('ML_interactions.csv')
df_user = pd.read_csv('ML_users.csv')

In [3]:
df_item_num = df_item.drop(['Title','ReleaseDate', 'VideoReleaseDate', 'URL'], axis=1)
mean_item = df_item.mean()
std_item = df_item.std()
z_score_item = abs(df_item_num - mean_item) / mean_item

  mean_item = df_item.mean()
  std_item = df_item.std()


In [4]:
df_item.shape

(1682, 24)

In [5]:
#Delete feature "VideoReleaseDate" and 'URL'.
df_item = df_item.drop(['VideoReleaseDate', 'URL'], axis=1)

# df_item = df_item.dropna()

In [6]:
df_item.shape

(1682, 22)

**That is to say: these two lines contains the same movie but with different MovieId.**

In [7]:
df_item_use = df_item.copy()

In [8]:
df_inter

Unnamed: 0,UserId,MovieId,Ratings,Timestamps
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


## Algorithm 1:  Character Matching

In [9]:
df_item.head()

Unnamed: 0,Id,Title,ReleaseDate,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


**That is to say: these two lines contains the same movie but with different MovieId.**

Create a dataset with the following columns: <br/>
<li>Id</li>
<li>Title</li>
<li>ReleaseDate</li>
<li>Type</li>

**De-ont-hot encoding**

In [10]:
movie_type = df_item.iloc[:, 3:].idxmax(1)

In [11]:
df_item_list = df_item.iloc[:, :3] 
df_item_list['ReleaseDate'] = pd.to_datetime(df_item_list['ReleaseDate'])
df_item_list['Type'] = movie_type

In [12]:
df_item_list.head()

Unnamed: 0,Id,Title,ReleaseDate,Type
0,1,Toy Story (1995),1995-01-01,Animation
1,2,GoldenEye (1995),1995-01-01,Action
2,3,Four Rooms (1995),1995-01-01,Thriller
3,4,Get Shorty (1995),1995-01-01,Action
4,5,Copycat (1995),1995-01-01,Crime


In [13]:
types = list(df_item_list['Type'].value_counts().index)

In [14]:
max(df_item_list['ReleaseDate'].dt.year)

1998.0

In [15]:
# while True:
#     try:
#         input_year = int(input("Please input the year you wanna search from:"))
#         if input_year > max(df_item_list['ReleaseDate'].dt.year):
#             print("There is no movies later than this year.\n Please input again")
#             continue
#         input_type = str(input("Please input the type of movie you wanna search:"))
#     except ValueError:
#         print('Please input a valid year/type but not leave it blank.')
#         continue

#     except:
#         print('Check other possibilities.')
#         continue
    
    
    
#     if input_type not in types:
#         print("This type is not exist, please input again.")
#         continue

#     df_search = df_item_list[(df_item_list['ReleaseDate'].dt.year > input_year) & (df_item_list['Type'] == input_type)]
#     print("Here is the search result: \n", df_search)
#     break

## Algorithm 2: Rank the frequency of watching ordered by month.

In [16]:
df_inter_new = df_inter.iloc[:, :2]

In [17]:
df_inter_new['MovieId'].value_counts()

50      583
258     509
100     508
181     507
294     485
       ... 
852       1
1505      1
1653      1
1452      1
1641      1
Name: MovieId, Length: 1682, dtype: int64

In [18]:
dates = df_inter['Timestamps'].apply(lambda x: datetime.fromtimestamp(x).strftime('%Y-%m-%d'))
years, months = pd.to_datetime(dates).dt.year, pd.to_datetime(dates).dt.month
df_inter_new['Year'] = years
df_inter_new['Month'] = months

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_inter_new['Year'] = years


In [19]:
df_inter_new.groupby(['MovieId', 'Month'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fd300563220>

In [20]:
df_inter_group = df_inter_new[['MovieId', 'Month']].value_counts().groupby(['MovieId']).apply(lambda x: x)

In [21]:
pd.DataFrame(df_inter_group)

Unnamed: 0_level_0,Unnamed: 1_level_0,0
MovieId,Month,Unnamed: 2_level_1
50,11,154
100,11,135
181,11,134
286,11,128
1,11,122
...,...,...
1102,2,1
1100,11,1
1100,10,1
1100,4,1


## Algorithm 3: the calculation of cosine similarities

In [22]:
df_item_al3 = df_item.iloc[:, 3:]
df_item_al3.head()

Unnamed: 0,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [23]:
df_item_al3.iloc[0].to_numpy().reshape(1, -1)

array([[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [24]:
#Cosine Similarity
#Line being compared
num = 0
X = df_item_al3.iloc[0].to_numpy()
X = np.reshape(X, (1, -1))

for index, row in df_item_al3.iterrows():
    Y = row.to_numpy()
    Y = np.reshape(Y, (1, -1))
    cos_sim = cosine_similarity(X, Y)
    print('The cosine similarity of movie 1 and movie {0} is: {1}.'.format(index+1, cos_sim))
    num +=1
    if num >= 4:
        break

The cosine similarity of movie 1 and movie 1 is: [[1.]].
The cosine similarity of movie 1 and movie 2 is: [[0.]].
The cosine similarity of movie 1 and movie 3 is: [[0.]].
The cosine similarity of movie 1 and movie 4 is: [[0.33333333]].


In [25]:
#For the whole dataset
cos_sim_all = cosine_similarity(df_item_al3)
np.fill_diagonal(cos_sim_all, 0)
cos_sim_all = pd.DataFrame(cos_sim_all)

In [26]:
pd.DataFrame(cos_sim_all)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,0.000000,0.000000,0.000000,0.333333,0.000000,0.000000,0.000000,0.666667,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.57735,0.000000
1,0.000000,0.000000,0.577350,0.333333,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.816497,0.000000,0.000000,0.000000,0.000000,0.000000,0.408248,0.000000,0.00000,0.000000
2,0.000000,0.577350,0.000000,0.000000,0.577350,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.707107,0.000000,0.00000,0.000000
3,0.333333,0.333333,0.000000,0.000000,0.333333,0.577350,0.408248,0.666667,0.577350,0.408248,...,0.408248,0.577350,0.577350,0.577350,0.577350,0.577350,0.000000,0.408248,0.57735,0.577350
4,0.000000,0.333333,0.577350,0.333333,0.000000,0.577350,0.408248,0.333333,0.577350,0.408248,...,0.408248,0.577350,0.577350,0.577350,0.577350,0.577350,0.408248,0.408248,0.00000,0.577350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,0.000000,0.000000,0.000000,0.577350,0.577350,1.000000,0.707107,0.577350,1.000000,0.707107,...,0.000000,1.000000,1.000000,1.000000,1.000000,0.000000,0.000000,0.707107,0.00000,1.000000
1678,0.000000,0.408248,0.707107,0.000000,0.408248,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.500000,0.00000,0.000000
1679,0.000000,0.000000,0.000000,0.408248,0.408248,0.707107,0.500000,0.408248,0.707107,0.500000,...,0.000000,0.707107,0.707107,0.707107,0.707107,0.707107,0.500000,0.000000,0.00000,0.707107
1680,0.577350,0.000000,0.000000,0.577350,0.000000,0.000000,0.000000,0.577350,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000


In [27]:
print('The TOP10 similar movies with movie0:{}'.format(np.argsort(cos_sim_all.iloc[0, :]).to_numpy()[::-1][:10]))

The TOP10 similar movies with movie0:[ 421 1218   94  260  403  928  101  224  168 1031]


In [28]:
df_inter_new

Unnamed: 0,UserId,MovieId,Year,Month
0,196,242,1997,12
1,186,302,1998,4
2,22,377,1997,11
3,244,51,1997,11
4,166,346,1998,2
...,...,...,...,...
99995,880,476,1997,11
99996,716,204,1997,11
99997,276,1090,1997,9
99998,13,225,1997,12


In [29]:
df_merge_iter = df_inter_new.drop(['Year', "Month"],axis=1).merge(df_item.drop(['ReleaseDate'], axis=1), left_on='MovieId', right_on='Id')
df_merge_iter = df_merge_iter.drop(['Id'], axis=1)

In [30]:
df_merge_iter.head()

Unnamed: 0,UserId,MovieId,Title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,Kolya (1996),0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,63,242,Kolya (1996),0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,226,242,Kolya (1996),0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,154,242,Kolya (1996),0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,306,242,Kolya (1996),0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
movie_id1 = df_merge_iter[df_merge_iter['UserId'] == 1]

In [32]:
#User profile
user1_pre_rate = df_merge_iter[df_merge_iter['UserId'] == 1].mean()[2:]

  user1_pre_rate = df_merge_iter[df_merge_iter['UserId'] == 1].mean()[2:]


In [33]:
user1_pre_rate.shape

(19,)

In [34]:
u1_watched = df_merge_iter[df_merge_iter['UserId'] == 1]['MovieId'].values

In [35]:
df_item_new = df_item.drop(columns=['ReleaseDate'])
df_item_new

Unnamed: 0,Id,Title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
#Delete all the movies user1 watched
df_item_new = df_item_new.drop(df_item_new[df_item_new['Id'].apply(lambda x: x in u1_watched)].index)

In [37]:
df_item_new

Unnamed: 0,Id,Title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
272,273,Heat (1995),0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
273,274,Sabrina (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
274,275,Sense and Sensibility (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
275,276,Leaving Las Vegas (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
276,277,Restoration (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
#Similarity of user's profile and all the movies.
# user1_pre_rate = user1_pre_rate.to_numpy().reshape(1, -1)
sim_u1 = cosine_similarity([user1_pre_rate], df_item_new.iloc[:, 2:]).reshape(-1, 1)
df_sim_u1 = pd.DataFrame(df_item_new.iloc[:, :2])
df_sim_u1['Similarity'] = sim_u1
df_sim_u1

Unnamed: 0,Id,Title,Similarity
272,273,Heat (1995),0.462156
273,274,Sabrina (1995),0.502717
274,275,Sense and Sensibility (1995),0.562299
275,276,Leaving Las Vegas (1995),0.562299
276,277,Restoration (1995),0.563494
...,...,...,...
1677,1678,Mat' i syn (1997),0.563494
1678,1679,B. Monkey (1998),0.357488
1679,1680,Sliding Doors (1998),0.562299
1680,1681,You So Crazy (1994),0.479233


In [39]:
recommend_u1 = df_sim_u1.sort_values("Similarity", inplace=False, ascending=False)[:10]


In [40]:
recommend_u1

Unnamed: 0,Id,Title,Similarity
1137,1138,Best Men (1997),0.784678
336,337,"House of Yes, The (1997)",0.760124
1295,1296,Indian Summer (1996),0.737319
1236,1237,Twisted (1996),0.737319
736,737,Sirens (1994),0.737319
1294,1295,Kicked in the Head (1997),0.737319
1338,1339,Stefano Quantestorie (1993),0.737319
1102,1103,Trust (1990),0.737319
597,598,"Big Squeeze, The (1996)",0.737319
1587,1588,Salut cousin! (1996),0.737319


In [41]:
#Recommend him new movies except for the ones he watched.
print("Books will be recommended to user1 are: \n {0}".format(recommend_u1['Title'].apply(lambda x: x)))

Books will be recommended to user1 are: 
 1137                Best Men (1997)
336        House of Yes, The (1997)
1295           Indian Summer (1996)
1236                 Twisted (1996)
736                   Sirens (1994)
1294      Kicked in the Head (1997)
1338    Stefano Quantestorie (1993)
1102                   Trust (1990)
597         Big Squeeze, The (1996)
1587           Salut cousin! (1996)
Name: Title, dtype: object


## Algorithm 4: Collaborative filtering

In [42]:
df_inter.shape

(100000, 4)

### 4.1

In [43]:
#Create a pivot to set 'UserId' as the index, 'MovieId' as the column and "ratings" as values.
#Then fill all the Nan values with 0.
df_pivot = df_inter[['UserId', 'MovieId', 'Ratings']].pivot(index='UserId', columns="MovieId", values='Ratings').fillna(0)

In [44]:
df_pivot

MovieId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 4.2

In [45]:
#Calculate the cosine similarity of this pivot. It will use the index to calculate the cs with values by default.
sim_al4 = cosine_similarity(df_pivot)

In [46]:
#Then change all values diagonally with 0
np.fill_diagonal(sim_al4, 0)
#Transform it to a dataframe
sim_al4 = pd.DataFrame(sim_al4)

In [47]:
sim_al4

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,933,934,935,936,937,938,939,940,941,942
0,0.000000,0.166931,0.047460,0.064358,0.378475,0.430239,0.440367,0.319072,0.078138,0.376544,...,0.369527,0.119482,0.274876,0.189705,0.197326,0.118095,0.314072,0.148617,0.179508,0.398175
1,0.166931,0.000000,0.110591,0.178121,0.072979,0.245843,0.107328,0.103344,0.161048,0.159862,...,0.156986,0.307942,0.358789,0.424046,0.319889,0.228583,0.226790,0.161485,0.172268,0.105798
2,0.047460,0.110591,0.000000,0.344151,0.021245,0.072415,0.066137,0.083060,0.061040,0.065151,...,0.031875,0.042753,0.163829,0.069038,0.124245,0.026271,0.161890,0.101243,0.133416,0.026556
3,0.064358,0.178121,0.344151,0.000000,0.031804,0.068044,0.091230,0.188060,0.101284,0.060859,...,0.052107,0.036784,0.133115,0.193471,0.146058,0.030138,0.196858,0.152041,0.170086,0.058752
4,0.378475,0.072979,0.021245,0.031804,0.000000,0.237286,0.373600,0.248930,0.056847,0.201427,...,0.338794,0.080580,0.094924,0.079779,0.148607,0.071459,0.239955,0.139595,0.152497,0.313941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0.118095,0.228583,0.026271,0.030138,0.071459,0.111852,0.107027,0.095898,0.039852,0.071460,...,0.066039,0.431154,0.258021,0.226449,0.432666,0.000000,0.087687,0.180029,0.043264,0.144250
939,0.314072,0.226790,0.161890,0.196858,0.239955,0.352449,0.329925,0.246883,0.120495,0.342961,...,0.327153,0.107024,0.187536,0.181317,0.175158,0.087687,0.000000,0.145152,0.261376,0.241028
940,0.148617,0.161485,0.101243,0.152041,0.139595,0.144446,0.059993,0.146145,0.143245,0.090305,...,0.046952,0.203301,0.288318,0.234211,0.313400,0.180029,0.145152,0.000000,0.101642,0.095120
941,0.179508,0.172268,0.133416,0.170086,0.152497,0.317328,0.282003,0.175322,0.092497,0.212330,...,0.226440,0.073513,0.089588,0.129554,0.099385,0.043264,0.261376,0.101642,0.000000,0.182465


### 4.3

In [48]:
# Sort the first column by the similarity, get the TOP 20 userId that has the biggest similarity with User1.
uid_al4_u1 = sim_al4[0].sort_values(ascending=False)[:20].index

In [49]:
uid_al4_u1

Int64Index([915, 863, 267,  91, 434, 456, 737, 428, 302, 275, 888, 822, 386,
            513, 292, 681, 885, 726, 300, 416],
           dtype='int64')

In [50]:
df_item.shape

(1682, 22)

In [51]:
#Get the idList and info of those TOP20 users.
df_user[df_user['Id'].apply(lambda x: True if x in uid_al4_u1 else False)].reindex(uid_al4_u1 - 1)

Unnamed: 0,Id,Age,Sex,Occupation,Code
914,915,50,M,entertainment,60614
862,863,17,M,student,60089
266,267,23,M,engineer,83716
90,91,55,M,marketing,1913
433,434,16,F,student,49705
455,456,24,M,technician,31820
736,737,30,M,programmer,98072
427,428,28,M,student,55414
301,302,42,M,educator,77904
274,275,38,M,engineer,92064


### 4.4

In [52]:
#Get the pivot row for TOP20 similar users with User1 (rows of information)
df_u1_pivot = df_pivot.iloc[uid_al4_u1, :]
#To replace 0s with Nan.
df_u1_pivot = df_u1_pivot.replace(0, np.NaN)

In [53]:
avg_ratings = np.nanmean(df_u1_pivot, axis=0)

  avg_ratings = np.nanmean(df_u1_pivot, axis=0)


In [54]:
avg_ratings

array([4.  , 3.25, 2.75, ...,  nan,  nan, 3.  ])

In [55]:
avg_ratings.shape

(1682,)

In [56]:
u1_ratings = pd.DataFrame(df_item['Id'])
u1_ratings['avg_rating'] = np.round(avg_ratings, 2)
u1_ratings

Unnamed: 0,Id,avg_rating
0,1,4.00
1,2,3.25
2,3,2.75
3,4,3.79
4,5,3.09
...,...,...
1677,1678,
1678,1679,
1679,1680,
1680,1681,


In [57]:
#Get the TOP 20 highest ratings of items for User1
u1_recommend_id = u1_ratings.sort_values('avg_rating', ascending=False)[:20]['Id']

In [58]:

df_item_use[df_item_use['Id'].apply(lambda x: True if x in u1_recommend_id else False)].reindex(u1_recommend_id)

Unnamed: 0_level_0,Id,Title,ReleaseDate,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
602,,,,,,,,,,,...,,,,,,,,,,
1007,1008.0,I Shot Andy Warhol (1996),01-May-1996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
492,,,,,,,,,,,...,,,,,,,,,,
853,,,,,,,,,,,...,,,,,,,,,,
1168,,,,,,,,,,,...,,,,,,,,,,
1143,,,,,,,,,,,...,,,,,,,,,,
285,,,,,,,,,,,...,,,,,,,,,,
641,,,,,,,,,,,...,,,,,,,,,,
114,,,,,,,,,,,...,,,,,,,,,,
1589,,,,,,,,,,,...,,,,,,,,,,


## Algorithm 5: Item-based recommendation

### 5.1

In [59]:
al5_pivot_cos = df_inter[['UserId', 'MovieId', 'Ratings']].pivot(index='MovieId', columns='UserId', values='Ratings').fillna(0)


UserId,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
MovieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,4.0,...,2.0,3.0,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0
2,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,...,5.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
5,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1679,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1680,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
al5_cos = cosine_similarity(al5_pivot_cos)
np.fill_diagonal(al5_cos, 0)

In [61]:
al5_cos = pd.DataFrame(al5_cos)

In [62]:
al5_cos

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,0.000000,0.402382,0.330245,0.454938,0.286714,0.116344,0.620979,0.481114,0.496288,0.273935,...,0.035387,0.0,0.000000,0.000000,0.035387,0.0,0.0,0.0,0.047183,0.047183
1,0.402382,0.000000,0.273069,0.502571,0.318836,0.083563,0.383403,0.337002,0.255252,0.171082,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.078299,0.078299
2,0.330245,0.273069,0.000000,0.324866,0.212957,0.106722,0.372921,0.200794,0.273669,0.158104,...,0.000000,0.0,0.000000,0.000000,0.032292,0.0,0.0,0.0,0.000000,0.096875
3,0.454938,0.502571,0.324866,0.000000,0.334239,0.090308,0.489283,0.490236,0.419044,0.252561,...,0.000000,0.0,0.094022,0.094022,0.037609,0.0,0.0,0.0,0.056413,0.075218
4,0.286714,0.318836,0.212957,0.334239,0.000000,0.037299,0.334769,0.259161,0.272448,0.055453,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.094211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,1.0,1.0,0.000000,0.000000
1678,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,1.0,0.0,1.0,0.000000,0.000000
1679,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,1.0,1.0,0.0,0.000000,0.000000
1680,0.047183,0.078299,0.000000,0.056413,0.000000,0.000000,0.051498,0.082033,0.057360,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000


In [103]:
al5_cos[0][1]

0.40238217829960954

In [80]:
len(al5_cos)

1682

### 5.2

In [76]:
#Select ratings for all movies given by User1
ratings_u1_al5 = df_pivot.iloc[0, :]

In [78]:
#Select out the movieId that User1 didn't give ratings (rating = 0).
u1_unseen_id_al5 = ratings_u1_al5[ratings_u1 == 0].index

In [107]:
#Predict the missing ratings of User1 by calculating the ∑(weight(rating) * cosine_similarity) / ∑(cosine_similarity)
for row in range(len(al5_cos)): 
    # If the id is not in the unseen list, replace it will the result of calculation
    if row not in u1_unseen_id_al5:
        weighted_sum, sim_sum = 0, 0
        for col in range(len(al5_cos)):
            weighted_sum += ratings_u1_al5 * al5_cos[row]
            sim_sum += al5_cos[row][col]
        ratings_u1_al5[row] = weighted_sum / sim_sum
    else:
        continue

KeyboardInterrupt: 

In [99]:
ratings_u1_al5[1000:1050]

MovieId
1001    0.0
1002    0.0
1003    0.0
1004    0.0
1005    0.0
1006    0.0
1007    0.0
1008    0.0
1009    0.0
1010    0.0
1011    0.0
1012    0.0
1013    0.0
1014    0.0
1015    0.0
1016    0.0
1017    0.0
1018    0.0
1019    0.0
1020    0.0
1021    0.0
1022    0.0
1023    0.0
1024    0.0
1025    0.0
1026    0.0
1027    0.0
1028    0.0
1029    0.0
1030    0.0
1031    0.0
1032    0.0
1033    0.0
1034    0.0
1035    0.0
1036    0.0
1037    0.0
1038    0.0
1039    0.0
1040    0.0
1041    0.0
1042    0.0
1043    0.0
1044    0.0
1045    0.0
1046    0.0
1047    0.0
1048    0.0
1049    0.0
1050    0.0
Name: 1, dtype: float64

In [98]:
al5_pivot_cos.iloc[1000:1050,0]

MovieId
1001    0.0
1002    0.0
1003    0.0
1004    0.0
1005    0.0
1006    0.0
1007    0.0
1008    0.0
1009    0.0
1010    0.0
1011    0.0
1012    0.0
1013    0.0
1014    0.0
1015    0.0
1016    0.0
1017    0.0
1018    0.0
1019    0.0
1020    0.0
1021    0.0
1022    0.0
1023    0.0
1024    0.0
1025    0.0
1026    0.0
1027    0.0
1028    0.0
1029    0.0
1030    0.0
1031    0.0
1032    0.0
1033    0.0
1034    0.0
1035    0.0
1036    0.0
1037    0.0
1038    0.0
1039    0.0
1040    0.0
1041    0.0
1042    0.0
1043    0.0
1044    0.0
1045    0.0
1046    0.0
1047    0.0
1048    0.0
1049    0.0
1050    0.0
Name: 1, dtype: float64

In [None]:
ratings_u1_a5