In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# NOTE(review): with no category/message/module filter this suppresses every
# warning for the rest of the session (deprecation notices included);
# consider narrowing it to the specific warning that was noisy.
warnings.filterwarnings("ignore")

In [2]:
# Read the ratings table and the title lookup table (in that order) from the
# current working directory.
user, movies = (pd.read_csv(csv_name) for csv_name in ("user.csv", "movies.csv"))

In [3]:
user.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742


In [4]:
movies.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [5]:
# Attach a title to every rating by joining both tables on their shared
# item_id key (default inner join: rows without a match on either side are dropped).
df = user.merge(movies, on="item_id")

In [6]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,0,50,5,881250949,Star Wars (1977)
1,290,50,5,880473582,Star Wars (1977)
2,79,50,4,891271545,Star Wars (1977)
3,2,50,5,888552084,Star Wars (1977)
4,8,50,5,879362124,Star Wars (1977)


In [7]:
# Remove the timestamp column, which none of the following cells reference.
# Rebinding the name (instead of inplace=True) and tolerating an
# already-removed column keeps this cell safe to re-run: the in-place
# version raised KeyError on a second execution.
df = df.drop(columns="timestamp", errors="ignore")

In [8]:
df.head()

Unnamed: 0,user_id,item_id,rating,title
0,0,50,5,Star Wars (1977)
1,290,50,5,Star Wars (1977)
2,79,50,4,Star Wars (1977)
3,2,50,5,Star Wars (1977)
4,8,50,5,Star Wars (1977)


In [9]:
df.isna().sum()

user_id    0
item_id    0
rating     0
title      0
dtype: int64

In [10]:
df.groupby("title")["rating"].mean().sort_values()

title
Eye of Vichy, The (Oeil de Vichy, L') (1993)    1.0
Butterfly Kiss (1995)                           1.0
Daens (1992)                                    1.0
JLG/JLG - autoportrait de décembre (1994)       1.0
Touki Bouki (Journey of the Hyena) (1973)       1.0
                                               ... 
Someone Else's America (1995)                   5.0
Star Kid (1997)                                 5.0
Santa with Muscles (1996)                       5.0
Prefontaine (1997)                              5.0
Marlene Dietrich: Shadow and Light (1996)       5.0
Name: rating, Length: 1664, dtype: float64

In [11]:
df["title"].value_counts()

title
Star Wars (1977)                   584
Contact (1997)                     509
Fargo (1996)                       508
Return of the Jedi (1983)          507
Liar Liar (1997)                   485
                                  ... 
Leopard Son, The (1996)              1
Stefano Quantestorie (1993)          1
Quartier Mozart (1992)               1
Reluctant Debutante, The (1958)      1
Dadetown (1995)                      1
Name: count, Length: 1664, dtype: int64

In [12]:
# Ratings per title as a two-column frame ("title", "count"), most-rated first.
# reset_index() already returns a DataFrame, so no pd.DataFrame(...) wrapper is
# needed. The axis and value column are named explicitly because the implicit
# "count" column only exists on pandas >= 2.0; older versions would produce
# "index"/"title" and break the movie_count["count"] lookups that follow.
movie_count = df["title"].value_counts().rename_axis("title").reset_index(name="count")

In [13]:
movie_count.head()

Unnamed: 0,title,count
0,Star Wars (1977),584
1,Contact (1997),509
2,Fargo (1996),508
3,Return of the Jedi (1983),507
4,Liar Liar (1997),485


In [14]:
# Titles with at least 50 ratings, selected with a single .loc lookup
# (row mask + column label) instead of two chained [] operations.
movie_count.loc[movie_count["count"] >= 50, "title"]

0                        Star Wars (1977)
1                          Contact (1997)
2                            Fargo (1996)
3               Return of the Jedi (1983)
4                        Liar Liar (1997)
                      ...                
600          American in Paris, An (1951)
601               To Catch a Thief (1955)
602               Perfect World, A (1993)
603              Menace II Society (1993)
604    Once Upon a Time in America (1984)
Name: title, Length: 605, dtype: object

In [15]:
# Keep only the ratings whose title has at least 50 ratings in movie_count.
df = df.loc[df["title"].isin(movie_count.loc[movie_count["count"] >= 50, "title"])]

In [16]:
df["title"].value_counts()

title
Star Wars (1977)                      584
Contact (1997)                        509
Fargo (1996)                          508
Return of the Jedi (1983)             507
Liar Liar (1997)                      485
                                     ... 
Love Bug, The (1969)                   50
To Catch a Thief (1955)                50
Once Upon a Time in America (1984)     50
Trees Lounge (1996)                    50
American in Paris, An (1951)           50
Name: count, Length: 605, dtype: int64

In [17]:
df.groupby("title")["rating"].mean().sort_values()

title
Island of Dr. Moreau, The (1996)                          2.157895
McHale's Navy (1997)                                      2.188406
Striptease (1996)                                         2.238806
Beautician and the Beast, The (1997)                      2.313953
Cable Guy, The (1996)                                     2.339623
                                                            ...   
Wallace & Gromit: The Best of Aardman Animation (1996)    4.447761
Casablanca (1942)                                         4.456790
Wrong Trousers, The (1993)                                4.466102
Schindler's List (1993)                                   4.466443
Close Shave, A (1995)                                     4.491071
Name: rating, Length: 605, dtype: float64

In [18]:
df.head()

Unnamed: 0,user_id,item_id,rating,title
0,0,50,5,Star Wars (1977)
1,290,50,5,Star Wars (1977)
2,79,50,4,Star Wars (1977)
3,2,50,5,Star Wars (1977)
4,8,50,5,Star Wars (1977)


## Item-Based Collaborative Filtering (item–item rating correlation)

In [19]:
# User x title rating matrix: one row per user_id, one column per title.
# pivot_table averages duplicate (user, title) pairs by default, and
# fill_value=0 stores every unrated pair as 0, so the correlation computed
# from this matrix treats "not rated" as a rating of 0.
movie_matrix = df.pivot_table(
    values="rating",
    index="user_id",
    columns="title",
    fill_value=0,
)

In [20]:
movie_matrix

title,101 Dalmatians (1996),12 Angry Men (1957),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),"39 Steps, The (1935)",Absolute Power (1997),"Abyss, The (1989)",Ace Ventura: Pet Detective (1994),Addams Family Values (1993),...,Willy Wonka and the Chocolate Factory (1971),Wings of Desire (1987),"Wings of the Dove, The (1997)",Winnie the Pooh and the Blustery Day (1968),"Wizard of Oz, The (1939)",Wolf (1994),"Wrong Trousers, The (1993)",Wyatt Earp (1994),Young Frankenstein (1974),Young Guns (1988)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,5.0,0.0,3.0,4.0,0.0,0.0,3.0,3.0,0.0,...,4.0,0.0,0.0,0.0,4.0,0.0,5.0,0.0,5.0,3.0
2,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Pairwise correlation between title columns; Pearson is DataFrame.corr's default method.
movie_corr = movie_matrix.corr()

In [22]:
movie_name = "Star Wars (1977)"

In [23]:
movie_corr[movie_name].sort_values(ascending=False).head(11)

title
Star Wars (1977)                             1.000000
Return of the Jedi (1983)                    0.746797
Empire Strikes Back, The (1980)              0.555233
Raiders of the Lost Ark (1981)               0.540455
Indiana Jones and the Last Crusade (1989)    0.467523
Toy Story (1995)                             0.456219
Terminator, The (1984)                       0.437866
Back to the Future (1985)                    0.419493
Alien (1979)                                 0.412516
Princess Bride, The (1987)                   0.411505
Star Trek: First Contact (1996)              0.410346
Name: Star Wars (1977), dtype: float64

In [24]:
def similar_movies(movies_name, top_n=6, corr=None):
    """Return the titles whose rating columns correlate most with ``movies_name``.

    Parameters
    ----------
    movies_name : str
        Title to look up; must be a column of the correlation matrix.
    top_n : int, default 6
        Number of rows to return. The queried title correlates perfectly
        with itself, so it normally occupies the first row.
    corr : pandas.DataFrame, optional
        Title-by-title correlation matrix. When omitted, the notebook-level
        ``movie_corr`` is used.

    Returns
    -------
    pandas.Series
        Correlation coefficients indexed by title, sorted in descending order
        and truncated to ``top_n`` rows.

    Raises
    ------
    KeyError
        If ``movies_name`` is not a column of the correlation matrix.
    """
    # Resolve the global lazily so callers can inject their own matrix.
    matrix = movie_corr if corr is None else corr
    if movies_name not in matrix.columns:
        raise KeyError(f"{movies_name!r} is not a title in the correlation matrix")
    return matrix[movies_name].sort_values(ascending=False).head(top_n)

In [25]:
similar_movies("Absolute Power (1997)")

title
Absolute Power (1997)            1.000000
Ransom (1996)                    0.415015
Broken Arrow (1996)              0.404915
Executive Decision (1996)        0.395066
Independence Day (ID4) (1996)    0.369910
Rock, The (1996)                 0.348154
Name: Absolute Power (1997), dtype: float64

In [26]:
df[df["user_id"]==0]

Unnamed: 0,user_id,item_id,rating,title
0,0,50,5,Star Wars (1977)
584,0,172,5,"Empire Strikes Back, The (1980)"
952,0,133,1,Gone with the Wind (1939)


## User-Based Collaborative Filtering (user–user rating correlation)

In [27]:
# Title x user rating matrix: one row per title, one column per user_id.
# pivot_table averages duplicate (title, user) pairs by default, and
# fill_value=0 stores every unrated pair as 0, so the correlation computed
# from this matrix treats "not rated" as a rating of 0.
user_matrix = df.pivot_table(
    values="rating",
    index="title",
    columns="user_id",
    fill_value=0,
)

In [28]:
user_matrix

user_id,0,1,2,3,4,5,6,7,8,9,...,934,935,936,937,938,939,940,941,942,943
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101 Dalmatians (1996),0.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,5.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2 Days in the Valley (1996),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
"20,000 Leagues Under the Sea (1954)",0.0,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,4.0,0.0,0.0,0.0,4.0,5.0,5.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolf (1994),0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
"Wrong Trousers, The (1993)",0.0,5.0,0.0,0.0,0.0,5.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Wyatt Earp (1994),0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Young Frankenstein (1974),0.0,5.0,0.0,0.0,0.0,4.0,4.0,5.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Pairwise correlation between user columns; Pearson is DataFrame.corr's default method.
user_corr = user_matrix.corr()

In [30]:
def similar_user(user_id, top_n=6, corr=None):
    """Return the users whose rating columns correlate most with ``user_id``.

    Parameters
    ----------
    user_id : hashable
        User to look up; must be a column of the correlation matrix.
    top_n : int, default 6
        Number of rows to return. The queried user correlates perfectly
        with itself, so it normally occupies the first row.
    corr : pandas.DataFrame, optional
        User-by-user correlation matrix. When omitted, the notebook-level
        ``user_corr`` is used.

    Returns
    -------
    pandas.Series
        Correlation coefficients indexed by user, sorted in descending order
        and truncated to ``top_n`` rows.

    Raises
    ------
    KeyError
        If ``user_id`` is not a column of the correlation matrix.
    """
    # Resolve the global lazily so callers can inject their own matrix.
    matrix = user_corr if corr is None else corr
    if user_id not in matrix.columns:
        raise KeyError(f"{user_id!r} is not a user in the correlation matrix")
    return matrix[user_id].sort_values(ascending=False).head(top_n)

In [31]:
similar_user(34)

user_id
34     1.000000
531    0.511675
173    0.463949
681    0.463243
520    0.410826
732    0.391203
Name: 34, dtype: float64