# Movie Recommender System using Machine Learning

### Data Collection and preprocessing

In [46]:
# Loading data
import pandas as pd
r_col_names = ['User_id', 'Movie_id', 'Rating']
ratings = pd.read_csv('ml-100k/u.data', sep = '\t', names = r_col_names, usecols = range(3), encoding = "ISO-8859-1")
m_col_names = ['Movie_id', 'Title']
movies = pd.read_csv('ml-100k/u.item', sep = '|', names = m_col_names, usecols = range(2), encoding = "ISO-8859-1")
ratings = pd.merge(movies, ratings)
ratings.head(10)

Unnamed: 0,Movie_id,Title,User_id,Rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3
5,1,Toy Story (1995),5,4
6,1,Toy Story (1995),109,4
7,1,Toy Story (1995),181,3
8,1,Toy Story (1995),95,5
9,1,Toy Story (1995),268,3


In [47]:
# shape of dataset
ratings.shape

(100003, 4)

In [48]:
# describing data
ratings.describe()

Unnamed: 0,Movie_id,User_id,Rating
count,100003.0,100003.0,100003.0
mean,425.520914,462.470876,3.529864
std,330.797791,266.622454,1.125704
min,1.0,0.0,1.0
25%,175.0,254.0,3.0
50%,322.0,447.0,4.0
75%,631.0,682.0,4.0
max,1682.0,943.0,5.0


In [49]:
# dataset information
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100003 entries, 0 to 100002
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Movie_id  100003 non-null  int64 
 1   Title     100003 non-null  object
 2   User_id   100003 non-null  int64 
 3   Rating    100003 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 3.8+ MB


### Prepare User - Rating Matrix

In [50]:
# constructing matrix of users and the movies they rated
userRatings = ratings.pivot_table(index = ['User_id'], columns = ['Title'], values = 'Rating')
userRatings.head()

Title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
User_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


### Compute Correlation Score

In [51]:
# computing correlaton score between every pair of movies
correlation_matrix = userRatings.corr()
correlation_matrix.head()

Title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),1.0,,-1.0,-0.5,-0.5,0.522233,,-0.426401,,,...,,,,,,,,,,
1-900 (1994),,1.0,,,,,,-0.981981,,,...,,,,-0.944911,,,,,,
101 Dalmatians (1996),-1.0,,1.0,-0.04989,0.269191,0.048973,0.266928,-0.043407,,0.111111,...,,-1.0,,0.15884,0.119234,0.680414,-4.8756e-17,0.707107,,
12 Angry Men (1957),-0.5,,-0.04989,1.0,0.666667,0.256625,0.274772,0.178848,,0.457176,...,,,,0.096546,0.068944,-0.361961,0.1443376,1.0,1.0,
187 (1997),-0.5,,0.269191,0.666667,1.0,0.596644,,-0.5547,,1.0,...,,0.866025,,0.455233,-0.5,0.5,0.4753271,,,


In [52]:
correlation_matrix = userRatings.corr(method = 'pearson', min_periods = 100)
correlation_matrix.head()

Title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
1-900 (1994),,,,,,,,,,,...,,,,,,,,,,
101 Dalmatians (1996),,,1.0,,,,,,,,...,,,,,,,,,,
12 Angry Men (1957),,,,1.0,,,,,,,...,,,,,,,,,,
187 (1997),,,,,,,,,,,...,,,,,,,,,,


### Test Case

In [53]:
testcaseRatings = userRatings.loc[0].dropna()
testcaseRatings

Title
Empire Strikes Back, The (1980)    5.0
Gone with the Wind (1939)          1.0
Star Wars (1977)                   5.0
Name: 0, dtype: float64

### Find Similar Movies

In [80]:
# Calculate movie recommendations based on user similarity and ratings
similarUsers = pd.Series(dtype = 'float64')
for i in range(0, len(testcaseRatings.index)):
    print ("Adding similar movies for " + testcaseRatings.index[i] + "...")
    # Retrieve similar movies to this one that I rated
    similarMovies = correlation_matrix[testcaseRatings.index[i]].dropna()
    # Now scale its similarity by how well I rated this movie
    similarMovies = similarMovies.map(lambda x: x * testcaseRatings[i])
    # Add the score to the list of similarity candidates
    similarUsers = pd.concat([similarUsers, similarMovies])
    
# Sorting similar movies in descending order to get the most similar users firse
print ("Sorting similar movies...")
similarUsers.sort_values(inplace = True, ascending = False)
print (similarUsers.head(10))

Adding similar movies for Empire Strikes Back, The (1980)...
Adding similar movies for Gone with the Wind (1939)...
Adding similar movies for Star Wars (1977)...
Sorting similar movies...
Empire Strikes Back, The (1980)                       5.000000
Star Wars (1977)                                      5.000000
Empire Strikes Back, The (1980)                       3.741763
Star Wars (1977)                                      3.741763
Return of the Jedi (1983)                             3.606146
Return of the Jedi (1983)                             3.362779
Raiders of the Lost Ark (1981)                        2.693297
Raiders of the Lost Ark (1981)                        2.680586
Austin Powers: International Man of Mystery (1997)    1.887164
Sting, The (1973)                                     1.837692
dtype: float64


### Sort and Filter Similar Movies


In [77]:
# Calculating similarity scores
similarUsers = similarUsers.groupby(similarUsers.index).sum()

In [78]:
similarUsers.sort_values(inplace = True, ascending = False)
similarUsers.head(10)

Empire Strikes Back, The (1980)              8.877450
Star Wars (1977)                             8.870971
Return of the Jedi (1983)                    7.178172
Raiders of the Lost Ark (1981)               5.519700
Indiana Jones and the Last Crusade (1989)    3.488028
Bridge on the River Kwai, The (1957)         3.366616
Back to the Future (1985)                    3.357941
Sting, The (1973)                            3.329843
Cinderella (1950)                            3.245412
Field of Dreams (1989)                       3.222311
dtype: float64

In [79]:
# filterig the movies
filteredWatchedMovies = similarUsers.drop(testcaseRatings.index)
filteredWatchedMovies.head(10)

Return of the Jedi (1983)                    7.178172
Raiders of the Lost Ark (1981)               5.519700
Indiana Jones and the Last Crusade (1989)    3.488028
Bridge on the River Kwai, The (1957)         3.366616
Back to the Future (1985)                    3.357941
Sting, The (1973)                            3.329843
Cinderella (1950)                            3.245412
Field of Dreams (1989)                       3.222311
Wizard of Oz, The (1939)                     3.200268
Dumbo (1941)                                 2.981645
dtype: float64

### Display Recommended movies

In [81]:
# Get the recommended movies
recommended_movies = filteredWatchedMovies 
recommended_movies.head(10)

12 Angry Men (1957)             0.921447
2001: A Space Odyssey (1968)    1.867302
Absolute Power (1997)           0.427199
Abyss, The (1989)               2.407877
African Queen, The (1951)       2.310987
Air Force One (1997)            1.393921
Aladdin (1992)                  2.513417
Alien (1979)                    2.195566
Aliens (1986)                   2.791258
Amadeus (1984)                  2.021675
dtype: float64