In [51]:
# import the useful libraries
import pandas as pd
import numpy as np

In [2]:
# Read the full movies CSV file (all columns) so its structure can be inspected first.
movies_df = pd.read_csv("movies.csv")

In [3]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies_df.shape

(9742, 3)

In [5]:
movies_df.dtypes

movieId     int64
title      object
genres     object
dtype: object

In [6]:
# Reload the movies file, keeping only the movieId and title columns
# and stating their dtypes explicitly.
movies_df = pd.read_csv(
    "movies.csv",
    usecols=["movieId", "title"],
    dtype={"movieId": "int64", "title": "str"},
)

In [7]:
movies_df.dtypes

movieId     int64
title      object
dtype: object

In [8]:
# Read the full ratings CSV file (all columns) so its structure can be inspected first.
ratings_df = pd.read_csv("ratings.csv")

In [9]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [10]:
ratings_df.shape

(100836, 4)

In [11]:
ratings_df.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [12]:
# Reload the ratings file, keeping only userId, movieId and rating
# (the timestamp column is not loaded).
ratings_df = pd.read_csv(
    "ratings.csv",
    usecols=["userId", "movieId", "rating"],
)

In [13]:
ratings_df.dtypes

userId       int64
movieId      int64
rating     float64
dtype: object

In [14]:
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [15]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


### Both tables above (the movies and ratings DataFrames) share the `movieId` column, so we can merge them on it to get one table for further processing.

In [16]:
# Attach the movie title to every rating by joining on the shared movieId key.
# An inner join (the pandas default) keeps only movieIds present in both frames.
df = ratings_df.merge(movies_df, on="movieId", how="inner")
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


### Next, for each movie, we count how many users rated it, i.e. the total number of ratings each movie received.

In [17]:
df.isnull().sum()

userId     0
movieId    0
rating     0
title      0
dtype: int64

In [18]:
# Take an independent copy of the merged frame; the following cells work from this copy.
combine_movies_ratings = df.copy()

In [20]:
combine_movies_ratings.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [22]:
# Count the (non-null) ratings per title and expose the result as a two-column
# frame: title, total_rating_cnt.
movie_rating_cnt = (
    combine_movies_ratings
    .groupby("title")["rating"]
    .count()
    .reset_index(name="total_rating_cnt")
)

In [24]:
movie_rating_cnt

Unnamed: 0,title,total_rating_cnt
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2
...,...,...
9714,eXistenZ (1999),22
9715,xXx (2002),24
9716,xXx: State of the Union (2005),5
9717,¡Three Amigos! (1986),26


In [26]:
# Bring the per-title rating count back onto every individual rating row.
# A left join keeps every rating row from combine_movies_ratings.
rating_with_totalratingcnt = pd.merge(
    combine_movies_ratings,
    movie_rating_cnt,
    on="title",
    how="left",
)

In [27]:
rating_with_totalratingcnt

Unnamed: 0,userId,movieId,rating,title,total_rating_cnt
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215
...,...,...,...,...,...
100831,610,160341,2.5,Bloodmoon (1997),1
100832,610,160527,4.5,Sympathy for the Underdog (1971),1
100833,610,160836,3.0,Hazard (2005),1
100834,610,163937,3.5,Blair Witch (2016),1


### Now we keep only the movies that have at least 50 ratings (the filter below uses `>= 50`).

In [30]:
# Minimum number of ratings a title must have to be kept (inclusive bound).
popularity_threshold = 50
is_popular = rating_with_totalratingcnt["total_rating_cnt"] >= popularity_threshold
rating_popular_movie = rating_with_totalratingcnt[is_popular]

In [31]:
rating_popular_movie

Unnamed: 0,userId,movieId,rating,title,total_rating_cnt
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215
...,...,...,...,...,...
79246,603,1997,4.0,"Exorcist, The (1973)",53
79247,606,1997,3.0,"Exorcist, The (1973)",53
79248,607,1997,5.0,"Exorcist, The (1973)",53
79249,608,1997,4.5,"Exorcist, The (1973)",53


In [34]:
rating_popular_movie.isnull().sum()

userId              0
movieId             0
rating              0
title               0
total_rating_cnt    0
dtype: int64

### Now we look at the rating each user gave to each movie.
### To do that, we create a pivot table with one row per movie title and one column per user; movies a user has not rated are filled with 0.

In [44]:
# Build the title-by-user rating matrix: one row per movie title, one column per
# userId. Each cell holds the rating (pivot_table's default aggregation averages
# the values if a title/user pair occurs more than once); missing pairs become 0.
movie_features_df = pd.pivot_table(
    rating_popular_movie,
    index="title",
    columns="userId",
    values="rating",
).fillna(0)
movie_features_df

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X2: X-Men United (2003),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0
You've Got Mail (1998),0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0
Young Frankenstein (1974),5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0
Zombieland (2009),0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5


### The DataFrame above consists mostly of zeros, so we want a memory- and time-efficient way to store this sparse matrix.
### A dense NumPy array would work but is inefficient for mostly-zero data, so we use SciPy's `csr_matrix` instead.

In [45]:
from scipy.sparse import csr_matrix

In [46]:
movie_features_df.values

array([[0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 3. , 0. , 4.5],
       ...,
       [5. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 3. , 0. , ..., 0. , 0. , 3.5],
       [0. , 0. , 0. , ..., 3. , 0. , 4. ]])

In [47]:
# Convert the dense title-by-user array into SciPy's compressed sparse row (CSR) format.
movie_features_df_matrix = csr_matrix(movie_features_df.values)

In [48]:
from sklearn.neighbors import NearestNeighbors

In [49]:
# Unsupervised learner for implementing neighbor searches.
# Cosine distance is the metric used between rows of the fitted matrix
# (each row is one movie title's vector of user ratings).
model_knn = NearestNeighbors(metric="cosine")
model_knn.fit(movie_features_df_matrix)

NearestNeighbors(metric='cosine')

In [43]:
movie_features_df.shape

(450, 606)

In [66]:
# Pick one movie (by row position) at random to use as the query.
query_index = np.random.choice(movie_features_df.shape[0])
print(query_index)

# Find the K nearest neighbors of the query row.
# kneighbors returns the distances to, and the row indices of, those neighbors.
# 11 neighbors are requested because the query movie is itself part of the fitted
# data and is expected to come back as its own nearest neighbor, leaving 10 others.
query_vector = movie_features_df.iloc[[query_index]].to_numpy()
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=11)

3


In [68]:
# Print the recommendations for the query movie, nearest first.
# Flatten once up front instead of on every loop iteration.
neighbor_positions = indices.flatten()
neighbor_distances = distances.flatten()

# One of the returned neighbors is expected to be the query movie itself, so the
# number of recommendations is one less than the number of neighbors returned.
# Deriving it here keeps the header in sync with whatever n_neighbors was requested.
top_n = max(len(neighbor_positions) - 1, 0)

if len(neighbor_positions) > 0:
    print("Top {0} Recommendation for movie {1} is :\n".format(top_n, movie_features_df.index[query_index]))

rank = 0
for position, distance in zip(neighbor_positions, neighbor_distances):
    # Skip the query movie by its row position rather than assuming it is always
    # returned first: a tie in distance could place it elsewhere, which would
    # otherwise list the movie as a recommendation for itself.
    if position == query_index or rank >= top_n:
        continue
    rank += 1
    print("{0}:{1} with distance of {2}".format(rank, movie_features_df.index[position], distance))

Top 10 Recommendation for movie 28 Days Later (2002) is :

1:Children of Men (2006) with distance of 0.4474856392156281
2:Donnie Darko (2001) with distance of 0.4518045700464953
3:Shaun of the Dead (2004) with distance of 0.4701086266831487
4:Scarface (1983) with distance of 0.5088632188782307
5:War of the Worlds (2005) with distance of 0.524382829710049
6:Kill Bill: Vol. 2 (2004) with distance of 0.5391124538408185
7:Day After Tomorrow, The (2004) with distance of 0.5393190752009787
8:I Am Legend (2007) with distance of 0.5433601832677994
9:Prestige, The (2006) with distance of 0.5433819533057795
10:Signs (2002) with distance of 0.5437593544042385
