# Building a Recommendation System for Movies

Importing the libraries which are required for the project

In [None]:
import re
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

Loading the dataset and printing the shape of the dataset as well as printing the dataset.

In [None]:
# Read the tab-separated title table. Fields shown as "\N" in the preview are
# loaded as ordinary strings here, not as NaN.
tsv_path = 'title.basics.tsv'
df = pd.read_csv(tsv_path, delimiter='\t')
print(df.shape)
df

(130064, 9)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0030298,tvMovie,Julius Caesar,Julius Caesar,0,1938,\N,101,"Drama,History"
1,tt0038309,tvMovie,As You Like It,As You Like It,0,1946,\N,\N,Drama
2,tt0038738,tvMovie,A Midsummer Night's Dream,A Midsummer Night's Dream,0,1946,\N,150,Drama
3,tt0039445,tvMovie,Hamlet Part 1,Hamlet Part 1,0,1947,\N,88,Drama
4,tt0039618,tvMovie,The Merchant of Venice,The Merchant of Venice,0,1947,\N,90,Drama
...,...,...,...,...,...,...,...,...,...
130059,tt9916192,tvMovie,Danielle Darrieux: Il est poli d'être gai!,Danielle Darrieux: Il est poli d'être gai!,0,2019,\N,53,Biography
130060,tt9916248,tvMovie,Mahiwaga,Mahiwaga,0,1996,\N,\N,"Action,Horror,Thriller"
130061,tt9916460,tvMovie,Pink Taxi,Pink Taxi,0,2019,\N,\N,Comedy
130062,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,\N,58,Family


Using the `info` method to check whether any null values are present and to see the datatype of each column. Statistically, all of the columns are categorical.

In [None]:
# Summarise column dtypes and non-null counts.
# NOTE(review): every column reports zero nulls, yet the preview shows the
# literal string "\N" in several fields — those look like missing-value
# placeholders that were loaded as text; confirm before trusting the counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130064 entries, 0 to 130063
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          130064 non-null  object
 1   titleType       130064 non-null  object
 2   primaryTitle    130064 non-null  object
 3   originalTitle   130064 non-null  object
 4   isAdult         130064 non-null  int64 
 5   startYear       130064 non-null  object
 6   endYear         130064 non-null  object
 7   runtimeMinutes  130064 non-null  object
 8   genres          130064 non-null  object
dtypes: int64(1), object(8)
memory usage: 8.9+ MB


Important columns for recommendation

In [None]:
# Columns that are concatenated into the per-movie feature string used for
# similarity (see get_important_features).
column = ['genres', 'originalTitle', 'primaryTitle', 'isAdult']

Splitting the dataset into training data and testing data

In testing data we are including the Movie_id to identify each row which works as index

In [None]:
# shuffle=False keeps file order, so `test` is the final 10% of the rows.
train, test = train_test_split(df, test_size=0.1, shuffle=False)
# Relabel the held-out rows 0..n-1 and call that index Movie_id. Building a new
# frame (instead of assigning a column onto the split output) avoids pandas'
# SettingWithCopyWarning and makes the cell safe to re-run.
test = test.reset_index(drop=True).rename_axis('Movie_id')
test.shape

(13007, 9)

Selecting the important columns of the testing data

In [None]:
# Narrow view of `test` containing only the feature columns listed in `column`.
data = test[column]

Checking whether any rows contain null values in any of the important columns; if so, we drop those rows

In [None]:
# Drop rows that have a null in any feature column.
# The rows are removed from `test` itself so it keeps all of its columns, and
# the index is renumbered 0..n-1 so that Movie_id still matches the row
# position used to look into the similarity matrix later on.
if data.isnull().values.any():
    test = test.dropna(subset=column).reset_index(drop=True).rename_axis('Movie_id')
    data = test[column]
# Re-check: evaluates to False once the feature columns are free of nulls.
data.isnull().values.any()

False

Combining the important features into a single column; we do this to reduce the processing time

In [None]:
# data['Combine'] = data['genres']+" "+data['originalTitle']+' '+data['primaryTitle']+" "+data['isAdult'].map(str)+" "+data['startYear']

In [None]:
def _clean_feature_text(value):
    """Return `value` as space-separated word tokens, or '' if it is missing."""
    # NaN/None and the literal "\N" placeholder carry no information.
    if pd.isna(value) or value == r'\N':
        return ''
    # [\W_] is the Unicode-aware counterpart of [^A-Za-z0-9]: punctuation and
    # underscores become separators while accented letters are preserved.
    return re.sub(r'[\W_]+', ' ', str(value)).strip()


def get_important_features(data):
    """Build one combined feature string per row of `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'genres', 'originalTitle', 'primaryTitle'
        and 'isAdult'.

    Returns
    -------
    list of str
        One entry per row, in row order: the cleaned genres, original title
        and primary title followed by str(isAdult), joined by single spaces.
        Missing text fields are skipped.
    """
    important_feature = []
    # Iterating the columns directly is much cheaper than DataFrame.iterrows().
    rows = zip(data['genres'], data['originalTitle'],
               data['primaryTitle'], data['isAdult'])
    for genres, original_title, primary_title, is_adult in rows:
        cleaned = (
            _clean_feature_text(genres),
            _clean_feature_text(original_title),
            _clean_feature_text(primary_title),
        )
        parts = [text for text in cleaned if text]
        parts.append(str(is_adult))
        important_feature.append(' '.join(parts))
    return important_feature

Adding all the merged columns and naming it as Important features

In [None]:
# One combined feature string per row, stored alongside the source columns.
test['Important Features'] = get_important_features(test)

In [None]:
# Display the frame, which now includes the 'Important Features' column.
test

Unnamed: 0_level_0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,Important Features
Movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,tt6908630,tvMovie,Broadcast Battleground,Broadcast Battleground,0,2014,\N,\N,Comedy,Comedy Broadcast Battleground Broadcast Battle...
1,tt6908858,tvMovie,Der Islamreport,Der Islamreport,0,2016,\N,45,Documentary,Documentary Der Islamreport Der Islamreport 0
2,tt6909136,tvMovie,EquinoXe,EquinoXe,1,2017,\N,75,Adult,Adult EquinoXe EquinoXe 1
3,tt6909272,tvMovie,The Sheriff and the Astronaut,The Sheriff and the Astronaut,0,1984,\N,\N,Action,Action The Sheriff and the Astronaut The Sheri...
4,tt6909598,tvMovie,Moonys Kindchen weint nicht,Moonys Kindchen weint nicht,0,1961,\N,30,\N,N Moonys Kindchen weint nicht Moonys Kindchen...
...,...,...,...,...,...,...,...,...,...,...
13002,tt9916192,tvMovie,Danielle Darrieux: Il est poli d'être gai!,Danielle Darrieux: Il est poli d'être gai!,0,2019,\N,53,Biography,Biography Danielle Darrieux Il est poli d tre ...
13003,tt9916248,tvMovie,Mahiwaga,Mahiwaga,0,1996,\N,\N,"Action,Horror,Thriller",Action Horror Thriller Mahiwaga Mahiwaga 0
13004,tt9916460,tvMovie,Pink Taxi,Pink Taxi,0,2019,\N,\N,Comedy,Comedy Pink Taxi Pink Taxi 0
13005,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,\N,58,Family,Family March of Dimes Presents Once Upon a Dim...


Converting the important features into a matrix of token counts

In [None]:
# Turn each feature string into a row of token counts.
# NOTE(review): CountVectorizer's default token_pattern appears to keep only
# tokens of two or more characters, so the trailing isAdult flag ('0'/'1')
# probably never reaches the matrix — confirm, and pass token_pattern
# explicitly if the flag is meant to count.
vectorizer = CountVectorizer()
cm = vectorizer.fit_transform(test['Important Features'])
print(cm)

Making the cosine similarity matrix for Recommendation purpose

In [None]:
# Pairwise cosine similarity between the rows of `cm`: cs[i][j] compares
# row i with row j, so row i holds movie i's similarity to every movie.
# NOTE(review): the result is a full n x n array (13007 x 13007 in the
# recorded output), so memory grows quadratically with the number of rows.
cs = cosine_similarity(cm)
print(cs, cs.shape)

[[1.         0.         0.         ... 0.11111111 0.         0.        ]
 [0.         1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.         0.         0.        ]
 ...
 [0.11111111 0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]] (13007, 13007)


Taking a sample movie name and finding its movie id

In [None]:
test_movie = 'Tartuffe'
# Rows whose original title equals the sample title; the Movie_id of the first
# such row is used as the query.
title_matches = test[test['originalTitle'] == test_movie]
movie_id = title_matches.index.values[0]
movie_id

8

`cs` is an array, so we use the `enumerate` function to iterate over it and also get the index of each value

In [None]:
# (row index, similarity) pairs taken from the sample movie's row of `cs`.
scores = [(idx, score) for idx, score in enumerate(cs[movie_id])]
scores

For the movie id taken as the sample, we find its row of the cosine-similarity matrix and sort the values in descending order to determine which movies should be recommended

In [None]:
# Order the (index, similarity) pairs from most to least similar; the sort is
# stable, so tied scores keep their original index order.
sorted_score = list(scores)
sorted_score.sort(key=lambda pair: pair[1], reverse=True)
sorted_score[1:]

Searching for the top 5 movies most similar to the chosen sample movie and printing them

In [None]:
# Exclude the sample movie by its id rather than assuming it sorts first:
# when another row ties with it on score, dropping element 0 can remove the
# wrong row and leave the sample movie in its own recommendations.
top_matches = [idx for idx, _ in sorted_score if idx != movie_id][:5]
for rank, idx in enumerate(top_matches, start=1):
    # Direct label lookup; Movie_id was assigned as 0..n-1, matching cs rows.
    print(rank, test.loc[idx, 'originalTitle'])

1 Tartuffe
2 J.S.I.D.
3 A bünös
4 D.A.Z.E.D
5 C.Q.


In [None]:
# create a function that take name of the movie from the user generate 5 recommendation based on that
# note that name of the movie should be present in the data on which the model is trained.

Create a function that takes the name of a movie from the user and generates 5 recommendations based on it.


Note that the name of the movie must be present in the data on which the model is trained.

In [None]:
def recommend_movie(movie, top_n=5):
    """Print the titles most similar to `movie`.

    Relies on the notebook globals `test` (frame with an 'originalTitle'
    column) and `cs` (similarity matrix whose row i corresponds to the i-th
    row of `test`).

    Parameters
    ----------
    movie : str
        Exact 'originalTitle' to look up. If several rows share the title,
        the first one is used as the query.
    top_n : int, default 5
        Number of recommendations to print.

    Returns
    -------
    None
        Results are printed. An unknown title prints an apology message
        instead of raising.
    """
    titles = test['originalTitle'].values
    # Work with row positions throughout, because `cs` is indexed by position.
    hits = np.flatnonzero((test['originalTitle'] == movie).values)
    if hits.size == 0:
        # Only the "title not found" case is handled here; any other failure
        # is a real error and is allowed to surface.
        print("Sorry unrecognised movie, Try Again")
        return
    query_pos = hits[0]

    ranked = sorted(enumerate(cs[query_pos]), key=lambda pair: pair[1], reverse=True)
    # Drop the query row explicitly: with tied scores it is not guaranteed to
    # be the first element of the ranking.
    similar = [pos for pos, _ in ranked if pos != query_pos][:top_n]

    print("\nTop", top_n, "similar movies to ", movie, ' are: ')
    for rank, pos in enumerate(similar, start=1):
        print(rank, titles[pos])

In [None]:
# Simple text menu: "1" asks for a title and prints recommendations; anything
# else ends the loop.
while True:
    print("\nPress 1 for movie similar movie\nAny other number to exit\n")
    try:
        choice = int(input("Enter your choice: "))
    except ValueError:
        # Blank or non-numeric input is treated as "exit" instead of crashing
        # the cell with a traceback.
        break
    if choice != 1:
        break
    movie_name = input("Enter the movie: ")
    recommend_movie(movie_name)


Press 1 for movie similar movie
Any other number to exit

Enter your choice: 3
