In [133]:
import numpy as np  #import the required packages
import pandas as pd 

In [134]:
movies = pd.read_csv('movies.csv') #read the csv file for data

In [135]:
movies.head() #display the contents of the file

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0


In [136]:
# Remove the movies with any null field.
# The index is rebuilt afterwards so that every row label equals its row position;
# arrays derived from this frame (one row per movie) are addressed by position, and
# leaving gaps in the labels would make label-based lookups point at the wrong row.
# Reassigning instead of using inplace=True keeps the cell safe to re-run.
movies = movies.dropna().reset_index(drop=True)

In [137]:
# Extract the columns that are required for forming the tag for each movie (plus its name).
# .copy() makes the selection an independent frame, so the column assignments made on it
# in later cells are plain writes to this frame rather than writes to a slice of the
# original data (which pandas flags with SettingWithCopyWarning).
movies = movies[['rating','genre','name','director','writer','star','country']].copy()

In [138]:
movies.head() #display the modified data set

Unnamed: 0,rating,genre,name,director,writer,star,country
0,R,Drama,The Shining,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom
1,R,Adventure,The Blue Lagoon,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States
2,PG,Action,Star Wars: Episode V - The Empire Strikes Back,Irvin Kershner,Leigh Brackett,Mark Hamill,United States
3,PG,Comedy,Airplane!,Jim Abrahams,Jim Abrahams,Robert Hays,United States
4,R,Comedy,Caddyshack,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States


In [140]:
# Remove the spaces inside the writer, director, star and country values so that each
# multi-word value becomes a single token (e.g. "Stephen King" -> "StephenKing") and
# cannot be confused with another entry that merely shares a first or last name.
for column in ['writer', 'director', 'star', 'country']:
    movies[column] = movies[column].str.replace(' ', '')

In [141]:
# Create the tag column for each movie: the rating followed by the genre, director,
# writer, star and country values, all converted to strings and joined with single spaces.
tag_columns = ['genre', 'director', 'writer', 'star', 'country']
movies['tag'] = movies['rating'].map(str).str.cat(
    [movies[column].map(str) for column in tag_columns],
    sep=' ',
)

In [142]:
movies.head() #display the modified dataset

Unnamed: 0,rating,genre,name,director,writer,star,country,tag
0,R,Drama,The Shining,StanleyKubrick,StephenKing,JackNicholson,UnitedKingdom,R Drama StanleyKubrick StephenKing JackNichols...
1,R,Adventure,The Blue Lagoon,RandalKleiser,HenryDeVereStacpoole,BrookeShields,UnitedStates,R Adventure RandalKleiser HenryDeVereStacpoole...
2,PG,Action,Star Wars: Episode V - The Empire Strikes Back,IrvinKershner,LeighBrackett,MarkHamill,UnitedStates,PG Action IrvinKershner LeighBrackett MarkHami...
3,PG,Comedy,Airplane!,JimAbrahams,JimAbrahams,RobertHays,UnitedStates,PG Comedy JimAbrahams JimAbrahams RobertHays U...
4,R,Comedy,Caddyshack,HaroldRamis,BrianDoyle-Murray,ChevyChase,UnitedStates,R Comedy HaroldRamis BrianDoyle-Murray ChevyCh...


In [143]:
# Remove the columns that are no longer required: their values now live in the tag column.
merged_columns = ['writer', 'director', 'star', 'genre', 'rating', 'country']
movies = movies.drop(merged_columns, axis=1)

In [145]:
movies.head() #display modified dataset

Unnamed: 0,name,tag
0,The Shining,R Drama StanleyKubrick StephenKing JackNichols...
1,The Blue Lagoon,R Adventure RandalKleiser HenryDeVereStacpoole...
2,Star Wars: Episode V - The Empire Strikes Back,PG Action IrvinKershner LeighBrackett MarkHami...
3,Airplane!,PG Comedy JimAbrahams JimAbrahams RobertHays U...
4,Caddyshack,R Comedy HaroldRamis BrianDoyle-Murray ChevyCh...


In [146]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1000,stop_words='english')
# Import CountVectorizer from scikit-learn's text feature-extraction module.
# cv keeps only the 1000 most frequently occurring words in the data, excluding the stop words of the English language.
# Stop words are common words such as --> is, to, are, etc.
# NOTE(review): the tags contain many distinct director/writer/star names, so a 1000-word cap
# presumably discards most of them and leaves mainly genre/country words - confirm this is intended.

In [147]:
vector = cv.fit_transform(movies['tag']).toarray() #fit the vectorizer on the tags and convert every tag into a dense numeric row (one row per movie, one column per kept word).

In [148]:
vector.shape

(5421, 1000)

In [149]:
from sklearn.metrics.pairwise import cosine_similarity
# import the cosine_similarity module

In [150]:
similarity = cosine_similarity(vector) 
#compute the cosine similarity (not the distance) of each movie vector with every other vector and store the resulting square matrix in the similarity variable; row i / column j refer to the i-th / j-th row of vector.

In [151]:
similarity #display the similarity list

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.40824829, ..., 0.5       , 0.35355339,
        0.        ],
       [0.        , 0.40824829, 1.        , ..., 0.40824829, 0.28867513,
        0.33333333],
       ...,
       [0.        , 0.5       , 0.40824829, ..., 1.        , 0.35355339,
        0.        ],
       [0.        , 0.35355339, 0.28867513, ..., 0.35355339, 1.        ,
        0.        ],
       [0.        , 0.        , 0.33333333, ..., 0.        , 0.        ,
        1.        ]])

In [156]:
movies.head()

Unnamed: 0,name,tag
0,The Shining,R Drama StanleyKubrick StephenKing JackNichols...
1,The Blue Lagoon,R Adventure RandalKleiser HenryDeVereStacpoole...
2,Star Wars: Episode V - The Empire Strikes Back,PG Action IrvinKershner LeighBrackett MarkHami...
3,Airplane!,PG Comedy JimAbrahams JimAbrahams RobertHays U...
4,Caddyshack,R Comedy HaroldRamis BrianDoyle-Murray ChevyCh...


In [157]:
movies['title']=movies['name'] #create a column to store the movie name as title in the dataset for further task.

In [158]:
movies.head()

Unnamed: 0,name,tag,title
0,The Shining,R Drama StanleyKubrick StephenKing JackNichols...,The Shining
1,The Blue Lagoon,R Adventure RandalKleiser HenryDeVereStacpoole...,The Blue Lagoon
2,Star Wars: Episode V - The Empire Strikes Back,PG Action IrvinKershner LeighBrackett MarkHami...,Star Wars: Episode V - The Empire Strikes Back
3,Airplane!,PG Comedy JimAbrahams JimAbrahams RobertHays U...,Airplane!
4,Caddyshack,R Comedy HaroldRamis BrianDoyle-Murray ChevyCh...,Caddyshack


In [159]:
# This function does the recommendation task.
# The row POSITION of the passed movie is looked up in the dataframe (not its index label,
# because the similarity matrix is addressed by position and the labels may have gaps
# after rows were dropped).
# enumerate keeps (position, cosine similarity) pairs together while sorting.
# The pairs are sorted in descending order because the most similar movies have the
# greatest value (cos 0 is 1).
# The queried movie itself is filtered out explicitly instead of assuming it sorts first,
# since another movie with an identical tag would tie with it at 1.0.

def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to `movie`.

    Parameters
    ----------
    movie : str
        Exact value of the 'name' column of the movie to look up. If several
        rows share the name, the first one is used.
    top_n : int, optional
        Number of titles to print (default 5, the original behaviour).

    Raises
    ------
    IndexError
        If no row of the module-level `movies` frame has this name.

    Notes
    -----
    Reads the module-level `movies` dataframe and `similarity` matrix; row i of
    `similarity` must correspond to the i-th row of `movies`.
    """
    matches = np.flatnonzero((movies['name'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError("movie {!r} not found in the dataset".format(movie))
    position = int(matches[0])

    ranked = sorted(enumerate(similarity[position]), key=lambda pair: pair[1], reverse=True)
    closest = [other for other, _ in ranked if other != position][:top_n]
    for other in closest:
        print(movies['title'].iloc[other])

In [160]:
recommend('The Shining') #a test data to test the function, top 5 closest movies to the movie 'The Shining' is displayed.

Track 29
Apartment Zero
The Secret Garden
Angels and Insects
Jude


In [161]:
import pickle 

In [162]:
# Store the movie table and the similarity matrix in files, to use them in creating the streamlit app.
# The index is reset before export so the row keys in the dictionary are 0..n-1 and therefore
# match the row positions of the similarity matrix.
# The files are opened with `with` so they are flushed and closed as soon as each dump finishes.

with open('movie_dict.pkl', 'wb') as movie_file:
    pickle.dump(movies.reset_index(drop=True).to_dict(), movie_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)