In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics import jaccard_score

import pickle
import scipy

In [2]:
# Read the movie spreadsheet and show its raw schema before any renaming.
data = pd.read_excel("movie_data.xlsx")
data.info()
# The unnamed first spreadsheet column is given the name 'movie_id'.
data = data.rename(columns={'Unnamed: 0': 'movie_id'})
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          1000 non-null   int64  
 1   Title               1000 non-null   object 
 2   Year of Release     999 non-null    object 
 3   Watch Time          952 non-null    object 
 4   Genre               1000 non-null   object 
 5   Movie Rating        943 non-null    float64
 6   Metascore of movie  909 non-null    float64
 7   Director            1000 non-null   object 
 8   Cast                1000 non-null   object 
 9   Votes               943 non-null    object 
 10  Description         1000 non-null   object 
dtypes: float64(2), int64(1), object(8)
memory usage: 86.1+ KB


Unnamed: 0,movie_id,Title,Year of Release,Watch Time,Genre,Movie Rating,Metascore of movie,Director,Cast,Votes,Description
0,0,No Time to Die,2021,163 min,"\nAction, Adventure, Thriller",7.7,70.0,Cary Joji Fukunaga,"Daniel Craig,Ana de Armas,Rami Malek,Léa Seydoux",35153,James Bond has left active service. His peace ...
1,1,Dune,2021,155 min,"\nAction, Adventure, Drama",8.4,76.0,Denis Villeneuve,"Timothée Chalamet,Rebecca Ferguson,Zendaya,Osc...",63210,Feature adaptation of Frank Herbert's science ...
2,2,Free Guy,2021,115 min,"\nAction, Adventure, Comedy",7.3,62.0,Shawn Levy,"Ryan Reynolds,Jodie Comer,Taika Waititi,Lil Re...",117171,A bank teller discovers that he's actually an ...
3,3,The Many Saints of Newark,2021,120 min,"\nCrime, Drama",6.6,61.0,Alan Taylor,"Alessandro Nivola,Leslie Odom Jr.,Jon Bernthal...",15173,A look at the formative years of New Jersey ga...
4,4,Venom: Let There Be Carnage,2021,97 min,"\nAction, Adventure, Sci-Fi",6.6,47.0,Andy Serkis,"Tom Hardy,Woody Harrelson,Michelle Williams,Na...",10107,Eddie Brock attempts to reignite his career by...


In [3]:
# Base set of text columns.
columns = [
    'Cast',
    'Director',
    'Genre',
    'Title',
    'Description',
]
# Same set with the numeric 'Movie Rating' column placed before the description.
columns2 = [
    'Cast',
    'Director',
    'Genre',
    'Title',
    'Movie Rating',
    'Description',
]
# Same set with the numeric 'Metascore of movie' column placed before the description.
columns3 = [
    'Cast',
    'Director',
    'Genre',
    'Title',
    'Metascore of movie',
    'Description',
]

In [4]:
# True would mean at least one missing value in the selected columns (expected: no null values).
data[columns].isna().values.any()

False

In [5]:
def get_important_features(data):
    """Build one combined text string per row for vectorisation.

    For every row the 'Title', 'Director', 'Genre' and 'Description' values
    are joined, in that order, with single spaces.

    Values are read positionally, so the frame may carry any index (for
    example after filtering or sorting); a default 0..n-1 index is not needed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Title', 'Director', 'Genre' and
        'Description', all holding strings.

    Returns
    -------
    list of str
        One combined string per row, in row order. Empty for an empty frame.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    TypeError
        If any of the values is not a string (e.g. NaN).
    """
    feature_columns = ('Title', 'Director', 'Genre', 'Description')
    # .tolist() drops the index, so rows are paired purely by position.
    column_values = [data[name].tolist() for name in feature_columns]
    return [' '.join(parts) for parts in zip(*column_values)]

In [6]:
# Keep the combined per-movie text on the frame as its own column.
combined_text = get_important_features(data)
data['important_features'] = combined_text

# Convert the combined text into a TF-IDF matrix, dropping English stop words.
tfidf = TfidfVectorizer(stop_words='english')
feature_matrix = tfidf.fit_transform(data['important_features'])
feature_matrix.shape

(1000, 6709)

In [7]:
# Movie-to-movie similarity matrix: the dot product of every TF-IDF row with
# every other row. With only X given, linear_kernel compares the matrix with itself.
cosine_sim = linear_kernel(feature_matrix)

In [8]:
# Title -> row position lookup. Positions (not index labels) are stored because
# they are used to address rows of the similarity matrix and for .iloc below.
indices = pd.Series(range(len(data)), index=data['Title'])
# Keep only the first row for a repeated title, so a lookup always yields a
# single position. Series.drop_duplicates() would not do this: it compares
# the values, which are already unique.
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendations(title, cosine_sim=cosine_sim, top_n=5):
    """Return the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact movie title as it appears in ``data['Title']``. If the title
        occurs more than once, its first occurrence is used.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose rows and columns follow the row
        order of ``data``.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns "Movies" (title) and "id" (movie_id), ordered from most to
        least similar, with a fresh 0..k-1 index.

    Raises
    ------
    KeyError
        If ``title`` is not present in the lookup table.
    """
    idx = indices[title]

    # Pairwise similarity scores of all movies with that movie. The queried
    # movie is removed explicitly rather than assumed to sort first, which
    # would be wrong when another row ties with it at the top.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Highest similarity first; the sort is stable, so ties keep row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [position for position, _ in sim_scores[:top_n]]

    final_df = pd.DataFrame({
        "Movies": data['Title'].iloc[movie_indices],
        "id": data['movie_id'].iloc[movie_indices],
    })
    return final_df.reset_index(drop=True)

In [9]:
# Spot check of the recommender on a recent sequel
get_recommendations(title='Venom: Let There Be Carnage')

Unnamed: 0,Movies,id
0,Venom,20
1,The Silence of the Lambs,196
2,Escape from New York,919
3,Birdman or (The Unexpected Virtue of Ignorance),892
4,Let Him Go,450


In [10]:
# Spot check of the recommender on a franchise title
get_recommendations(title='Spider-Man: Far from Home')

Unnamed: 0,Movies,id
0,Spider-Man: No Way Home,36
1,Spider-Man: Homecoming,227
2,Spider-Man: Into the Spider-Verse,233
3,The Amazing Spider-Man 2,500
4,Spider-Man,138


In [11]:
# Spot check of the recommender on an older crime film
get_recommendations(title='Reservoir Dogs')

Unnamed: 0,Movies,id
0,No Sudden Move,346
1,The Descent,783
2,Kill Bill: Vol. 1,440
3,The Hateful Eight,433
4,Pulp Fiction,112


In [12]:
data.info()
# Drop everything except movie_id, Title and important_features before export.
new = data.drop(columns=['Year of Release','Watch Time','Genre','Movie Rating','Metascore of movie','Director','Cast','Votes','Description'])

# Write both artefacts through context managers so each file is flushed and
# closed as soon as it is written, even if pickling raises.
with open('movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(new, movie_list_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(cosine_sim, similarity_file)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movie_id            1000 non-null   int64  
 1   Title               1000 non-null   object 
 2   Year of Release     999 non-null    object 
 3   Watch Time          952 non-null    object 
 4   Genre               1000 non-null   object 
 5   Movie Rating        943 non-null    float64
 6   Metascore of movie  909 non-null    float64
 7   Director            1000 non-null   object 
 8   Cast                1000 non-null   object 
 9   Votes               943 non-null    object 
 10  Description         1000 non-null   object 
 11  important_features  1000 non-null   object 
dtypes: float64(2), int64(1), object(9)
memory usage: 93.9+ KB


In [13]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-0.89.0-py2.py3-none-any.whl (8.3 MB)
Collecting validators
  Downloading validators-0.18.2-py3-none-any.whl (19 kB)
Collecting protobuf!=3.11,>=3.6.0
  Downloading protobuf-3.18.0-cp38-cp38-win_amd64.whl (912 kB)
Collecting altair>=3.2.0
  Downloading altair-4.1.0-py3-none-any.whl (727 kB)
Collecting base58
  Downloading base58-2.1.0-py3-none-any.whl (5.6 kB)
Collecting pydeck>=0.1.dev5
  Downloading pydeck-0.7.0-py2.py3-none-any.whl (4.3 MB)
Collecting cachetools>=4.0
  Downloading cachetools-4.2.4-py3-none-any.whl (10 kB)
Collecting blinker
  Downloading blinker-1.4.tar.gz (111 kB)
Collecting astor
  Downloading astor-0.8.1-py2.py3-none-any.whl (27 kB)
Collecting tzlocal
  Downloading tzlocal-3.0-py3-none-any.whl (16 kB)
Collecting gitpython!=3.1.19
  Downloading GitPython-3.1.24-py3-none-any.whl (180 kB)
Collecting pyarrow
  Downloading pyarrow-5.0.0-cp38-cp38-win_amd64.whl (14.5 MB)
Collecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.7-py

Installing collected packages: smmap, tzdata, gitdb, backports.zoneinfo, validators, tzlocal, pydeck, pyarrow, protobuf, gitpython, cachetools, blinker, base58, astor, altair, streamlit
Successfully installed altair-4.1.0 astor-0.8.1 backports.zoneinfo-0.2.1 base58-2.1.0 blinker-1.4 cachetools-4.2.4 gitdb-4.0.7 gitpython-3.1.24 protobuf-3.18.0 pyarrow-5.0.0 pydeck-0.7.0 smmap-4.0.0 streamlit-0.89.0 tzdata-2021.2.post0 tzlocal-3.0 validators-0.18.2
