In [1]:
import pandas as pd 
import numpy as np
import nltk
import pickle
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the dataset; the path is relative to the notebook's working directory.
df=pd.read_csv('imdb_top_1000.csv')

In [3]:
# Preview the first five rows.
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [4]:
# List the column names of the raw frame.
df.columns

Index(['Poster_Link', 'Series_Title', 'Released_Year', 'Certificate',
       'Runtime', 'Genre', 'IMDB_Rating', 'Overview', 'Meta_score', 'Director',
       'Star1', 'Star2', 'Star3', 'Star4', 'No_of_Votes', 'Gross'],
      dtype='object')

In [5]:
# Inspect column dtypes; Released_Year, Runtime and Gross are stored as
# object rather than as numeric types.
df.dtypes

Poster_Link       object
Series_Title      object
Released_Year     object
Certificate       object
Runtime           object
Genre             object
IMDB_Rating      float64
Overview          object
Meta_score       float64
Director          object
Star1             object
Star2             object
Star3             object
Star4             object
No_of_Votes        int64
Gross             object
dtype: object

In [6]:
# Released_Year contains the non-numeric value 'PG'; overwrite it with 1995
# so the column can be cast to int afterwards.
# NOTE(review): 1995 is hard-coded and applied to every row matching 'PG' —
# confirm it is the correct year for each affected title.
df.loc[df['Released_Year']=='PG',"Released_Year"]=1995

In [7]:
# Cast Released_Year from object to int so movie_era can compare it
# numerically.
df['Released_Year']=df['Released_Year'].astype(int)

In [8]:
def movie_era(year):
    """Map a release year to the name of the cinema era it falls in.

    Parameters
    ----------
    year : int
        Release year to classify.

    Returns
    -------
    str
        The label of the era whose inclusive year range contains ``year``,
        or ``"Year out of range"`` when no range between 1920 and 2020
        matches.
    """
    # (first year, last year, label); both bounds are inclusive.
    eras = (
        (1920, 1929, "Silent Era"),
        (1930, 1949, "Golden Age"),
        (1950, 1969, "Post-war Era"),
        (1970, 1989, "New Hollywood / Blockbuster Era"),
        (1990, 2009, "Modern Cinema"),
        (2010, 2020, "Contemporary Cinema"),
    )
    for first_year, last_year, label in eras:
        if first_year <= year <= last_year:
            return label
    return "Year out of range"

In [9]:
# Derive an era label for every row from its release year.
df["movie_era"]=df['Released_Year'].apply(movie_era)

In [10]:
# Count missing values per column.
df.isna().sum()

Poster_Link        0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            169
movie_era          0
dtype: int64

In [11]:
# Fill missing Certificate values with the most frequent certificate
# (mode()[0] picks the first mode if several values tie).
df['Certificate']=df['Certificate'].fillna(df['Certificate'].mode()[0])

In [12]:
# Confirm that the movie_era column has been added.
df.columns

Index(['Poster_Link', 'Series_Title', 'Released_Year', 'Certificate',
       'Runtime', 'Genre', 'IMDB_Rating', 'Overview', 'Meta_score', 'Director',
       'Star1', 'Star2', 'Star3', 'Star4', 'No_of_Votes', 'Gross',
       'movie_era'],
      dtype='object')

In [13]:
# Build one space-separated feature string per movie from the columns below.
# Star3 and Star4 are deliberately left out, as in the original expression.
feature_columns = [
    'Series_Title',
    'Certificate',
    'Genre',
    'Director',
    'Star1',
    'Star2',
    'movie_era',
]
x = df[feature_columns[0]]
for column in feature_columns[1:]:
    # Same left-to-right concatenation as chaining the columns with `+`.
    x = x + " " + df[column]

In [14]:
from nltk.stem import WordNetLemmatizer
# Module-level lemmatizer instance; preprocess() below uses it for every token.
lemmatizer=WordNetLemmatizer()
from string import punctuation
from nltk.corpus import stopwords
def preprocess(text):
    """Normalise a text string before vectorisation.

    Lower-cases ``text``, deletes every character found in
    ``string.punctuation``, then lemmatizes each whitespace-separated token
    first as a verb and then as a noun.

    Stopwords are NOT removed by this function. An earlier revision loaded
    the NLTK English stopword list on every call without ever using it; that
    dead per-call load has been dropped, which leaves the returned string
    unchanged.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    # Punctuation is deleted rather than replaced by a space, so a hyphenated
    # word such as "spider-man" collapses into the single token "spiderman".
    no_punct = "".join(ch for ch in text.lower() if ch not in punctuation)
    verb_pass = " ".join(lemmatizer.lemmatize(tok, "v") for tok in no_punct.split())
    return " ".join(lemmatizer.lemmatize(tok, "n") for tok in verb_pass.split())

In [15]:
# Normalise every combined feature string (lower-case, strip punctuation,
# lemmatize) before vectorisation.
x=x.apply(preprocess)

Vectorization

In [16]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
# Fit a TF-IDF model with default settings on the preprocessed feature
# strings; x_vector holds one row per entry of x.
# NOTE(review): CountVectorizer is imported but not used in the visible cells.
vectorizer=TfidfVectorizer()
x_vector=vectorizer.fit_transform(x)

In [17]:
# Densify the sparse TF-IDF matrix. The guard keeps this cell re-runnable:
# after the first run x_vector is already a NumPy array, which has no
# .toarray() method, so an unguarded second run would raise AttributeError.
if hasattr(x_vector, "toarray"):
    x_vector = x_vector.toarray()

In [18]:
# Map each title to its TF-IDF row. The row is wrapped in a one-element list
# so recommend() can pass it straight to cosine_similarity as 2-D input.
# Rows that share a title overwrite one another: the last one wins.
movie_dictionary = {
    title: [vector] for title, vector in zip(df['Series_Title'], x_vector)
}

In [19]:
def recommend(movie,n=5):
    """Return the titles of the ``n`` movies most similar to ``movie``.

    Similarity is the cosine similarity between the vector stored for
    ``movie`` in ``movie_dictionary`` and every other stored vector.

    Parameters
    ----------
    movie : str
        Title to look up; it must be a key of ``movie_dictionary``.
    n : int, default 5
        Number of titles to return.

    Returns
    -------
    list of str or None
        Titles ordered from most to least similar, or ``None`` when
        ``movie`` is not a key of ``movie_dictionary``.
    """
    if movie not in movie_dictionary:
        return None

    query_vector = movie_dictionary[movie]
    # Score every other title against the query; the query itself is skipped.
    scored = [
        (title, cosine_similarity(query_vector, vector)[0][0])
        for title, vector in movie_dictionary.items()
        if title != movie
    ]
    # Stable sort, highest similarity first (ties keep dictionary order).
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [title for title, _ in scored[:n]]

In [20]:
# df

In [21]:
# Example: the five titles most similar to the given movie (default n=5).
recommend("Breakfast at Tiffany's")

['Sabrina', 'Charade', 'Roman Holiday', 'My Fair Lady', 'Wait Until Dark']

In [22]:
# Look up the poster URL for a title. .iloc[0] takes the first matching row
# and raises IndexError when no row has that title.
df.loc[df["Series_Title"]=='Apocalypse Now',"Poster_Link"].iloc[0]

'https://m.media-amazon.com/images/M/MV5BMDdhODg0MjYtYzBiOS00ZmI5LWEwZGYtZDEyNDU4MmQyNzFkXkEyXkFqcGdeQXVyNzkwMjQ5NzM@._V1_UX67_CR0,0,67,98_AL_.jpg'

In [24]:
import pickle

# Bundle the title -> vector lookup and the processed DataFrame into a single
# pickle file written to the working directory.
data = {'movie_dict': movie_dictionary, 'df': df}
with open("movie_dict.pkl", 'wb') as pickle_file:
    pickle.dump(data, pickle_file)