In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer as tf_idf
import re
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [4]:
# Load the movie table from the MOVIE.xlsx workbook into a DataFrame.
movie = pd.read_excel('MOVIE.xlsx')


In [5]:
# Load the genre table (GENREID -> GENRE name) and the MOVIE_GENRE table that is
# joined to it on GENREID further down.
genre = pd.read_excel('GENRE.xlsx')
movie_genre = pd.read_excel('MOVIE_GENRE.xlsx')

In [6]:
# Show the genre table's columns, dtypes and non-null counts.
genre.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       19 non-null     int64 
 1   GENREID  19 non-null     int64 
 2   GENRE    19 non-null     object
dtypes: int64(2), object(1)
memory usage: 588.0+ bytes


In [7]:
# List the column names of the movie table.
movie.columns

Index(['FILMID', 'TITLE', 'BUDGET', 'ORIGINAL_LANGUAGE', 'ORIGINAL_TITLE',
       'OVERVIEW', 'RELEASE_DATE', 'REVENUE', 'RUNTIME', 'VOTE_AVERAGE',
       'VOTE_COUNT', 'STATUS_', 'TAGLINE'],
      dtype='object')

In [8]:
# Join genre names to the film/genre pairs on GENREID, then discard the two
# ID columns that the merge suffixed with _x / _y.
genre = (
    genre
    .merge(movie_genre, how='inner', on='GENREID')
    .drop(columns=['ID_x', 'ID_y'])
)

In [9]:
# Terminate every genre name with a space so that names stay separated when
# they are concatenated per film.
genre['GENRE'] = genre['GENRE'] + ' '

In [10]:
# Collapse to one row per FILMID: summing the string column concatenates all
# genre names of a film into a single string.
genre = genre.groupby('FILMID')[['GENRE']].sum().reset_index()

In [11]:
# Inner-join the per-film genre string onto the movie table; rows whose FILMID
# is missing from either side are dropped.
movie = genre.merge(movie, on = 'FILMID', how = 'inner')

In [12]:
# Show columns, dtypes and non-null counts of the merged movie table.
movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138932 entries, 0 to 138931
Data columns (total 14 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   FILMID             138932 non-null  int64  
 1   GENRE              138932 non-null  object 
 2   TITLE              138932 non-null  object 
 3   BUDGET             138932 non-null  int64  
 4   ORIGINAL_LANGUAGE  138932 non-null  object 
 5   ORIGINAL_TITLE     138932 non-null  object 
 6   OVERVIEW           119344 non-null  object 
 7   RELEASE_DATE       138932 non-null  object 
 8   REVENUE            138932 non-null  int64  
 9   RUNTIME            129399 non-null  float64
 10  VOTE_AVERAGE       138932 non-null  float64
 11  VOTE_COUNT         138932 non-null  int64  
 12  STATUS_            138932 non-null  object 
 13  TAGLINE            37897 non-null   object 
dtypes: float64(2), int64(4), object(8)
memory usage: 14.8+ MB


In [13]:
# Count missing values per column.
movie.isna().sum()

FILMID                    0
GENRE                     0
TITLE                     0
BUDGET                    0
ORIGINAL_LANGUAGE         0
ORIGINAL_TITLE            0
OVERVIEW              19588
RELEASE_DATE              0
REVENUE                   0
RUNTIME                9533
VOTE_AVERAGE              0
VOTE_COUNT                0
STATUS_                   0
TAGLINE              101035
dtype: int64

In [14]:
# Keep only the movies that have an overview.
# dropna(subset=...) filters by label, so it stays correct even if the index is
# not a clean 0..n-1 range (the previous iloc call was fed index labels).
movie = movie.dropna(subset=['OVERVIEW'])

In [15]:
# Replace every remaining missing value with a single space, so that text
# columns such as TAGLINE can be concatenated as strings later on.
# NOTE(review): this applies to all columns, so missing RUNTIME values are
# filled with ' ' as well — confirm that this is intended.
movie = movie.fillna(' ')

In [16]:
# Parse RELEASE_DATE (formatted as YYYY-MM-DD) into datetime values.
movie['RELEASE_DATE'] = pd.to_datetime(movie['RELEASE_DATE'], format='%Y-%m-%d')

In [17]:
# Order the rows from the latest release date to the earliest.
movie = movie.sort_values(by=['RELEASE_DATE'], ascending=False)

In [None]:
# Lower-case each overview, strip the placeholder text 'plot unknown', and mark
# overviews that are left completely empty as NaN.
movie['OVERVIEW'] = (
    movie['OVERVIEW']
    .str.lower()
    .str.replace('plot unknown', '', regex=False)
    .mask(lambda text: text == '', np.nan)
)



In [19]:
# Discard every row that still contains a missing value.
movie = movie.dropna()

In [21]:
# Build one text field per movie by joining genre, overview, tagline and title
# with single spaces; this is the text that gets lemmatized and vectorized.
movie['Tags'] = movie['GENRE'] + ' '  + movie['OVERVIEW'] + ' ' + movie['TAGLINE'] + ' ' + movie['TITLE']

In [22]:
# Normalize the combined text to lower case.
movie['Tags'] = movie['Tags'].str.lower()

In [23]:
# Renumber the rows 0..n-1 so that row labels line up with positions.
# drop=True discards the old labels instead of keeping them as a stray 'index'
# column, and rebinding (rather than inplace=True) keeps the cell re-runnable.
movie = movie.reset_index(drop=True)

In [26]:
# Work with the first 10000 rows only, to limit memory use (the frame is sorted
# by release date, newest first).
# .copy() makes the subset an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
movie = movie.iloc[:10000].copy()

In [27]:
def lemmatize_sentence(sentence):
    """Return ``sentence`` with every token replaced by its WordNet lemma.

    The text is split with ``word_tokenize`` and tagged with ``pos_tag``. The
    first letter of each tag selects the WordNet part of speech handed to the
    lemmatizer (J -> adjective, N -> noun, V -> verb, R -> adverb, anything
    else -> noun). The lemmas are joined back together with single spaces.
    """

    def _to_wordnet_pos(treebank_tag):
        # Only the first letter of the tag is needed to choose the WordNet class.
        by_initial = {
            "J": wordnet.ADJ,
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "R": wordnet.ADV,
        }
        return by_initial.get(treebank_tag[0].upper(), wordnet.NOUN)

    wnl = WordNetLemmatizer()
    tagged_tokens = pos_tag(word_tokenize(sentence))

    lemmas = []
    for token, treebank_tag in tagged_tokens:
        lemmas.append(wnl.lemmatize(token, _to_wordnet_pos(treebank_tag)))
    return " ".join(lemmas)


In [28]:
# Lemmatize the combined text of every movie (one lemmatize_sentence call per row).
movie['Tags_'] = movie['Tags'].apply(lemmatize_sentence) 

In [29]:
# TF-IDF vectorizer: lower-cases the text, removes English stop words and keeps
# at most 7000 terms.
vectorizer = tf_idf(
    lowercase=True,
    stop_words='english',
    max_features=7000,
)

In [30]:
# Fit the vectorizer on the lemmatized text and keep the TF-IDF matrix sparse.
# A dense 10000 x 7000 float array only costs memory here: cosine_similarity
# accepts sparse input and still returns a dense similarity array.
vectors = vectorizer.fit_transform(movie['Tags_'])

In [31]:
# Check the matrix shape: (number of movies, number of TF-IDF terms).
vectors.shape

(10000, 7000)

In [33]:
# Pairwise cosine similarity between all movie vectors; row i holds the
# similarity of movie i to every movie.
similarity = cosine_similarity(vectors) 

In [34]:
# Check that the similarity matrix is square (movies x movies).
similarity.shape

(10000, 10000)

In [None]:
def recommendation(title, data, top_n=9, similarity_matrix=None):
    """Print the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``data['TITLE']``. If several rows share the
        title, the first one is used.
    data : pandas.DataFrame
        Frame with a ``TITLE`` column whose row order matches the rows and
        columns of the similarity matrix.
    top_n : int, default 9
        Number of similar titles to print.
    similarity_matrix : array-like, optional
        Square matrix of pairwise similarity scores. Defaults to the
        notebook-level ``similarity`` array.

    Returns
    -------
    str or None
        ``"Movie not currently in the database"`` when no row has the requested
        title; otherwise ``None`` after the titles have been printed.
    """
    if similarity_matrix is None:
        similarity_matrix = similarity

    titles = data['TITLE']
    # Locate the row by position, not by index label: the similarity matrix is
    # positional, so this stays correct when data's index is not 0..n-1.
    matches = np.flatnonzero((titles == title).to_numpy())
    if matches.size == 0:
        return "Movie not currently in the database"
    movie_index = int(matches[0])

    distances = np.asarray(similarity_matrix[movie_index])
    # Stable descending sort: equal scores keep their original row order.
    ranked = np.argsort(-distances, kind='stable')
    # Remove the query row explicitly instead of assuming it is ranked first;
    # an identical entry elsewhere could tie with it at the top.
    ranked = ranked[ranked != movie_index][:top_n]

    for position in ranked:
        print(titles.iloc[position])

In [40]:
# Peek at the first 100 original titles.
# NOTE(review): recommendation() matches on the TITLE column, not
# ORIGINAL_TITLE, so a name taken from this list is not guaranteed to be found.
movie['ORIGINAL_TITLE'][:100].values

array(['McClane', 'Machete Kills Again... in Space',
       'Shazam! Fury of the Gods', 'Letters from Rosemary',
       'Enzo Ferrari', 'Guardians of the Galaxy Vol. 3', 'Before Memory',
       'National Treasure 3', 'Live Die Repeat and Repeat',
       'Untitled The Adventures of Tintin Sequel', 'Avatar 2',
       'Aquaman 2', 'The Flash', 'Halloween Ends', 'Seuss',
       'Spider-Man: Into the Spider-Verse Sequel', 'Shrek 5',
       'Indiana Jones 5', 'Captain Marvel 2', 'Jurassic World: Dominion',
       'Legally Blonde 3', 'Black Panther II', 'Fast & Furious 10',
       'Doctor Strange in the Multiverse of Madness',
       'The Angry Birds Movie 3: Tropi-Collapse!', 'The Batman',
       'Thor: Love and Thunder', 'The LEGO Batman Movie 2',
       'Sesame Street', 'Scream 5', 'Shrunk', '콘크리트 유토피아',
       'To All the Boys: Always and Forever, Lara Jean',
       'Beverly Hills Cop 4', 'Untitled The Nun Sequel', 'Andorra',
       'Ladybug & Cat Noir Awakening', 'Wanted 2', 'Matin Calme

In [37]:
# Example lookup of a title that is not among the loaded rows: the function
# returns its not-found message.
recommendation("Tangled", data = movie)

'Movie not currently in the database'

In [41]:
# Example lookup of a title that is present: prints the most similar titles.
recommendation("Spider-Man: Into the Spider-Verse Sequel", data = movie)

Spider-Man: Into the Spider-Verse
Lego Spider-Man Series
A Fan's Guide to Spider-Man: Homecoming
RiffTrax Live: Giant Spider Invasion
LEGO Marvel Spider-Man: Vexed By Venom
Spider-Man: Homecoming
Paper Spiders
Spider in the Web
Itsy Bitsy


In [42]:
# Export the processed movie frame; with the default settings the row index is
# written as the first (unnamed) CSV column.
movie.to_csv('data.csv')

In [43]:
# Persist the similarity matrix. The with-block closes the file handle, so the
# pickle is fully flushed to disk before the cell finishes.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)