In [512]:
import pandas as pd
import numpy as np
import nltk
import re
import ssl
import nltk.corpus
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#### Text Tokenization, Lemmatization, and Removing Stopwords

In [513]:
df = pd.read_csv('movie_summary.csv', index_col=0) # importing the dataset

In [514]:
df.head()

Unnamed: 0,movie_name,summary,genre
0,The Shawshank Redemption,"Over the course of several years, two convicts...","['Drama', '']"
1,The Godfather,The aging patriarch of an organized crime dyna...,"['Crime', 'Drama', '']"
2,The Dark Knight,When the menace known as the Joker wreaks havo...,"['Action', 'Crime', 'Drama', '']"
3,The Godfather Part II,The early life and career of Vito Corleone in ...,"['Crime', 'Drama', '']"
4,12 Angry Men,The jury in a New York City murder trial is fr...,"['Crime', 'Drama', '']"


In [515]:
# downloading packages for nltk
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/sagi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sagi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/sagi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [516]:
# implementing text cleaner function
def text_cleaner(text, to_lower=True):
    # Remove all non-alphabetic characters, except '$', and spaces
    clean_text = re.sub(r'[^a-zA-Z\s$]', '', str(text))
    # Convert the text to lowercase
    if to_lower:
        clean_text = clean_text.lower()
    return clean_text.strip()

In [517]:
# clean and tokenize the summary column
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(text_cleaner(text)) # tokenization
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens] # lemmatization
    clean_tokens = [token for token in lemmatized_tokens if token.isalpha() and token not in stop_words] # cleaning from stopwords
    return ' '.join(clean_tokens)


In [518]:
# preprocessing
df['summary'] = df['summary'].apply(preprocess_text)

# cleaning genre column
df.genre = df.genre.apply(lambda x: text_cleaner(x, to_lower=False))
df.genre = df.genre.apply(lambda x: x.replace(' ', ', '))

In [519]:
df.head()

Unnamed: 0,movie_name,summary,genre
0,The Shawshank Redemption,course several year two convict form friendshi...,Drama
1,The Godfather,aging patriarch organized crime dynasty transf...,"Crime, Drama"
2,The Dark Knight,menace known joker wreaks havoc chaos people g...,"Action, Crime, Drama"
3,The Godfather Part II,early life career vito corleone new york city ...,"Crime, Drama"
4,12 Angry Men,jury new york city murder trial frustrated sin...,"Crime, Drama"


#### TF-IDF Vectorization and Cosine Similarity

In [520]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [521]:
# creating vector matrix
vectorizer = TfidfVectorizer() 
tfidf_matrix = vectorizer.fit_transform(df.summary)

In [522]:
# calculate cosine similarity between documents
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

#### Movie Recomendation System

In [523]:
# creating indices from the dataset based on movie_name
indices = pd.Series(df.index, index=df['movie_name']).drop_duplicates() 

def get_recommendations(title): # recomendation function
    # get the index of the movie that matches the title
    idx = indices[title]
    
    # get the pairwsie similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # get the scores for 10 most similar movies
    sim_scores = sim_scores[1:11]
    
    # get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # print the top 10 most similar movies
    print('Recommended movies:')
    print(df[['movie_name', 'genre']].iloc[movie_indices])

In [524]:
get_recommendations("The Dark Knight")

Recommended movies:
                     movie_name                     genre
70        The Dark Knight Rises   Action, Drama, Thriller
129               Batman Begins      Action, Crime, Drama
238       The Battle of Algiers                Drama, War
80                        Joker    Crime, Drama, Thriller
122           L.A. Confidential     Crime, Drama, Mystery
151           Kill Bill: Vol. 1   Action, Crime, Thriller
210                        Room           Drama, Thriller
162              V for Vendetta      Action, Drama, SciFi
28   Terminator 2: Judgment Day  Action, Adventure, SciFi
90          Requiem for a Dream                     Drama
