In [5]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [6]:
df = pd.read_csv('../data/movies_w_tag.csv')

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,overview,genres,tagline,vote_average,popularity,tags
0,0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Animation Comedy Family,,7.7,21.946943,"Led by Woody, Andy's toys live happily in his ..."
1,1,Jumanji,When siblings Judy and Peter discover an encha...,Adventure Fantasy Family,Roll the dice and unleash the excitement!,6.9,17.015539,When siblings Judy and Peter discover an encha...
2,2,Grumpier Old Men,A family wedding reignites the ancient feud be...,Romance Comedy,Still Yelling. Still Fighting. Still Ready for...,6.5,11.7129,A family wedding reignites the ancient feud be...
3,3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Comedy Drama Romance,Friends are the people who let you be yourself...,6.1,3.859495,"Cheated on, mistreated and stepped on, the wom..."
4,4,Father of the Bride Part II,Just when George Banks has recovered from his ...,Comedy,Just When His World Is Back To Normal... He's ...,5.7,8.387519,Just when George Banks has recovered from his ...


In [8]:
df['tags'][0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences. Animation Comedy Family "

## Removing puntuation

In [9]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [10]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    words = text.split()
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [11]:
df['tags'] = df['tags'].apply(preprocess_text)

In [12]:
df['tags'][0]

'led woody andys toy live happily room andys birthday brings buzz lightyear onto scene afraid losing place andys heart woody plot buzz circumstance separate buzz woody owner duo eventually learns put aside difference animation comedy family'

## Convert to vectors

In [13]:
df= df.reset_index(drop= True)

In [14]:
indices = pd.Series(df.index, index = df['title']).drop_duplicates()

In [15]:
indices

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                               ...  
Subdue                         45442
Century of Birthing            45443
Betrayal                       45444
Satan Triumphant               45445
Queerama                       45446
Length: 45447, dtype: int64

In [16]:
tfidf= TfidfVectorizer(max_features = 50000,ngram_range=(1,2), stop_words = 'english')

In [17]:
tfidf_matrix = tfidf.fit_transform(df['tags'])

In [18]:
tfidf_matrix.shape

(45447, 50000)

In [19]:
def recommend(title, n =10):
    if title not in indices:
        return ['Movies not found']

    idx = indices[title]
    sim_scores = list(enumerate(cosine_similarity(tfidf_matrix[idx], tfidf_matrix)[0]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:n+1]
    movie_indices = [i[0] for i in sim_scores]
    return df['title'].iloc[movie_indices]

In [20]:
recommend('Toy Story')

2996                Toy Story 2
15344               Toy Story 3
24512                 Small Fry
28972           Superstar Goofy
17184                 Group Sex
6434     What's Up, Tiger Lily?
11396    For Your Consideration
1071      Rebel Without a Cause
1931                  Condorman
485                      Malice
Name: title, dtype: object

In [22]:
pickle.dump(tfidf_matrix, open('../pickle/tdidf_matrix.pkl', 'wb'))
pickle.dump(indices, open('../pickle/indices.pkl','wb'))

In [23]:
df.to_pickle('df.pkl')

In [24]:
pickle.dump(tfidf, open('../pickle/tfidf.pkl', 'wb'))