In [9]:
import os
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords , wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Location of the cleaned movie dataset. The default is the original author's
# local path; set the CINEMATCH_DATA environment variable to point the notebook
# at the file on another machine without editing this cell.
DATA_PATH = os.environ.get(
    'CINEMATCH_DATA',
    '/Users/sarthaksharna/CineMatch/data/cleaned_data.csv',
)

# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests
# the CSV was written together with its index -- confirm whether it should be
# dropped (e.g. via index_col=0) before the frame is pickled.
df = pd.read_csv(DATA_PATH)

df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [5]:
# English stop words as a set so the membership test in preprocess() is O(1).
# Requires the NLTK 'stopwords' corpus to be available locally.
stop_words = set(stopwords.words('english'))

Mapping NLTK POS tags to WordNet POS for better lemmatization

In [6]:
# Single shared lemmatizer instance, reused by preprocess() for every word.
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Translate an NLTK POS tag into the WordNet POS constant for lemmatizing.

    Only the tag's leading letter matters: ``J`` -> adjective, ``V`` -> verb,
    ``R`` -> adverb. Tags starting with ``N`` and every unrecognised tag map
    to noun.
    """
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('R', 'ADV'))
    for prefix, attr in prefix_to_attr:
        if tag.startswith(prefix):
            return getattr(wordnet, attr)
    # Nouns and anything unrecognised share the same result.
    return wordnet.NOUN

In [7]:
def preprocess(text):
    """Normalise a free-text tag string into space-separated lemmas.

    Steps: lower-case, strip every character that is not an ASCII letter,
    digit or whitespace, tokenize, POS-tag, drop English stop words, and
    lemmatize each remaining word using its POS tag.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. the NaN pandas produces for an
        empty CSV cell, or None) and blank strings are treated as "no text".

    Returns
    -------
    str
        The processed words joined by single spaces; '' when there is no text.
    """
    # Guard first: .lower() would raise AttributeError on NaN/None, and a
    # blank string produces no tokens anyway.
    if not isinstance(text, str) or not text.strip():
        return ''

    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    words = word_tokenize(text)
    # Tag the full token sequence before removing stop words so the tagger
    # still sees each word in its original context.
    words = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in nltk.pos_tag(words)
        if word not in stop_words
    ]
    return ' '.join(words)


# Overwrites the raw 'tags' column with its preprocessed form; re-running this
# cell feeds already-processed text through preprocess() again.
df['tags'] = df['tags'].apply(preprocess)



In [8]:
# Spot-check the preprocessed tags of the row labelled 0.
df.loc[0, 'tags']

'22nd century paraplegic marine dispatch moon pandora unique mission become torn follow order protect alien civilization action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron'

In [10]:
# Bag-of-words model whose vocabulary is capped at 5000 terms.
bow = CountVectorizer(max_features=5000)

# fit_transform yields a sparse count matrix; convert it to a dense array.
tag_counts = bow.fit_transform(df['tags'])
vectors = tag_counts.toarray()

In [11]:
# First five rows of the count matrix.
vectors[:5]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [13]:
# (rows, vocabulary size) of the count matrix.
vectors.shape

(4806, 5000)

In [24]:
# Peek at the first five vocabulary terms learned by the vectorizer.
bow.get_feature_names_out()[:5].tolist()

['007', '10', '100', '10yearold', '11']

In [25]:
# Cosine similarity between every pair of rows of `vectors`; entry [i, j] is
# the similarity of row i to row j.
similarity = cosine_similarity(vectors)

In [26]:
# Display the similarity matrix.
similarity

array([[1.        , 0.08346223, 0.08346223, ..., 0.04415108, 0.02564946,
        0.        ],
       [0.08346223, 1.        , 0.08823529, ..., 0.046676  , 0.        ,
        0.        ],
       [0.08346223, 0.08823529, 1.        , ..., 0.046676  , 0.        ,
        0.        ],
       ...,
       [0.04415108, 0.046676  , 0.046676  , ..., 1.        , 0.04303315,
        0.02028602],
       [0.02564946, 0.        , 0.        , ..., 0.04303315, 1.        ,
        0.02357023],
       [0.        , 0.        , 0.        , ..., 0.02028602, 0.02357023,
        1.        ]])

In [27]:
# The similarity matrix is square: one row and one column per row of `vectors`.
similarity.shape

(4806, 4806)

In [43]:
def recommend_top_5(movie_name):
    """Return the five movies most similar to ``movie_name``.

    The title is matched case-insensitively against ``df['title']``; when
    several rows share the title, the first one is used. Candidates are ranked
    by their score in that row of the global ``similarity`` matrix, highest
    first, with ties kept in row order.

    Parameters
    ----------
    movie_name : str
        Title to look up.

    Returns
    -------
    tuple
        ``("Success", [(row_position, title), ...])`` with up to five
        recommendations, or ``("Movie not found", [])`` when no title matches.
    """
    wanted = movie_name.lower()
    titles = df['title']

    # Positional row of the first matching title, so it lines up with the rows
    # of `similarity` regardless of the frame's index labels.
    index = next(
        (pos for pos, title in enumerate(titles.str.lower()) if title == wanted),
        None,
    )
    if index is None:
        return "Movie not found", []

    distances = similarity[index]

    # Exclude the queried movie by position instead of assuming it sorts first:
    # another row with an equal (or marginally higher) score could otherwise
    # push the query itself into the results and drop a real match.
    candidates = (pos for pos in range(len(distances)) if pos != index)
    top_positions = sorted(candidates, key=lambda pos: distances[pos], reverse=True)[:5]

    recommended_movies = [(pos, titles.iloc[pos]) for pos in top_positions]
    return "Success", recommended_movies




In [44]:
# Example lookup; title matching is case-insensitive.
recommend_top_5("Toy story")

('Success',
 [(343, 'Toy Story 2'),
  (42, 'Toy Story 3'),
  (2870, 'For Your Consideration'),
  (744, 'The Lego Movie'),
  (4258, 'Growing Up Smith')])

In [45]:
# Serialise the processed frame and the similarity matrix into the current
# working directory. `with` closes each file handle as soon as the dump
# finishes, so the data is flushed to disk and no descriptor is leaked.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(df, movies_file)

with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)
