# Movie recommandation system

## Exploring Dataset

In [2]:
import pandas as pd

df = pd.read_csv("../dataset/movies.csv")
df.head()

Unnamed: 0,id,title,description,release_date,rating,vote_count,genres,actors,director
0,27205,Inception,"Cobb, a skilled thief who commits corporate es...",2010-07-15,8.368,35811,"Action, Science Fiction, Adventure","Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W...",Christopher Nolan
1,157336,Interstellar,The adventures of a group of explorers who mak...,2014-11-05,8.434,34465,"Adventure, Drama, Science Fiction","Matthew McConaughey, Anne Hathaway, Michael Ca...",Christopher Nolan
2,155,The Dark Knight,Batman raises the stakes in his war on crime. ...,2008-07-16,8.515,32012,"Drama, Action, Crime, Thriller","Christian Bale, Heath Ledger, Michael Caine, G...",Christopher Nolan
3,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",2009-12-15,7.581,30907,"Action, Adventure, Fantasy, Science Fiction","Sam Worthington, Zoe Saldaña, Sigourney Weaver...",James Cameron
4,24428,The Avengers,When an unexpected enemy emerges and threatens...,2012-04-25,7.714,30090,"Science Fiction, Action, Adventure","Robert Downey Jr., Chris Evans, Mark Ruffalo, ...",Joss Whedon


In [3]:
df.shape

(10000, 9)

In [4]:
df.columns

Index(['id', 'title', 'description', 'release_date', 'rating', 'vote_count',
       'genres', 'actors', 'director'],
      dtype='object')

# Preprocessing

In [5]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Download required data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Setup preprocessing tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


def clean_text(text):
    """Complete text preprocessing pipeline"""
    text = text.lower().strip()
    
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    
    tokens = [stemmer.stem(word) for word in tokens]
    
    return ' '.join(tokens)

[nltk_data] Downloading package punkt to /home/sudarshan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sudarshan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/sudarshan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
df['description'] = df['description'].apply(str).apply(clean_text)

In [7]:
df.head()

Unnamed: 0,id,title,description,release_date,rating,vote_count,genres,actors,director
0,27205,Inception,cobb skill thief commit corpor espionag infilt...,2010-07-15,8.368,35811,"Action, Science Fiction, Adventure","Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W...",Christopher Nolan
1,157336,Interstellar,adventur group explor make use newli discov wo...,2014-11-05,8.434,34465,"Adventure, Drama, Science Fiction","Matthew McConaughey, Anne Hathaway, Michael Ca...",Christopher Nolan
2,155,The Dark Knight,batman rais stake war crime help lt jim gordon...,2008-07-16,8.515,32012,"Drama, Action, Crime, Thriller","Christian Bale, Heath Ledger, Michael Caine, G...",Christopher Nolan
3,19995,Avatar,22nd centuri parapleg marin dispatch moon pand...,2009-12-15,7.581,30907,"Action, Adventure, Fantasy, Science Fiction","Sam Worthington, Zoe Saldaña, Sigourney Weaver...",James Cameron
4,24428,The Avengers,unexpect enemi emerg threaten global safeti se...,2012-04-25,7.714,30090,"Science Fiction, Action, Adventure","Robert Downey Jr., Chris Evans, Mark Ruffalo, ...",Joss Whedon


In [8]:
def preprocess_special_values(text):
    text = str(text)
    arr = text.split(", ")
    processed_arr = []
    for a in arr:
        processed_arr.append(a.replace(" ", "").lower())
    return " ".join(processed_arr) 

In [9]:
preprocess_special_values("Leonardo DiCaprio, Joseph Gordon-Levitt, Ken Watanabe, Tom Hardy, Elliot Page")

'leonardodicaprio josephgordon-levitt kenwatanabe tomhardy elliotpage'

In [10]:
df['genres'] = df['genres'].apply(preprocess_special_values)
df['actors'] = df['actors'].apply(preprocess_special_values)
df['director'] = df['director'].apply(preprocess_special_values)

In [11]:
df.head()

Unnamed: 0,id,title,description,release_date,rating,vote_count,genres,actors,director
0,27205,Inception,cobb skill thief commit corpor espionag infilt...,2010-07-15,8.368,35811,action sciencefiction adventure,leonardodicaprio josephgordon-levitt kenwatana...,christophernolan
1,157336,Interstellar,adventur group explor make use newli discov wo...,2014-11-05,8.434,34465,adventure drama sciencefiction,matthewmcconaughey annehathaway michaelcaine j...,christophernolan
2,155,The Dark Knight,batman rais stake war crime help lt jim gordon...,2008-07-16,8.515,32012,drama action crime thriller,christianbale heathledger michaelcaine garyold...,christophernolan
3,19995,Avatar,22nd centuri parapleg marin dispatch moon pand...,2009-12-15,7.581,30907,action adventure fantasy sciencefiction,samworthington zoesaldaña sigourneyweaver step...,jamescameron
4,24428,The Avengers,unexpect enemi emerg threaten global safeti se...,2012-04-25,7.714,30090,sciencefiction action adventure,robertdowneyjr. chrisevans markruffalo chrishe...,josswhedon


In [12]:
df['description'] = df['description'] + " " + df['genres'] + " " + df['actors'] + " "+ df['director']

# Embeddings

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
desc_vectorizer = CountVectorizer(max_features=5000)
desc_vector = desc_vectorizer.fit_transform(df['description'])

In [15]:
desc_vector = pd.DataFrame.sparse.from_spmatrix(desc_vector)

In [16]:
desc_vector

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Model Training

In [17]:
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(n_neighbors=10, metric='cosine')
knn.fit(desc_vector)


0,1,2
,n_neighbors,10
,radius,1.0
,algorithm,'auto'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,


# Using Model

In [18]:
def recommend_movies(index, no_of_movies=5):
    movie_row = df.iloc[index]
    print(f"Selected Movie: {movie_row['title']}")
    query_vector = desc_vectorizer.transform([movie_row['description']])
    distances, indices = knn.kneighbors(query_vector, n_neighbors=no_of_movies+1)  
    recommended_df = df.iloc[indices[0][1:]]  

    return recommended_df

recommend_movies(0, 10)

Selected Movie: Inception


Unnamed: 0,id,title,description,release_date,rating,vote_count,genres,actors,director
416,8373,Transformers: Revenge of the Fallen,sam witwicki leav autobot behind normal life m...,2009-06-19,6.185,8238,sciencefiction action adventure,shialabeouf meganfox joshduhamel tyresegibson ...,michaelbay
400,180,Minority Report,john anderton top precrim cop late21st centuri...,2002-06-20,7.348,8420,sciencefiction action thriller,tomcruise samanthamorton maxvonsydow colinfarr...,stevenspielberg
3763,438590,A-X-L,life teenag boy forev alter chanc encount cut ...,2018-08-23,6.282,1142,sciencefiction action adventure family,alexneustaedter beckyg alexmacnicoll dominicra...,oliverdaly
9537,64956,Inception: The Cobol Job,incept prequel unfold courtesi beauti motion c...,2010-12-07,7.262,300,animation action thriller sciencefiction,leonardodicaprio josephgordon-levitt lukashaas,iankirby
6307,763164,Apex,excop thoma malon serv life sentenc crime didn...,2021-11-12,5.213,558,action thriller sciencefiction,nealmcdonough brucewillis coreylarge alexiafas...,edwarddrake
5618,449992,The Night Comes for Us,spare girl life massacr elit triad assassin ta...,2018-10-05,6.877,657,action crime thriller,joetaslim ikouwais julieestelle sunnypang asha...,timotjahjanto
8212,4283,Primeval,news team sent burundi captur bring home legen...,2007-01-12,6.029,377,adventure horror action,dominicpurcell brookelangton orlandojones jürg...,michaelkatleman
5849,470114,24 Hours to Live,assassin seek redempt given second chanc life ...,2017-10-26,5.9,625,action sciencefiction thriller mystery,ethanhawke xuqing paulanderson rutgerhauer tyr...,briansmrz
2494,154,Star Trek II: The Wrath of Khan,starship enterpris crew pull back action old n...,1982-06-04,7.5,1839,action adventure sciencefiction thriller,williamshatner leonardnimoy deforestkelley jam...,nicholasmeyer
8,118340,Guardians of the Galaxy,light year earth 26 year abduct peter quill fi...,2014-07-30,7.908,27479,action sciencefiction adventure,chrispratt zoesaldaña davebautista vindiesel b...,jamesgunn
