# Movie recommendation system

In [4]:
!pip install pandas nltk scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Exploring Dataset

In [5]:
import pandas as pd

# Load the movie metadata; the path is relative to the notebook's working directory.
df = pd.read_csv("../dataset/movies.csv")
# Preview the first five rows.
df.head()

Unnamed: 0,id,title,description,release_date,rating,vote_count,genres,actors,director
0,27205,Inception,"Cobb, a skilled thief who commits corporate es...",2010-07-15,8.368,35811,"Action, Science Fiction, Adventure","Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W...",Christopher Nolan
1,157336,Interstellar,The adventures of a group of explorers who mak...,2014-11-05,8.434,34465,"Adventure, Drama, Science Fiction","Matthew McConaughey, Anne Hathaway, Michael Ca...",Christopher Nolan
2,155,The Dark Knight,Batman raises the stakes in his war on crime. ...,2008-07-16,8.515,32012,"Drama, Action, Crime, Thriller","Christian Bale, Heath Ledger, Michael Caine, G...",Christopher Nolan
3,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",2009-12-15,7.581,30907,"Action, Adventure, Fantasy, Science Fiction","Sam Worthington, Zoe Saldaña, Sigourney Weaver...",James Cameron
4,24428,The Avengers,When an unexpected enemy emerges and threatens...,2012-04-25,7.714,30090,"Science Fiction, Action, Adventure","Robert Downey Jr., Chris Evans, Mark Ruffalo, ...",Joss Whedon


In [6]:
# (rows, columns) of the loaded frame.
df.shape

(10000, 9)

In [7]:
# Column names available for building the text features.
df.columns

Index(['id', 'title', 'description', 'release_date', 'rating', 'vote_count',
       'genres', 'actors', 'director'],
      dtype='object')

# Preprocessing

In [8]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Download required data.
# word_tokenize needs the Punkt tokenizer models. Recent NLTK releases look up
# the 'punkt_tab' resource instead of the pickled 'punkt' one, so fetch both to
# work on old and new versions alike (an unknown resource is reported by
# nltk.download, not raised).
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
# WordNet is not needed by the PorterStemmer used below; it is still fetched so
# the local NLTK data directory keeps the same contents as before.
nltk.download('wordnet')

# Setup preprocessing tools
# A set gives O(1) membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


def clean_text(text):
    """Normalise free text into a space-separated string of stemmed keywords.

    The text is lower-cased, every character that is not an ASCII letter,
    digit or whitespace is removed, whitespace runs are collapsed, and the
    result is tokenized. English stop words are then dropped and each
    remaining token is reduced to its Porter stem.

    Args:
        text: The raw string to clean (must be a ``str``).

    Returns:
        The surviving stems joined by single spaces.
    """
    normalized = text.lower().strip()
    # Characters are deleted, not replaced by a space, so "spider-man"
    # becomes the single token "spiderman".
    normalized = re.sub(r'[^a-zA-Z0-9\s]', '', normalized)
    normalized = re.sub(r'\s+', ' ', normalized).strip()

    stems = []
    for token in word_tokenize(normalized):
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))

    return ' '.join(stems)

[nltk_data] Downloading package punkt to /home/sudarshan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sudarshan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/sudarshan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# Replace missing descriptions with an empty string first: str(NaN) would
# otherwise become the literal word "nan" and end up as a token shared by
# every movie that lacks a description.
df['description'] = df['description'].fillna('').apply(str).apply(clean_text)

In [10]:
# Spot-check that the descriptions are now lower-cased, stop-word-free stems.
df.head()

Unnamed: 0,id,title,description,release_date,rating,vote_count,genres,actors,director
0,27205,Inception,cobb skill thief commit corpor espionag infilt...,2010-07-15,8.368,35811,"Action, Science Fiction, Adventure","Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W...",Christopher Nolan
1,157336,Interstellar,adventur group explor make use newli discov wo...,2014-11-05,8.434,34465,"Adventure, Drama, Science Fiction","Matthew McConaughey, Anne Hathaway, Michael Ca...",Christopher Nolan
2,155,The Dark Knight,batman rais stake war crime help lt jim gordon...,2008-07-16,8.515,32012,"Drama, Action, Crime, Thriller","Christian Bale, Heath Ledger, Michael Caine, G...",Christopher Nolan
3,19995,Avatar,22nd centuri parapleg marin dispatch moon pand...,2009-12-15,7.581,30907,"Action, Adventure, Fantasy, Science Fiction","Sam Worthington, Zoe Saldaña, Sigourney Weaver...",James Cameron
4,24428,The Avengers,unexpect enemi emerg threaten global safeti se...,2012-04-25,7.714,30090,"Science Fiction, Action, Adventure","Robert Downey Jr., Chris Evans, Mark Ruffalo, ...",Joss Whedon


In [11]:
def preprocess_special_values(text):
    """Turn a ", "-separated list of names into one lower-case token per name.

    Spaces inside each entry are removed so that multi-word names (for
    example "Tom Hardy") become a single token ("tomhardy") that a
    bag-of-words vectorizer will not split.

    Args:
        text: The raw cell value. Non-string values are converted with
            ``str``; ``None`` and float NaN are treated as "no value".

    Returns:
        The tokens joined by single spaces, or ``""`` when the value is
        missing or contains no non-blank entries.
    """
    # A missing cell would otherwise be stringified to "nan"/"none" and become
    # a token shared by every movie with missing metadata.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    entries = str(text).split(", ")
    tokens = [entry.replace(" ", "").lower() for entry in entries]
    # Drop blank entries (e.g. from a trailing ", ") so no stray spaces remain.
    return " ".join(token for token in tokens if token)

In [12]:
# Sanity check on a sample cast list: spaces inside a name are removed, so each person becomes one token.
preprocess_special_values("Leonardo DiCaprio, Joseph Gordon-Levitt, Ken Watanabe, Tom Hardy, Elliot Page")

'leonardodicaprio josephgordon-levitt kenwatanabe tomhardy elliotpage'

In [13]:
# Collapse each multi-valued metadata column into one token per entry.
for column in ('genres', 'actors', 'director'):
    df[column] = df[column].apply(preprocess_special_values)

In [14]:
# Verify that genres, actors and director now hold space-separated single-token names.
df.head()

Unnamed: 0,id,title,description,release_date,rating,vote_count,genres,actors,director
0,27205,Inception,cobb skill thief commit corpor espionag infilt...,2010-07-15,8.368,35811,action sciencefiction adventure,leonardodicaprio josephgordon-levitt kenwatana...,christophernolan
1,157336,Interstellar,adventur group explor make use newli discov wo...,2014-11-05,8.434,34465,adventure drama sciencefiction,matthewmcconaughey annehathaway michaelcaine j...,christophernolan
2,155,The Dark Knight,batman rais stake war crime help lt jim gordon...,2008-07-16,8.515,32012,drama action crime thriller,christianbale heathledger michaelcaine garyold...,christophernolan
3,19995,Avatar,22nd centuri parapleg marin dispatch moon pand...,2009-12-15,7.581,30907,action adventure fantasy sciencefiction,samworthington zoesaldaña sigourneyweaver step...,jamescameron
4,24428,The Avengers,unexpect enemi emerg threaten global safeti se...,2012-04-25,7.714,30090,sciencefiction action adventure,robertdowneyjr. chrisevans markruffalo chrishe...,josswhedon


In [15]:
# Append the genre, actor and director tokens to the cleaned description so a
# single text column carries every feature that gets vectorised.
# NOTE: this overwrites `description` in place, so running the cell a second
# time appends the metadata tokens again.
df['description'] = df['description'].str.cat(
    df[['genres', 'actors', 'director']], sep=' '
)

# Embeddings

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
# Bag-of-words term counts over the combined text; max_features caps the
# vocabulary at the 5000 most frequent terms across the corpus.
desc_vectorizer = CountVectorizer(max_features=5000)
# Learn the vocabulary and build the movie-by-term count matrix in one pass.
desc_vector = desc_vectorizer.fit_transform(df['description'])

In [18]:
# Wrap the SciPy sparse matrix in a sparse-backed DataFrame.
# NOTE(review): this rebinds `desc_vector`, so re-running this cell on its own
# passes the DataFrame back into from_spmatrix; re-run the vectorizer cell first.
desc_vector = pd.DataFrame.sparse.from_spmatrix(desc_vector)

In [19]:
# Inspect the count matrix: one row per movie, one column per vocabulary term.
desc_vector

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Model Training

In [20]:
from sklearn.neighbors import NearestNeighbors

# Cosine distance compares the direction of the term-count vectors, not their length.
# n_neighbors=10 is only the default for queries that do not pass their own value.
knn = NearestNeighbors(n_neighbors=10, metric='cosine')
# Rows of desc_vector follow the row order of df, so neighbour indices returned
# by the model can be used as positional indices into df.
knn.fit(desc_vector)


0,1,2
,"n_neighbors  n_neighbors: int, default=5 Number of neighbors to use by default for :meth:`kneighbors` queries.",10
,"radius  radius: float, default=1.0 Range of parameter space to use by default for :meth:`radius_neighbors` queries.",1.0
,"algorithm  algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' Algorithm used to compute the nearest neighbors: - 'ball_tree' will use :class:`BallTree` - 'kd_tree' will use :class:`KDTree` - 'brute' will use a brute-force search. - 'auto' will attempt to decide the most appropriate algorithm  based on the values passed to :meth:`fit` method. Note: fitting on sparse input will override the setting of this parameter, using brute force.",'auto'
,"leaf_size  leaf_size: int, default=30 Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem.",30
,"metric  metric: str or callable, default='minkowski' Metric to use for distance computation. Default is ""minkowski"", which results in the standard Euclidean distance when p = 2. See the documentation of `scipy.spatial.distance `_ and the metrics listed in :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric values. If metric is ""precomputed"", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only ""nonzero"" elements may be considered neighbors. If metric is a callable function, it takes two arrays representing 1D vectors as inputs and must return one value indicating the distance between those vectors. This works for Scipy's metrics, but is less efficient than passing the metric name as a string.",'cosine'
,"p  p: float (positive), default=2 Parameter for the Minkowski metric from sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.",2
,"metric_params  metric_params: dict, default=None Additional keyword arguments for the metric function.",
,"n_jobs  n_jobs: int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",


# Using Model

In [21]:
def recommend_movies(id, no_of_movies=5):
    """Return the movies most similar to the movie with the given ``id``.

    The stored (already preprocessed) description of the movie is vectorised
    with ``desc_vectorizer`` and looked up in the fitted ``knn`` index.

    Args:
        id: Value of the ``id`` column identifying the query movie.
        no_of_movies: Maximum number of recommendations to return.

    Returns:
        A list of ``{'id', 'title', 'confidence'}`` dicts ordered from most
        to least similar. The list is empty when ``id`` is unknown or
        ``no_of_movies`` is not positive, and may be shorter than
        ``no_of_movies`` when the dataset is too small.
    """
    if no_of_movies <= 0:
        return []

    # Positional row(s) of the query movie; the index was fitted in df row order.
    positions = (df['id'] == id).to_numpy().nonzero()[0]
    if positions.size == 0:
        return []

    query_text = df['description'].iloc[positions[0]]
    query_vector = desc_vectorizer.transform([query_text])

    # Request enough extra neighbours to cover the query movie's own row(s),
    # but never more than the index holds (kneighbors raises otherwise).
    k = min(no_of_movies + positions.size, knn.n_samples_fit_)
    distances, indices = knn.kneighbors(query_vector, n_neighbors=k)
    distances, indices = distances[0], indices[0]

    # Remove the query movie by id instead of assuming it is ranked first:
    # with duplicate or tied vectors another row can take position 0.
    keep = df['id'].to_numpy()[indices] != id
    distances = distances[keep][:no_of_movies]
    idxs = indices[keep][:no_of_movies]

    # convert distance -> similarity/confidence: a monotone mapping where a
    # cosine distance of 0 gives 1.0 and larger distances give smaller values.
    confidences = 1 / (1 + distances)

    recs = df.iloc[idxs][['id', 'title']].copy()
    recs['confidence'] = confidences.round(2)

    return recs.to_dict(orient='records')

In [22]:
# Example query: ten recommendations for the movie whose id is 2882.
recommend_movies(2882, 10)

[{'id': 522098, 'title': 'Babyteeth', 'confidence': 0.61},
 {'id': 417678, 'title': 'Everything, Everything', 'confidence': 0.61},
 {'id': 20, 'title': 'My Life Without Me', 'confidence': 0.61},
 {'id': 367544, 'title': 'The Spirit of Christmas', 'confidence': 0.61},
 {'id': 47735, 'title': 'Summer with Monika', 'confidence': 0.61},
 {'id': 459,
  'title': 'Sissi: The Fateful Years of an Empress',
  'confidence': 0.6},
 {'id': 4254, 'title': 'Kal Ho Naa Ho', 'confidence': 0.6},
 {'id': 664413, 'title': '365 Days', 'confidence': 0.6},
 {'id': 416477, 'title': 'The Big Sick', 'confidence': 0.6},
 {'id': 763148, 'title': 'Time Is Up', 'confidence': 0.59}]

# Saving the model

In [24]:
import joblib

# Persist the processed frame, the fitted vectorizer and the fitted neighbour
# index; the target directory must already exist.
# NOTE(review): joblib files are pickle-based — presumably loaded by the app
# under ../app; load them only from a trusted source and with library versions
# matching the ones used here.
joblib.dump(df, "../app/artifacts/movies.pkl")
joblib.dump(desc_vectorizer, "../app/artifacts/vectorizer.pkl")
joblib.dump(knn, "../app/artifacts/knn.pkl")

['../app/artifacts/knn.pkl']