In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv("/home/shigilsasi/code/Guvi_Projects/IMDB_Movie_Recommendation_System_Using_Storylines/imdb_2024_movies.csv")
df.head()

Unnamed: 0,Movie Title,Storyline
0,The Life of Chuck,"A life-affirming, genre-bending story about th..."
1,The Substance,A fading celebrity takes a black-market drug: ...
2,Trap,A father and his teen daughter attend a pop co...
3,Beetlejuice Beetlejuice,"After a family tragedy, three generations of t..."
4,Anora,A young stripper from Brooklyn meets and impul...


In [3]:
df.shape

(10000, 2)

In [4]:
df.isnull().sum()   

Movie Title    0
Storyline      0
dtype: int64

In [5]:
df.dtypes

Movie Title    object
Storyline      object
dtype: object

In [6]:
df.duplicated().sum()

np.int64(2)

In [7]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

np.int64(0)

In [8]:
df['Combined_movie_storyline'] = df['Movie Title'] + " " + df['Storyline']

In [9]:
df['Combined_movie_storyline']

0       The Life of Chuck A life-affirming, genre-bend...
1       The Substance A fading celebrity takes a black...
2       Trap A father and his teen daughter attend a p...
3       Beetlejuice Beetlejuice After a family tragedy...
4       Anora A young stripper from Brooklyn meets and...
                              ...                        
9995    Dame After several years in abject poverty, a ...
9996    Il pirata, memorie da Spoon River No storyline...
9997    KSI & The Pauls: Primed for Success KSI and Th...
9998    The Legal and Underground Scene of Magic Mushr...
9999    Wisconsin Lighthouses Voyage into the past to ...
Name: Combined_movie_storyline, Length: 9998, dtype: object

In [11]:
tdif_object = TfidfVectorizer(stop_words='english', max_features=1500, ngram_range=(1,2))

tdif_matrix = tdif_object.fit_transform(df['Combined_movie_storyline']) 
tdif_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 109987 stored elements and shape (9998, 1500)>

In [12]:
feature_names = tdif_object.get_feature_names_out()\

feature_names

array(['000', '10', '11', ..., 'younger', 'youth', 'zombie'],
      shape=(1500,), dtype=object)

In [13]:
tdif_df = pd.DataFrame(tdif_matrix.toarray(), columns=feature_names)

tdif_df.head()

Unnamed: 0,000,10,11,12,13,14,15,17,19,20,...,years later,york,young,young couple,young girl,young man,young woman,younger,youth,zombie
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.417012,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.315291,0.180818,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
def search_movie(query, k = 5):
    query = query.lower()
    query_vec = tdif_object.transform([query])
    similarity = cosine_similarity(query_vec, tdif_matrix).flatten()
    
    top_indeces = similarity.argsort()[-k:][::-1]
    result = df.iloc[top_indeces]

    return result[['Movie Title', 'Storyline']]

In [18]:
search_movie("ruthless quest for vengeance after his brother ", 10)

Unnamed: 0,Movie Title,Storyline
352,Marco,"The adoptive son of the Adattu family, Marco, ..."
7005,Uttarakaanda,A former gangster named Byrappa Nayaka disrupt...
7957,Fist of Vengeance,No storyline available
9672,Ruthless,No storyline available
1555,Yudhra,"A young man consumed by vengeance, Yudhra infi..."
135,Maharaja,A barber seeks vengeance after his home is bur...
8940,Wives on Strike: The Uprising,"An honor graduate, is kidnapped upon returning..."
53,Kraven the Hunter,Kraven's complex relationship with his ruthles...
2720,Guardian Angel,Police Officer Devan was a very truthful man b...
2646,12 Gaun,A son's quest for vengeance against a tyrannic...
