In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Location of the IMDb 2024 movies CSV. Set the IMDB_MOVIES_CSV environment
# variable to run the notebook on another machine; the original absolute path
# is kept as the fallback so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "IMDB_MOVIES_CSV",
    "/home/shigilsasi/code/Guvi_Projects/IMDB_Movie_Recommendation_System_Using_Storylines/imdb_2024_movies.csv",
)
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Movie Title,Storyline
0,We Bury the Dead,"After a catastrophic military disaster, the de..."
1,The Life of Chuck,"A life-affirming, genre-bending story about th..."
2,The Substance,A fading celebrity takes a black-market drug: ...
3,Dune: Part Two,Paul Atreides unites with the Fremen while on ...
4,Eden,Based on a factual account of a group of outsi...


In [3]:
# (rows, columns) of the loaded frame - a quick check on how much data was read.
df.shape

(10000, 2)

In [4]:
# Missing values per column; the text columns must be complete before they are
# concatenated and vectorised further down.
df.isna().sum()

Movie Title    0
Storyline      0
dtype: int64

In [5]:
# Inspect the dtype pandas inferred for each column before any text processing.
df.dtypes

Movie Title    object
Storyline      object
dtype: object

In [6]:
# Number of rows that exactly repeat an earlier row (all columns compared).
df.duplicated().sum()

np.int64(0)

In [7]:
# Remove exact duplicate rows by rebinding `df` rather than mutating it in
# place, and renumber the index 0..n-1 so index labels keep matching row
# positions (the TF-IDF matrix built later is addressed by position).
df = df.drop_duplicates(ignore_index=True)
# Confirm that no duplicates remain.
df.duplicated().sum()

np.int64(0)

In [8]:
# Build the text that gets vectorised: the title followed by the storyline,
# separated by a single space, so words from the title also count as features.
df['Combined_movie_storyline'] = df['Movie Title'].str.cat(df['Storyline'], sep=" ")

In [9]:
df['Combined_movie_storyline']

0       We Bury the Dead After a catastrophic military...
1       The Life of Chuck A life-affirming, genre-bend...
2       The Substance A fading celebrity takes a black...
3       Dune: Part Two Paul Atreides unites with the F...
4       Eden Based on a factual account of a group of ...
                              ...                        
9995    Narudi Brathuku Natana Sathya, an aspiring but...
9996    Karma Wallet A drama-thriller exploring the un...
9997    Hate Songs Two actors and a technician are gat...
9998    Beyond, Ode to the Earth There isn't really a ...
9999    Estamos no Ar A son, his shy mother and unhapp...
Name: Combined_movie_storyline, Length: 10000, dtype: object

In [10]:
# TF-IDF over the combined title + storyline text:
#   - English stop words are removed,
#   - unigrams and bigrams are both used as features,
#   - the vocabulary is capped at the 1500 most frequent terms.
tdif_object = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=1500,
    stop_words='english',
)

# Learn the vocabulary and produce one sparse row per movie in a single pass.
tdif_matrix = tdif_object.fit_transform(df['Combined_movie_storyline'])
tdif_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 111380 stored elements and shape (10000, 1500)>

In [11]:
# Vocabulary learned by the vectoriser, in the same order as the columns of
# tdif_matrix. (A stray trailing line-continuation backslash was removed: it
# only parsed because the following line happened to be blank.)
feature_names = tdif_object.get_feature_names_out()

feature_names

array(['000', '10', '11', ..., 'younger', 'youth', 'zombie'],
      shape=(1500,), dtype=object)

In [12]:
# Dense, labelled view of the TF-IDF weights for inspection: one row per movie,
# one column per vocabulary term. Note that .toarray() materialises the whole
# sparse matrix in memory.
tdif_df = pd.DataFrame(
    data=tdif_matrix.toarray(),
    columns=feature_names,
)

tdif_df.head(5)

Unnamed: 0,000,10,11,12,13,14,15,17,19,20,...,years later,york,young,young couple,young girl,young man,young woman,younger,youth,zombie
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.385887,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
def search_movie(query, k = 5):
    """Return the movies whose title + storyline best match a free-text query.

    The query is projected into the TF-IDF space of the already fitted
    ``tdif_object`` and compared, by cosine similarity, with every row of
    ``tdif_matrix``. Row ``i`` of ``tdif_matrix`` is looked up as
    ``df.iloc[i]``, so ``df`` must be the frame the matrix was fitted on.

    Parameters
    ----------
    query : str
        Free-text description of the kind of movie being looked for.
    k : int, default 5
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        The 'Movie Title' and 'Storyline' columns of the matching rows, best
        match first. Contains at most ``k`` rows; it is empty when ``k <= 0``
        or when the query shares no vocabulary term with any movie.
    """
    columns = ['Movie Title', 'Storyline']

    if k <= 0:
        # Slicing with [-0:] would select *every* row, so asking for zero
        # results used to return the whole dataset. Return nothing instead.
        return df.iloc[:0][columns]

    # Lower-cased explicitly so the query is normalised the same way even if
    # the vectoriser is later configured with lowercase=False.
    query_vec = tdif_object.transform([query.lower()])
    similarity = cosine_similarity(query_vec, tdif_matrix).ravel()

    # Stable sort on the negated scores: highest similarity first, with ties
    # resolved by original row order so repeated calls give the same ranking.
    top_indices = np.argsort(-similarity, kind="stable")[:k]

    # A score of 0 means no shared term at all; such rows are not matches and
    # would otherwise pad the result with arbitrary movies.
    top_indices = top_indices[similarity[top_indices] > 0]

    return df.iloc[top_indices][columns]

In [14]:
# Example: the ten storylines closest to a revenge-themed description.
search_movie("ruthless quest for vengeance after his brother ", k=10)

Unnamed: 0,Movie Title,Storyline
508,Marco,"The adoptive son of the Adattu family, Marco, ..."
9217,Dheera Samrat,A family that connects through the habitual gr...
7690,Uttarakaanda,A former gangster named Byrappa Nayaka disrupt...
9069,Fist of Vengeance,No storyline available
2621,12 Gaun,A son's quest for vengeance against a tyrannic...
1484,Yudhra,"A young man consumed by vengeance, Yudhra infi..."
115,Maharaja,A barber seeks vengeance after his home is bur...
29,Kraven the Hunter,Kraven's complex relationship with his ruthles...
2547,Guardian Angel,Police Officer Devan was a very truthful man b...
1206,Kill Em All 2,Phillip comes face to face with a Russian-Fren...
