In [13]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from gensim.models import Word2Vec



In [20]:
# Load Dataset
# The CSV path is relative, so it is resolved against the kernel's working directory.
file_path = "movies.csv"
df = pd.read_csv(file_path)
# Print column names, non-null counts and dtypes to spot missing values early.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   object 
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 125.1+ KB


In [21]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,IMDB_Rating,Meta_score,No_of_Votes
count,1000.0,843.0,1000.0
mean,7.9493,77.97153,273692.9
std,0.275491,12.376099,327372.7
min,7.6,28.0,25088.0
25%,7.7,70.0,55526.25
50%,7.9,79.0,138548.5
75%,8.1,87.0,374161.2
max,9.3,100.0,2343110.0


In [22]:
# Preview the first rows to check how each column is formatted.
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [26]:
# Element-wise missing-value mask: True wherever a cell is null.
# NOTE(review): this renders the full boolean frame; df.isnull().sum() would
# give a per-column count that is easier to read — consider switching.
df.isnull()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
996,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
997,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
998,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True


In [27]:
# Justification for column selection
# Released_Year: helps filter movies by period.
# Certificate: can be used to filter based on audience suitability.
# Meta_score & IMDB_Rating: important for ranking and quality assessment.
# Genre: essential for filtering by user preference.
# Actors: allows recommendations based on favorite actors.

# Data cleaning & preprocessing
# Coerce non-numeric years to NaN, then drop those rows. The explicit .copy()
# makes df an independent frame, so the column assignments below are never
# treated as writes into a slice of the frame loaded from CSV.
df['Released_Year'] = pd.to_numeric(df['Released_Year'], errors='coerce')
df = df.dropna(subset=['Released_Year']).copy()
df['Released_Year'] = df['Released_Year'].astype(int)

# Fill missing values by assigning the result back to the column.
# The chained form `df[col].fillna(..., inplace=True)` operates on a temporary
# Series; pandas deprecates it and it has no effect under copy-on-write.
df['Certificate'] = df['Certificate'].fillna("Unknown")
df['Meta_score'] = df['Meta_score'].fillna(df['Meta_score'].median())
df['Gross'] = df['Gross'].fillna("0")  # Gross stays a string column here

# Split the comma-separated genre string into a list. Values that are already
# lists are kept as-is so that re-running this cell does not wipe every genre.
df['Genre'] = df['Genre'].apply(
    lambda x: x if isinstance(x, list) else (x.split(', ') if isinstance(x, str) else [])
)
# Merge the four star columns into one list-valued column.
df['Actors'] = df[['Star1', 'Star2', 'Star3', 'Star4']].values.tolist()
df['IMDB_Rating'] = df['IMDB_Rating'].astype(float)  # ensure float for score arithmetic

# Text preprocessing
# NOTE(review): recent NLTK releases may require the 'punkt_tab' resource for
# word_tokenize instead of 'punkt' — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shreyasree/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shreyasree/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [28]:
def clean_text(text):
    """Lower-case ``text`` and blank out every non-word character.

    Each character that is not alphanumeric or an underscore is replaced by a
    single space (runs are not collapsed), and surrounding whitespace is
    stripped from the result.
    """
    lowered = text.lower()
    spaced = re.sub(r'\W', ' ', lowered)
    return spaced.strip()

def tokenize_text(text):
    """Tokenize ``text`` and return it with stop words removed.

    Tokens come from NLTK's ``word_tokenize``; any token found in the
    module-level ``stop_words`` set is dropped, and the remaining tokens are
    re-joined with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Normalise each overview, then strip stop words; both stages are kept as
# columns because later cells read them separately.
df['cleaned_description'] = df['Overview'].apply(clean_text)
df['tokenized_description'] = df['cleaned_description'].apply(tokenize_text)

# TF-IDF Vectorization
# The fitted vectorizer is reused at query time to project user queries into
# the same term space as tfidf_matrix.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['tokenized_description'])

# Word2Vec Model
# Materialise the corpus as a plain list of token lists so it can be iterated
# repeatedly without depending on pandas indexing.
tokenized_sentences = df['tokenized_description'].apply(lambda x: x.split()).tolist()
# workers=1 plus an explicit seed keeps training repeatable across runs;
# with several worker threads the update order (and therefore the vectors and
# the resulting recommendations) changes from run to run.
# NOTE(review): gensim documents that reproducibility across interpreter
# launches additionally needs PYTHONHASHSEED to be fixed — confirm if required.
word2vec_model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=2, workers=1, seed=42)

def get_sentence_vector(sentence, model, vector_size=100):
    """Average the word vectors of the in-vocabulary words of ``sentence``.

    Args:
        sentence: Whitespace-separated text.
        model: Object exposing ``wv``, a mapping that supports ``word in wv``
            and ``wv[word]``.
        vector_size: Length of the zero vector returned when no word of the
            sentence is found in ``model.wv``.

    Returns:
        The element-wise mean of the matching word vectors, or
        ``np.zeros(vector_size)`` when there are none.
    """
    vocabulary = model.wv
    known_vectors = [vocabulary[token] for token in sentence.split() if token in vocabulary]
    if not known_vectors:
        return np.zeros(vector_size)
    return np.mean(known_vectors, axis=0)

# Embed every overview as the mean Word2Vec vector of its in-vocabulary words
# (get_sentence_vector returns an all-zero vector when none are known).
df['word2vec_embedding'] = df['cleaned_description'].apply(lambda x: get_sentence_vector(x, word2vec_model))
# Stack the per-row vectors into a single 2-D array, one row per movie in df order.
word2vec_matrix = np.vstack(df['word2vec_embedding'].values)


In [34]:

def recommend_movies_with_ratings(user_query, df, tfidf_matrix, top_n=5, weight_similarity=0.75, weight_rating=0.25):
    """Rank movies by a blend of TF-IDF query similarity and IMDb rating.

    The query is cleaned and tokenized like the movie overviews, projected
    with the module-level ``vectorizer`` and compared to every row of
    ``tfidf_matrix`` by cosine similarity. Similarities are min-max scaled and
    combined with ``IMDB_Rating / 10`` using the two weights.

    Args:
        user_query: Free-text description of what the user wants to watch.
        df: Movie table; must contain ``Series_Title``, ``Genre``,
            ``IMDB_Rating`` and ``Overview``. It is not modified.
        tfidf_matrix: TF-IDF matrix whose rows line up positionally with the
            rows of ``df``.
        top_n: Number of rows to return.
        weight_similarity: Weight of the scaled text similarity.
        weight_rating: Weight of the scaled IMDb rating.

    Returns:
        The ``top_n`` highest-scoring rows with columns ``Series_Title``,
        ``Genre``, ``IMDB_Rating``, ``Overview`` and ``Final_Score``.
    """
    print(f"Finding movies for: {user_query}")
    user_query = tokenize_text(clean_text(user_query))
    query_vector = vectorizer.transform([user_query])
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    normalized_similarity = MinMaxScaler().fit_transform(cosine_similarities.reshape(-1, 1)).flatten()
    normalized_rating = df["IMDB_Rating"] / 10
    # Score on a copy: writing the helper columns into the caller's frame made
    # every call leave stale Normalized_IMDB / Final_Score columns behind.
    scored = df.assign(
        Normalized_IMDB=normalized_rating,
        Final_Score=(weight_similarity * normalized_similarity) + (weight_rating * normalized_rating),
    )
    ranked = scored.sort_values(by=["Final_Score"], ascending=False)
    return ranked.head(top_n)[['Series_Title', 'Genre', 'IMDB_Rating', 'Overview', 'Final_Score']]

def recommend_movies_word2vec(user_query, df, word2vec_matrix, top_n=5):
    """Return the movies whose Word2Vec overview embedding is closest to the query.

    The query is cleaned with ``clean_text`` and embedded with
    ``get_sentence_vector`` using the module-level ``word2vec_model``; movies
    are ranked by cosine similarity against ``word2vec_matrix``.

    Args:
        user_query: Free-text description of what the user wants to watch.
        df: Movie table; must contain ``Series_Title``, ``Genre``,
            ``IMDB_Rating`` and ``Overview``.
        word2vec_matrix: 2-D array of overview embeddings whose rows line up
            positionally with the rows of ``df``.
        top_n: Maximum number of rows to return; values <= 0 yield no rows.

    Returns:
        Up to ``top_n`` rows, most similar first, with columns
        ``Series_Title``, ``Genre``, ``IMDB_Rating`` and ``Overview``. An
        empty frame with those columns is returned when the query has no
        usable embedding.
    """
    print(f" Finding movies for: {user_query}")
    result_columns = ['Series_Title', 'Genre', 'IMDB_Rating', 'Overview']
    user_query_vector = get_sentence_vector(clean_text(user_query), word2vec_model)
    # An all-zero vector means no query word was in the vocabulary. Every
    # similarity would then be 0 and the "top" rows would be arbitrary, so
    # report no matches instead.
    if top_n <= 0 or not np.any(user_query_vector):
        return df.iloc[0:0][result_columns]
    cosine_similarities = cosine_similarity([user_query_vector], word2vec_matrix).flatten()
    normalized_similarity = MinMaxScaler().fit_transform(cosine_similarities.reshape(-1, 1)).flatten()
    # Descending order first, then cut: same ranking as slicing the tail and
    # reversing it, without the `[-0:]` pitfall that returns every row.
    top_indices = normalized_similarity.argsort()[::-1][:top_n]
    return df.iloc[top_indices][result_columns]



In [37]:
# Test Cases
# Free-text queries passed to both recommenders in the loop further down.
test_queries = [
    "A mind-blowing sci-fi film with space travel",
    "A Christopher Nolan thriller with an intense plot",
    "Classic crime drama featuring Al Pacino",
    "Romantic comedy from the early 2000s like mean girls",
    "A horror movie with ghosts and unexpected twists and funny",
    "An inspiring war movie based on true events",
    "A movie starring Tom Hanks from the 90s",
]

# Function to display recommendations in a structured format
def display_recommendations(title, recommendations, max_results=3, max_overview_chars=250):
    """Print a titled, human-readable list of recommended movies.

    Args:
        title: Heading printed above the results.
        recommendations: DataFrame with ``Series_Title``, ``IMDB_Rating``,
            ``Genre`` and ``Overview`` columns.
        max_results: Maximum number of rows to print.
        max_overview_chars: Overviews longer than this are cut to this length
            and suffixed with "...".
    """
    print(f"\n{'='*80}\n{title}\n{'='*80}")

    if recommendations.empty:
        print("No relevant recommendations found.\n")
        return

    for _, row in recommendations.head(max_results).iterrows():
        print(f"Title: {row['Series_Title']} (IMDb: {row['IMDB_Rating']})")
        genre = row['Genre']
        # A plain string must not go through join(), which would put a comma
        # between every character.
        genre_text = genre if isinstance(genre, str) else ', '.join(genre)
        print(f"Genre: {genre_text}")
        overview = row['Overview']
        # Only mark the text as truncated when something was actually cut.
        if len(overview) > max_overview_chars:
            overview = overview[:max_overview_chars] + "..."
        print(f"Overview: {overview}")
        print("-" * 80)

# Run every test query through both recommenders and print the results
for query in test_queries:
    print(f"\n{'#'*100}\nUser Query: {query}\n{'#'*100}")

    # Compute both result sets before printing either of them.
    recommendations_tfidf = recommend_movies_with_ratings(query, df, tfidf_matrix)
    recommendations_word2vec = recommend_movies_word2vec(query, df, word2vec_matrix)

    # Display results, TF-IDF first
    for heading, results in (
        ("Top Matches (TF-IDF with IMDb Weighting)", recommendations_tfidf),
        ("Top Matches (Word2Vec Similarity)", recommendations_word2vec),
    ):
        display_recommendations(heading, results)



####################################################################################################
User Query: A mind-blowing sci-fi film with space travel
####################################################################################################
Finding movies for: A mind-blowing sci-fi film with space travel
 Finding movies for: A mind-blowing sci-fi film with space travel

Top Matches (TF-IDF with IMDb Weighting)
Title: Interstellar (IMDb: 8.6)
Genre: Adventure, Drama, Sci-Fi
Overview: A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival....
--------------------------------------------------------------------------------
Title: Gattaca (IMDb: 7.8)
Genre: Drama, Sci-Fi, Thriller
Overview: A genetically inferior man assumes the identity of a superior one in order to pursue his lifelong dream of space travel....
--------------------------------------------------------------------------------
Title: Aliens (IMDb: 8.3)
Genre: Act