In [11]:
# Importing Libraries

import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
import warnings

In [13]:
# Load the dataset
movies = pd.read_csv('movies.csv')

# Display dataset info
print(f'Number of movies in dataset: {movies.shape[0]}')
print("Missing values before handling:\n", movies.isnull().sum())

# Fill missing numerical values with mean or median
movies['score'] = movies['score'].fillna(movies['score'].mean())
movies['votes'] = movies['votes'].fillna(movies['votes'].mean())
movies['runtime'] = movies['runtime'].fillna(movies['runtime'].mean())
movies['budget'] = movies['budget'].fillna(movies['budget'].median())
movies['gross'] = movies['gross'].fillna(movies['gross'].median())

Number of movies in dataset: 7668
Missing values before handling:
 name           0
rating        77
genre          0
year           0
released       2
score          3
votes          3
director       0
writer         3
star           1
country        3
budget      2171
gross        189
company       17
runtime        4
dtype: int64


In [15]:
# Fill missing categorical values with 'Unknown'
for col in ['rating', 'released', 'writer', 'star', 'country', 'company']:
    movies[col] = movies[col].fillna('Unknown')


# Verify missing values are handled
print("Missing values after handling:\n", movies.isnull().sum())

Missing values after handling:
 name        0
rating      0
genre       0
year        0
released    0
score       0
votes       0
director    0
writer      0
star        0
country     0
budget      0
gross       0
company     0
runtime     0
dtype: int64


In [17]:
# Create a combined text feature for content-based filtering
movies['direct_gen'] = movies[['director', 'genre', 'star']].fillna('').astype(str).apply(lambda x: ' '.join(x), axis=1)

# Creating an index mapping for movie titles
titles = movies['name']
indices = pd.Series(movies.index, index=movies['name']).drop_duplicates()

In [19]:
# Computing content-based similarity using TF-IDF
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0.0, stop_words='english')
tf_authTags_matrix = tf.fit_transform(movies['direct_gen'])
cosine_sim_authTags = cosine_similarity(tf_authTags_matrix, tf_authTags_matrix)

In [21]:
# Computing collaborative filtering similarity (user ratings)
scaler = MinMaxScaler()
movies[['score', 'votes']] = scaler.fit_transform(movies[['score', 'votes']])
rating_sim = cosine_similarity(movies[['score', 'votes']], movies[['score', 'votes']])

print(f"Content-based Similarity Matrix Shape: {cosine_sim_authTags.shape}")
print(f"Collaborative Similarity Matrix Shape: {rating_sim.shape}")

Content-based Similarity Matrix Shape: (7668, 7668)
Collaborative Similarity Matrix Shape: (7668, 7668)


In [23]:
# Hybrid Recommendation System
def hybrid_recommendation(movie_title, N=10, alpha=0.7):
    if movie_title not in indices:
        return f"Movie '{movie_title}' not found in dataset."

    # Getting the index of the movie
    b_idx = indices[movie_title]
    
    # Extracting content-based similarity
    content_sim_scores = list(enumerate(cosine_sim_authTags[b_idx]))
    content_sim_scores = sorted(content_sim_scores, key=lambda x: x[1], reverse=True)
    
    # Extracting collaborative filtering similarity
    rating_sim_scores = list(enumerate(rating_sim[b_idx]))
    rating_sim_scores = sorted(rating_sim_scores, key=lambda x: x[1], reverse=True)
    
    # Ensuring N recommendations exist
    content_sim_scores = content_sim_scores[1:min(N+1, len(content_sim_scores))]
    rating_sim_scores = rating_sim_scores[1:min(N+1, len(rating_sim_scores))]

    # Hybrid similarity scores
    hybrid_sim_scores = []
    for i in range(min(len(content_sim_scores), len(rating_sim_scores))):
        combined_score = (alpha * content_sim_scores[i][1]) + ((1 - alpha) * rating_sim_scores[i][1])
        hybrid_sim_scores.append((content_sim_scores[i][0], combined_score))
    
    # Sorting by combined score
    hybrid_sim_scores = sorted(hybrid_sim_scores, key=lambda x: x[1], reverse=True)
    
    # Fetching movie recommendations
    movie_indices = [i[0] for i in hybrid_sim_scores]
    return movies.iloc[movie_indices][['name', 'score', 'votes', 'genre']]
    

In [35]:
    # Testing
    print(hybrid_recommendation("The Matrix", N=5, alpha=0.7))

                        name     score     votes   genre
4256     The Matrix Reloaded  0.716216  0.220414  Action
4268  The Matrix Revolutions  0.648649  0.191248  Action
6686       Jupiter Ascending  0.459459  0.075831  Action
5310             Speed Racer  0.554054  0.029164  Action
2903          Chain Reaction  0.513514  0.020414  Action


In [29]:
def avg_cosine_similarity(movie_title, N=5):
    idx = indices.get(movie_title)
    if idx is None:
        return None
    
    recommended = hybrid_recommendation(movie_title, N)
    recommended_indices = recommended.index
    similarities = cosine_sim_authTags[idx, recommended_indices]
    return similarities.mean()

print("Avg Cosine Similarity:", avg_cosine_similarity("The Matrix", N=5))


Avg Cosine Similarity: 0.7137572594643882


In [37]:
def avg_popularity(movie_title, N=10):
    recs = hybrid_recommendation(movie_title, N)
    return recs[['score', 'votes']].mean()

print(avg_popularity("The Matrix", N=5))


score    0.578378
votes    0.107414
dtype: float64
