In [1]:

"""Hybrid_Model.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/16PIzDuQCOPS8it9RfX-CL2vZVMfNfvQ6
"""

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

from google.colab import drive
drive.mount('/content/drive')
path = '/content/drive/MyDrive/data/MovieLens/'

ratings = pd.read_csv(path + 'ratings.dat', sep='::', engine='python',
                      names=['UserID', 'MovieID', 'Rating', 'Timestamp'])

movies = pd.read_csv(path + 'movies.dat', sep='::', engine='python',
                     names=['MovieID', 'Title', 'Genres'], encoding='latin-1')

users = pd.read_csv(path + 'users.dat', sep='::', engine='python',
                    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])

# Display sample data
ratings.head()


Mounted at /content/drive


Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [2]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [4]:
# Check for missing values in each dataframe
print("Missing values in ratings:")
print(ratings.isnull().sum())

print("\nMissing values in movies:")
print(movies.isnull().sum())

print("\nMissing values in users:")
print(users.isnull().sum())

# Genres column has multiple genres per movie separated by |
from sklearn.preprocessing import MultiLabelBinarizer

# Split genres
movies['Genres'] = movies['Genres'].apply(lambda x: x.split('|'))

# Binarize
mlb = MultiLabelBinarizer()
genre_encoded = pd.DataFrame(mlb.fit_transform(movies['Genres']),
                             columns=mlb.classes_,
                             index=movies.index)

# Concatenate back to movies
movies = pd.concat([movies.drop('Genres', axis=1), genre_encoded], axis=1)

movies.head()

Missing values in ratings:
UserID       0
MovieID      0
Rating       0
Timestamp    0
dtype: int64

Missing values in movies:
MovieID    0
Title      0
Genres     0
dtype: int64

Missing values in users:
UserID        0
Gender        0
Age           0
Occupation    0
Zip-code      0
dtype: int64


Unnamed: 0,MovieID,Title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# Identify numerical columns in ratings dataframe
numerical_cols_ratings = ratings.select_dtypes(include=['int64', 'float64']).columns

print("Numerical columns in ratings dataframe:", numerical_cols_ratings)

# Apply IQR method to each numerical column in ratings dataframe
for col in numerical_cols_ratings:
    Q1 = ratings[col].quantile(0.25)
    Q3 = ratings[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = ratings[(ratings[col] < lower_bound) | (ratings[col] > upper_bound)]

    print(f"\nOutliers in '{col}' column of ratings dataframe:")
    if outliers.empty:
        print("No outliers found.")
    else:
        print(outliers)


#Labelencoding

le_gender = LabelEncoder()
users['Gender'] = le_gender.fit_transform(users['Gender'])  # M=1, F=0

users.head()


Numerical columns in ratings dataframe: Index(['UserID', 'MovieID', 'Rating', 'Timestamp'], dtype='object')

Outliers in 'UserID' column of ratings dataframe:
No outliers found.

Outliers in 'MovieID' column of ratings dataframe:
No outliers found.

Outliers in 'Rating' column of ratings dataframe:
         UserID  MovieID  Rating  Timestamp
148           2       21       1  978299839
180           2     3893       1  978299535
205           3     1261       1  978297663
249           4     3527       1  978294008
268           5     2916       1  978245645
...         ...      ...     ...        ...
1000171    6040     3388       1  956716407
1000177    6040     2751       1  956716438
1000185    6040     2794       1  956716438
1000187    6040     2003       1  956716294
1000204    6040     1091       1  956716541

[56174 rows x 4 columns]

Outliers in 'Timestamp' column of ratings dataframe:
         UserID  MovieID  Rating   Timestamp
2327         19      318       4   994556598
24

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,0,1,10,48067
1,2,1,56,16,70072
2,3,1,25,15,55117
3,4,1,45,7,2460
4,5,1,25,20,55455


In [6]:
#Embedding on title column
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
title_embeddings = tfidf.fit_transform(movies['Title'])

"""# hybrid recommendation system that combines collaborative filtering (CF) and content-based filtering (CBF) using TF-IDF on metadata"""

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

# Combine genre columns into a string for each movie
genre_columns = movies.columns[2:]
movies['Genres'] = movies[genre_columns].apply(lambda row: ' '.join(genre for genre, val in row.items() if val == 1), axis=1)

#  Content-Based Filtering
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(movies['Genres'])
content_sim_matrix = cosine_similarity(tfidf_matrix)
content_sim_df = pd.DataFrame(content_sim_matrix, index=movies['MovieID'], columns=movies['MovieID'])

# Collaborative Filtering
user_item_matrix = ratings.pivot_table(index='UserID', columns='MovieID', values='Rating').fillna(0)
item_item_matrix = user_item_matrix.T
collab_sim_matrix = cosine_similarity(item_item_matrix)
collab_sim_df = pd.DataFrame(collab_sim_matrix, index=item_item_matrix.index, columns=item_item_matrix.index)

# Align indices
common_movie_ids = collab_sim_df.index # These are the movie IDs present in ratings
collab_sim_df = collab_sim_df.loc[common_movie_ids, common_movie_ids]
content_sim_df = content_sim_df.loc[common_movie_ids, common_movie_ids]

# Hybrid Similarity
alpha = 0.1
hybrid_sim_df = alpha * content_sim_df + (1 - alpha) * collab_sim_df

# Predict rating using hybrid similarity
def hybrid_predict_rating(user_id, movie_id, k=10):
    if movie_id not in user_item_matrix.columns or user_id not in user_item_matrix.index:
        return 3.0

    user_ratings = user_item_matrix.loc[user_id]
    sim_scores = hybrid_sim_df.get(movie_id, pd.Series(dtype=float))
    if sim_scores.empty:
        return 3.0

    rated_movies = user_ratings[user_ratings > 0].index
    sim_scores = sim_scores[rated_movies]
    top_k = sim_scores.sort_values(ascending=False).head(k)

    if top_k.empty:
        return 3.0

    numerator = (top_k * user_ratings[top_k.index]).sum()
    denominator = top_k.sum()
    return numerator / denominator if denominator != 0 else 3.0

# Evaluate (Optional)
sample = ratings.sample(1000, random_state=42)
preds = sample.apply(lambda x: hybrid_predict_rating(x['UserID'], x['MovieID']), axis=1)
rmse = sqrt(mean_squared_error(sample['Rating'], preds))
mae = mean_absolute_error(sample['Rating'], preds)

print("RMSE:", rmse)
print("MAE:", mae)



RMSE: 0.7470120313169964
MAE: 0.5645129382254673


In [7]:
from sklearn.model_selection import train_test_split
import numpy as np

# Train-Test Split
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)
train_user_item_matrix = train_data.pivot_table(index='UserID', columns='MovieID', values='Rating').fillna(0)

# Prediction Function Using Train Matrix
def hybrid_predict_rating_train(user_id, movie_id, k=10):
    if movie_id not in train_user_item_matrix.columns or user_id not in train_user_item_matrix.index:
        return 3.0
    user_ratings = train_user_item_matrix.loc[user_id]
    sim_scores = hybrid_sim_df.get(movie_id, pd.Series(dtype=float))
    if sim_scores.empty:
        return 3.0
    rated_movies = user_ratings[user_ratings > 0].index
    sim_scores = sim_scores[rated_movies]
    top_k = sim_scores.sort_values(ascending=False).head(k)
    if top_k.empty:
        return 3.0
    numerator = (top_k * user_ratings[top_k.index]).sum()
    denominator = top_k.sum()
    return numerator / denominator if denominator != 0 else 3.0

# Generate Predictions for Multiple Users
def get_predictions_for_precision_at_k(user_ids, k=10):
    results = []
    for user_id in user_ids:
        if user_id not in train_user_item_matrix.index:
            continue

        # Movies user has not rated in training
        rated_train_movies = set(train_user_item_matrix.loc[user_id][train_user_item_matrix.loc[user_id] > 0].index)
        candidate_movies = [m for m in movies['MovieID'] if m not in rated_train_movies]

        # Predict ratings
        predictions = [(movie_id, hybrid_predict_rating_train(user_id, movie_id)) for movie_id in candidate_movies]
        predictions_sorted = sorted(predictions, key=lambda x: x[1], reverse=True)
        top_k_predicted = [movie_id for movie_id, _ in predictions_sorted[:k]]

        # Get relevant items from test set (those rated ≥ 4)
        user_test_ratings = test_data[test_data['UserID'] == user_id]
        relevant_items = user_test_ratings[user_test_ratings['Rating'] >= 4]['MovieID'].tolist()

        results.append((top_k_predicted, relevant_items))

    return results

# Your Existing Function
def precision_at_k(predictions, k=10):
    total_precision = 0
    for predicted_items, ground_truth_items in predictions:
        top_k_predictions = predicted_items[:k]
        relevant_items_count = len(set(top_k_predictions) & set(ground_truth_items))
        precision = relevant_items_count / k if k > 0 else 0
        total_precision += precision
    return total_precision / len(predictions) if len(predictions) > 0 else 0

#  Evaluate
sample_users = np.random.choice(test_data['UserID'].unique(), size=100, replace=False)
prediction_data = get_predictions_for_precision_at_k(sample_users, k=10)
precision_score = precision_at_k(prediction_data, k=10)

print(f"Precision@10: {precision_score:.4f}")

for alpha in [0.1, 0.3, 0.5, 0.7, 0.9]:
    hybrid_sim_df = alpha * content_sim_df + (1 - alpha) * collab_sim_df
    preds = sample.apply(lambda x: hybrid_predict_rating(x['UserID'], x['MovieID']), axis=1)
    print(f"Alpha={alpha} | RMSE={sqrt(mean_squared_error(sample['Rating'], preds)):.4f} | MAE={mean_absolute_error(sample['Rating'], preds):.4f}")

Precision@10: 0.0440
Alpha=0.1 | RMSE=0.7470 | MAE=0.5645
Alpha=0.3 | RMSE=0.7739 | MAE=0.5896
Alpha=0.5 | RMSE=0.8060 | MAE=0.6162
Alpha=0.7 | RMSE=0.8345 | MAE=0.6450
Alpha=0.9 | RMSE=0.8593 | MAE=0.6685


In [9]:
import pickle

with open('/content/Hybrid_model.pkl', 'wb') as f:
    pickle.dump(hybrid_sim_df, f)

from google.colab import files
files.download('/content/Hybrid_model.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>