In [1]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163496 sha256=2e9037214b9fb8c6fd224f415726e06fd4783bee8260f1a81362fce91055cdee
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [2]:
!wget https://files.grouplens.org/datasets/movielens/ml-100k.zip

--2023-12-01 13:18:10--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2023-12-01 13:18:11 (26.8 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]



In [3]:
import zipfile
import os
import pandas as pd
from surprise import Dataset, Reader
from surprise import SVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from surprise import accuracy
from surprise.model_selection import train_test_split, GridSearchCV

In [4]:
# Extract the MovieLens 100K dataset
with zipfile.ZipFile('ml-100k.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')

##1. Data Loading and Splitting:
The MovieLens 100K dataset is loaded from the 'ml-100k.zip' file.
The data is structured with user, item, rating, and timestamp information.
The dataset is split into training and testing sets using a 75-25 split ratio.

In [25]:

# Load the MovieLens 100K dataset
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file(os.path.join('ml-100k', 'u.data'), reader=reader)

# Split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)


In [26]:
from surprise import dump
import os

# Define the folder path
folder_path = 'benchmark'

# Check if the folder exists
if not os.path.exists(folder_path):
    # Create the folder if it doesn't exist
    os.makedirs(folder_path)

# Save the test set to a file
testset_path = 'benchmark/testset.pkl'
with open(testset_path, 'wb') as file:
    pickle.dump(testset, file)


##2. Collaborative Filtering Model (SVD):
Singular Value Decomposition (SVD) is chosen as the collaborative filtering model.
SVD decomposes the user-item interaction matrix into user and item latent factor matrices.

In [9]:
# Set up and train the collaborative filtering model (SVD)
model = SVD()

##3. Hyperparameter Tuning with Grid Search:
A grid search is performed to find the optimal hyperparameters for the SVD model.
The grid includes the number of epochs, learning rate, and regularization terms.

In [10]:
# Define parameter grid for grid search
param_grid = {'n_epochs': [5, 10, 15], 'lr_all': [0.002, 0.005, 0.01],
              'reg_all': [0.02, 0.1, 0.2]}

# Perform grid search to find the best parameters
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5)
grid_search.fit(data)

# Get the best model
best_model = grid_search.best_estimator['rmse']

##4. Model Training and Testing:
The best model obtained from the grid search is trained on the entire dataset.

In [11]:
# Train the best model on the entire dataset
best_model.fit(data.build_full_trainset())

# Generate predictions for the test set
predictions = best_model.test(testset)


##5. User and Item Embeddings:
The trained model provides user and item embeddings.

In the context of collaborative filtering and matrix factorization techniques like Singular Value Decomposition (SVD), user and item embeddings are representations of users and items in a lower-dimensional latent space. These embeddings capture latent factors that characterize the preferences of users and the properties of items.  During the training of collaborative filtering models like SVD, these embeddings are learned based on the historical interactions between users and items.

In [12]:
# Use the trained model to get user and item embeddings
user_embeddings = best_model.pu
item_embeddings = best_model.qi

##6. User-Item Similarity Calculation:
Cosine similarity is calculated between user and item embeddings to measure similarity.

In [13]:
# Calculate user-item similarity based on cosine similarity
user_item_similarity = cosine_similarity(user_embeddings, item_embeddings)


##7. Recommendation Generation:
Recommendations are generated for users 1 to 9 based on their historical interactions.
For each user, the code identifies movies they have seen and generates predictions for movies they haven't seen.
The top 5 recommendations with the highest predicted ratings are selected for each user.

In [14]:
# Recommendation generation
recommendations = []
for uid in range(1, 10):  # Generate recommendations for users 1 to 9
    # Get the items that the user has already interacted with
    seen_iids = [iid for iid, _ in trainset.ur[trainset.to_inner_uid(str(uid))]]
    print(f"\nUser {uid} has seen {len(seen_iids)} movies")

    # Predict ratings for all items the user hasn't seen yet
    user_predictions = [best_model.predict(str(uid), trainset.to_raw_iid(iid), verbose=False) for iid in trainset.all_items() if iid not in seen_iids]
    print(f"User {uid} has {len(user_predictions)} predictions")

    # Sort these predictions by estimated rating
    user_predictions.sort(key=lambda x: x.est, reverse=True)

    # Get top 5 recommendations for each user
    top_n_recommendations = [prediction.iid for prediction in user_predictions[:5]]

    recommendations.append(top_n_recommendations)


User 1 has seen 194 movies
User 1 has 1450 predictions

User 2 has seen 47 movies
User 2 has 1597 predictions

User 3 has seen 37 movies
User 3 has 1607 predictions

User 4 has seen 14 movies
User 4 has 1630 predictions

User 5 has seen 127 movies
User 5 has 1517 predictions

User 6 has seen 167 movies
User 6 has 1477 predictions

User 7 has seen 306 movies
User 7 has 1338 predictions

User 8 has seen 39 movies
User 8 has 1605 predictions

User 9 has seen 18 movies
User 9 has 1626 predictions


##8. Print Recommendations:
Finally, the code prints the recommendations for each user along with movie details if available.

In [15]:
# Load movie details from u.item
movie_details = {}
with open(os.path.join('ml-100k', 'u.item'), encoding='ISO-8859-1') as f:
    for line in f:
        parts = line.strip().split('|')
        movie_id = int(parts[0])
        movie_title = parts[1]
        movie_details[movie_id] = {'title': movie_title}

# Print movie details for the recommendations
for i, user_rec in enumerate(recommendations, start=1):
    print(f"\nTop Recommendations for User {i}:")
    if user_rec:  # Check if there are recommendations for the user
        for item_id in user_rec:
            movie_info = movie_details.get(int(item_id))
            if movie_info:
                print(f"Movie ID: {item_id}, Title: {movie_info['title']}")
    else:
        print("No recommendations available for this user")


Top Recommendations for User 1:
Movie ID: 408, Title: Close Shave, A (1995)
Movie ID: 1449, Title: Pather Panchali (1955)
Movie ID: 318, Title: Schindler's List (1993)
Movie ID: 483, Title: Casablanca (1942)
Movie ID: 603, Title: Rear Window (1954)

Top Recommendations for User 2:
Movie ID: 408, Title: Close Shave, A (1995)
Movie ID: 64, Title: Shawshank Redemption, The (1994)
Movie ID: 1449, Title: Pather Panchali (1955)
Movie ID: 169, Title: Wrong Trousers, The (1993)
Movie ID: 318, Title: Schindler's List (1993)

Top Recommendations for User 3:
Movie ID: 169, Title: Wrong Trousers, The (1993)
Movie ID: 320, Title: Paradise Lost: The Child Murders at Robin Hood Hills (1996)
Movie ID: 408, Title: Close Shave, A (1995)
Movie ID: 114, Title: Wallace & Gromit: The Best of Aardman Animation (1996)
Movie ID: 12, Title: Usual Suspects, The (1995)

Top Recommendations for User 4:
Movie ID: 408, Title: Close Shave, A (1995)
Movie ID: 483, Title: Casablanca (1942)
Movie ID: 318, Title: Schind

##9. Load Demographic Information:
Demographic information about users is loaded from the 'u.user' file, including age, gender, occupation, and zip code.
The age column is scaled using standard scaling.

In [16]:
# Load demographic information from u.user
user_columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv(os.path.join('/content', 'ml-100k', 'u.user'), sep='|', names=user_columns)
users_cp = users.copy()
# Drop non-numeric columns and scale numeric columns
numeric_columns = ['age']
scaler = StandardScaler()
users[numeric_columns] = scaler.fit_transform(users[numeric_columns])

##10. Merge Demographic Information with User Embeddings:
The user embeddings obtained from the collaborative filtering model are combined with the scaled demographic information.
This creates a new set of features that includes both collaborative filtering embeddings and demographic information.

In [17]:
# Merge demographic information with user embeddings
user_embeddings_df = pd.DataFrame(user_embeddings, columns=[f'feature_{i}' for i in range(user_embeddings.shape[1])])
user_info_embeddings = pd.concat([users[['user_id']], user_embeddings_df], axis=1)


##11. Calculate User-Item Similarity:
Cosine similarity is calculated between the combined user information embeddings and the item embeddings.
This yields a similarity matrix indicating how similar each user is to each item based on both collaborative filtering and demographic features.

In [18]:
# Calculate user-item similarity based on both collaborative filtering embeddings and demographic information
user_item_similarity_combined = cosine_similarity(user_info_embeddings.iloc[:, 1:], item_embeddings)


##12. Recommendation Generation:
For each user (from 1 to 9 in this case), the code sorts the combined user-item similarity scores and selects the top 5 items as recommendations then prints the demographic information for each user and the movie details for the recommended items


In [19]:
# Recommendation generation
recommendations = []
for uid in range(1, 10):  # Generate recommendations for users 1 to 9
    # Sort user-item similarity scores
    combined_similarity = user_item_similarity_combined[uid - 1, :]
    top_n_items = combined_similarity.argsort()[::-1][:5]

    recommendations.append(top_n_items)
# Load movie details from u.item
movie_details = {}
with open(os.path.join('/content', 'ml-100k', 'u.item'), encoding='ISO-8859-1') as f:
    for line in f:
        parts = line.strip().split('|')
        movie_id = int(parts[0])
        movie_title = parts[1]
        movie_details[movie_id] = {'title': movie_title}

# Print movie details and user demographic for the recommendations
for i, user_rec in enumerate(recommendations, start=1):
    print(f"\nRecommendations for User {i}:")
    print(f"User Demographic: {users_cp.loc[i-1, ['age', 'gender', 'occupation', 'zip_code']]}")
    for item_id in user_rec:
        movie_info = movie_details.get(item_id + 1)  # item_id in Surprise is 1-indexed
        if movie_info:
            print(f"Movie ID: {item_id + 1}, Title: {movie_info['title']}")



Recommendations for User 1:
User Demographic: age                   24
gender                 M
occupation    technician
zip_code           85711
Name: 0, dtype: object
Movie ID: 439, Title: Amityville: A New Generation (1993)
Movie ID: 1089, Title: Speed 2: Cruise Control (1997)
Movie ID: 38, Title: Net, The (1995)
Movie ID: 835, Title: Gay Divorcee, The (1934)
Movie ID: 553, Title: Walk in the Clouds, A (1995)

Recommendations for User 2:
User Demographic: age              53
gender            F
occupation    other
zip_code      94043
Name: 1, dtype: object
Movie ID: 896, Title: Sweet Hereafter, The (1997)
Movie ID: 52, Title: Madness of King George, The (1994)
Movie ID: 477, Title: Matilda (1996)
Movie ID: 310, Title: Rainmaker, The (1997)
Movie ID: 141, Title: 20,000 Leagues Under the Sea (1954)

Recommendations for User 3:
User Demographic: age               23
gender             M
occupation    writer
zip_code       32067
Name: 2, dtype: object
Movie ID: 48, Title: Hoop Dreams (

##Save the model

In [20]:
import pickle

# Save the collaborative filtering model and demographic information
model_with_demographics = {
    'model': best_model,
    'user_embeddings': user_embeddings,
    'item_embeddings': item_embeddings,
    'user_info_embeddings': user_info_embeddings,
}

# Define the folder path
folder_path = 'model'

# Check if the folder exists
if not os.path.exists(folder_path):
    # Create the folder if it doesn't exist
    os.makedirs(folder_path)

model_filename = 'model/best_model_with_demographics.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(model_with_demographics, file)
