In [3]:
!pip install implicit

Collecting implicit
  Using cached implicit-0.6.2-cp39-cp39-win_amd64.whl (647 kB)
Installing collected packages: implicit
Successfully installed implicit-0.6.2


In [None]:
import os

# Restrict Intel MKL to a single thread through its environment variable.
# NOTE(review): presumably this avoids BLAS threads competing with implicit's own
# parallelism, and it likely only takes effect if set before numpy is imported — confirm.
os.environ.update({'MKL_NUM_THREADS': '1'})

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from implicit.als import AlternatingLeastSquares

# Load the train and test ratings. With keep_default_na=False and na_values='',
# only empty fields are parsed as missing values.
train_df = pd.read_csv('checkpoint_train.csv', dtype={'userId': int, 'movieId': int, 'rating': float}, na_values='', keep_default_na=False)
test_df = pd.read_csv('checkpoint_test.csv', dtype={'userId': int, 'movieId': int, 'rating': float}, na_values='', keep_default_na=False)

# Dense user x movie table of train ratings; pairs without a rating become 0.
# pivot raises ValueError if a (userId, movieId) pair appears more than once.
user_item_matrix = train_df.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Original IDs seen in the train set
train_user_ids = set(train_df['userId'].unique())
train_movie_ids = set(train_df['movieId'].unique())

# Original IDs seen in the test set
test_user_ids = set(test_df['userId'].unique())
test_movie_ids = set(test_df['movieId'].unique())

# Only users and movies present in BOTH sets are modelled. Sorting fixes the
# row/column order, so the ID -> position mapping does not depend on set
# iteration order.
user_ids = sorted(train_user_ids & test_user_ids)
movie_ids = sorted(train_movie_ids & test_movie_ids)

# Original ID -> 0-based row position in the user / item factor matrices
user_id_map = {original_id: position for position, original_id in enumerate(user_ids)}
movie_id_map = {original_id: position for position, original_id in enumerate(movie_ids)}

# train_df and test_df deliberately keep their ORIGINAL IDs. Replacing the ID
# columns with matrix positions makes the prediction step apply the mapping a
# second time and makes the 'userId_movieId' keys built later in the notebook
# position-based; every position lookup goes through the two maps instead.

# Sparse ratings matrix restricted to the shared users (rows) and movies
# (columns), in the same order as user_ids / movie_ids.
user_item_sparse = sp.csr_matrix(user_item_matrix.loc[user_ids, movie_ids].to_numpy())

# Number of latent factors
K = 2
# Seed for the random initialisation of the ALS factors
SEED = 42

# ALS algorithm using the implicit library
model = AlternatingLeastSquares(factors=K, regularization=0.1, iterations=100, random_state=SEED)
model.fit(user_item_sparse)

# Learned factor matrices: one row per entry of user_ids / movie_ids.
# NOTE(review): the indexing below assumes these are numpy arrays (CPU model) — confirm.
user_matrix = model.user_factors
product_matrix = model.item_factors

# Predict ratings for the test dataframe. Rows whose user or movie is not part
# of the trained model keep NaN.
test_user_pos = test_df['userId'].map(user_id_map)
test_movie_pos = test_df['movieId'].map(movie_id_map)
can_predict = test_user_pos.notna() & test_movie_pos.notna()
user_rows = test_user_pos[can_predict].astype(int).to_numpy()
movie_rows = test_movie_pos[can_predict].astype(int).to_numpy()

test_df['rating'] = np.nan
# Row-wise dot product of each (user, movie) factor pair
test_df.loc[can_predict, 'rating'] = (user_matrix[user_rows] * product_matrix[movie_rows]).sum(axis=1)




  0%|          | 0/100 [00:00<?, ?it/s]

In [2]:
# Columns kept in the final frame: the combined key and the predicted rating
cols = ['userId_movieId', 'rating']

# Convert both ID columns to strings so they can be joined into one key
for id_column in ('movieId', 'userId'):
    test_df[id_column] = test_df[id_column].astype(str)

# Key format: "<userId>_<movieId>"
test_df['userId_movieId'] = test_df['userId'].str.cat(test_df['movieId'], sep='_')

# Rebind test_df to just the two output columns
test_df = test_df.loc[:, cols]

In [3]:
# Display (rows, columns) of test_df as a quick size check
test_df.shape

(30002, 2)

In [None]:
# Predict a rating for every (user, movie) combination from the test set whose
# user AND movie are both part of the trained model (i.e. have an entry in
# user_id_map / movie_id_map). Iteration order of the test-set ID sets is kept,
# so rows come out user-major, exactly as a nested user/movie loop would.
eligible_users = [user_id for user_id in test_user_ids if user_id in user_id_map]
eligible_movies = [movie_id for movie_id in test_movie_ids if movie_id in movie_id_map]

# Row positions of those IDs in the factor matrices
user_rows = np.array([user_id_map[user_id] for user_id in eligible_users], dtype=np.intp)
movie_rows = np.array([movie_id_map[movie_id] for movie_id in eligible_movies], dtype=np.intp)

# One matrix product gives the full (n_users, n_movies) grid of dot products;
# this replaces a per-pair np.dot plus a pd.concat per row, which copies the
# growing frame on every iteration.
score_grid = user_matrix[user_rows] @ product_matrix[movie_rows].T

# Flatten the grid row by row: each user is repeated once per movie, and the
# movie list is cycled once per user, matching score_grid.ravel().
pred_df = pd.DataFrame({
    'userId': np.repeat(eligible_users, len(eligible_movies)),
    'movieId': np.tile(eligible_movies, len(eligible_users)),
    'rating': score_grid.ravel(),
})


In [None]:
# Write the predictions to CSV; index=False leaves the DataFrame index out of the file
pred_df.to_csv('submission_ALS_first_attempt.csv', index = False)