In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of '<directory>:<URL-encoded download URL>' entries; the download
# loop below splits each entry, URL-decodes the link and extracts the archive into
# KAGGLE_INPUT_PATH/<directory>.
# NOTE(review): the embedded link is a signed URL carrying X-Goog-Date=20240621T225431Z and
# X-Goog-Expires=259200, so it presumably stops working after that window; the download
# loop reports such failures as "likely expired".
DATA_SOURCE_MAPPING = 'alx-movie-recommendation-project-2024:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F81285%2F8778365%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240621%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240621T225431Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D3a20b6a60d79249249102eb9857593f5af5a06e3121c8f02e372f6c76c869cb9d3afd0745ebb7597914ab500ab82ae8545709850945bd5042caa0737c0fe68c3a02315a899e14e44f33bb559f4d7ac7e2e9e75b84a11e9ea3e461530b43838e201e68e58336b48979facf5c9c004cd73c2639dee0ebbedbcb195ebfd8a3d95276e4b3046cec206d382e742e087f56b0d19badbfd2484747bcb50ca1e13dd03a55d1eee6bbd9d80c876f784b6a98eb80d24706266de6b3a61dbb6ed410756e77f05d14325b334846a72ef9aa891f40160a30190d164f8339cc8e132f3a36df48244119de42cbff7e4e7d4e38c160ac7578b863b78c7e9c899182d21c249adfb8b'

# Directory that receives the extracted data sources (one sub-directory per mapping entry).
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created next to the input directory and symlinked as ../working.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible notebook code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it under KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    if not data_source_mapping:
        # An empty mapping (or a stray comma) has nothing to download.
        continue
    # Split on the first ':' only; everything after it is the URL-encoded link.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    # Only the URL path is inspected to decide between zip and tar extraction.
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional, so the header lookup can return None.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            size_label = content_length if content_length else 'unknown'
            print(f'Downloading {directory}, {size_label} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_length > 0:
                    # 50-character progress bar, clamped in case more bytes arrive than announced.
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    # Without a known total only the running byte count can be shown.
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the file by name, so buffered bytes must reach disk first.
            tfile.flush()
            # NOTE(review): extractall() trusts the member paths stored in the archive; consider
            # an extraction filter if the Python version in use supports one.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the imported tarfile module is not rebound.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it has to be handled before the generic case.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading alx-movie-recommendation-project-2024, 250788671 bytes compressed
Downloaded and uncompressed: alx-movie-recommendation-project-2024
Data source import complete.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/alx-movie-recommendation-project-2024/train.csv
/kaggle/input/alx-movie-recommendation-project-2024/movies.csv
/kaggle/input/alx-movie-recommendation-project-2024/genome_tags.csv
/kaggle/input/alx-movie-recommendation-project-2024/genome_scores.csv
/kaggle/input/alx-movie-recommendation-project-2024/sample_submission.csv
/kaggle/input/alx-movie-recommendation-project-2024/tags.csv
/kaggle/input/alx-movie-recommendation-project-2024/imdb_data.csv
/kaggle/input/alx-movie-recommendation-project-2024/links.csv
/kaggle/input/alx-movie-recommendation-project-2024/test.csv


In [None]:
# Directory holding the competition CSV files; kept in one place so the path is not
# repeated for every table.
DATA_DIR = '/kaggle/input/alx-movie-recommendation-project-2024'

# One DataFrame per CSV file shipped with the competition data.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
submission_df = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
links_df = pd.read_csv(f'{DATA_DIR}/links.csv')
tags_df = pd.read_csv(f'{DATA_DIR}/tags.csv')
movies_df = pd.read_csv(f'{DATA_DIR}/movies.csv')
genome_scores_df = pd.read_csv(f'{DATA_DIR}/genome_scores.csv')
genome_tags_df = pd.read_csv(f'{DATA_DIR}/genome_tags.csv')
imdb_data_df = pd.read_csv(f'{DATA_DIR}/imdb_data.csv')


In [None]:
# Preview the first rows of the movies, training and test tables, in that order.
for preview_frame in (movies_df, train_df, test_df):
    print(preview_frame.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating   timestamp
0    5163    57669     4.0  1518349992
1  106343        5     4.5  1206238739
2  146790     5459     5.0  1076215539
3  106362    32296     2.0  1423042565
4    9041      366     3.0   833375837
   userId  movieId
0       1     2011
1       1     4144
2       1     5767
3       1     6711
4       1     7318


In [None]:
# Report the number of missing values per column for each table, with a separator line
# between consecutive tables (none after the last one).
null_report_tables = (
    ("Train: ", train_df),
    ("Test: ", test_df),
    ("Movies: ", movies_df),
    ("Links: ", links_df),
    ("IMDB: ", imdb_data_df),
    ("Genome scores: ", genome_scores_df),
    ("Genome tags: ", genome_tags_df),
)
for position, (label, table) in enumerate(null_report_tables):
    if position > 0:
        print("************")
    print(label)
    print(str(table.isnull().sum()))

Train: 
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
************
Test: 
userId     0
movieId    0
dtype: int64
************
Movies: 
movieId    0
title      0
genres     0
dtype: int64
************
Links: 
movieId      0
imdbId       0
tmdbId     107
dtype: int64
************
IMDB: 
movieId              0
title_cast       10068
director          9874
runtime          12089
budget           19372
plot_keywords    11078
dtype: int64
************
Genome scores: 
movieId      0
tagId        0
relevance    0
dtype: int64
************
Genome tags: 
tagId    0
tag      0
dtype: int64


In [None]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357241 sha256=fab5a6c3615c0108073c207c34a91cd5405e6ac014918319477a1a0e587ebe5b
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [None]:
from surprise import Reader, Dataset, SVD, accuracy
from sklearn.model_selection import train_test_split

In [None]:
# train_df
#reader = Reader(rating_scale=(1,5))
#train_data = Dataset.load_from_df(train_df[['userId','movieId','rating']],reader)
#trainset = train_data.build_full_trainset()

# Original approach, kept for reference: build the trainset from all of train_df (no validation split).

In [None]:
# Surprise clips predictions to the reader's rating scale, so the scale must cover every
# rating present in the data. Start from the original (1, 5) range and only widen it.
# NOTE(review): the train preview shows half-star ratings (e.g. 4.5); confirm whether
# values below 1 occur.
rating_min = min(1.0, float(train_df['rating'].min()))
rating_max = max(5.0, float(train_df['rating'].max()))
reader = Reader(rating_scale=(rating_min, rating_max))

# NOTE(review): `data` (all of train_df as a Surprise Dataset) is not used by any visible
# cell below; it can be dropped to save memory if nothing else relies on it.
data = Dataset.load_from_df(train_df[['userId', 'movieId', 'rating']], reader)

# Split data into training and validation sets (80/20, fixed seed for a repeatable split).
# The intermediate DataFrames get their own names so that train_data / val_data always
# refer to Surprise Datasets.
train_split_df, val_split_df = train_test_split(train_df, test_size=0.2, random_state=42)
train_data = Dataset.load_from_df(train_split_df[['userId', 'movieId', 'rating']], reader)
val_data = Dataset.load_from_df(val_split_df[['userId', 'movieId', 'rating']], reader)

# trainset is fitted on; valset is the held-out ratings in the form passed to svd.test().
trainset = train_data.build_full_trainset()
valset = val_data.build_full_trainset().build_testset()

In [None]:
# Seed the model's random initialisation so that re-running the notebook reproduces the
# same fitted model (and therefore the same validation metrics and submission).
svd = SVD(random_state=42)
svd.fit(trainset)

# test_user_predict_pairs = list(zip(test_df['userId'],test_df['movieId']))
# test_predict = [svd.predict(uid,mid).est
# for uid,mid in test_user_predict_pairs]

# test_df['predicted_rating'] = test_predict
# print(test_df.head())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f12cf6b7f10>

In [None]:
# Predict ratings for validation set
predictions = svd.test(valset)

# The accuracy helpers print their result by default; verbose=False leaves the formatted
# prints below as the only output, so each metric is reported once.
# Calculate RMSE for the validation set
rmse = accuracy.rmse(predictions, verbose=False)
# Calculate MAE for the validation set
mae = accuracy.mae(predictions, verbose=False)
# Calculate MSE for the validation set
mse = accuracy.mse(predictions, verbose=False)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"MSE: {mse}")

RMSE: 0.8343
MAE:  0.6332
MSE: 0.6960
RMSE: 0.8342516210254515
MAE: 0.6331835071219826
MSE: 0.6959757671835936


In [None]:
#def predict_ratings(test_df,model):
  #predictions= []
  #for row in test_df.itertuples():
    #user_id = row.userId
    #movie_id = row.movieId
    #prediction = model.predict(user_id,movie_id).est
    #predictions.append((user_id,movie_id,prediction))
    #predictions_df = pd.DataFrame(predictions,columns=['user_id','movie_id','predict_ratings'])
  #return predictions_df



In [None]:

#test_subset= test_df.sample(n=1000)
#predictions_df = predict_ratings(test_subset,svd)


In [None]:
#predictions_df = predict_ratings(test_df, svd)

In [None]:
from tqdm import tqdm


def predict_ratings(test_df, model):
    """Estimate a rating for every (userId, movieId) row of ``test_df``.

    Args:
        test_df: DataFrame providing ``userId`` and ``movieId`` columns.
        model: Object whose ``predict(user_id, movie_id)`` result exposes an
            ``est`` attribute holding the estimated rating.

    Returns:
        DataFrame with the columns ``user_id``, ``movie_id`` and
        ``predicted_rating``; one row per input row, in input order.
    """
    id_pairs = zip(test_df['userId'], test_df['movieId'])
    # tqdm only draws the progress bar; the total is passed explicitly because
    # a zip object has no length.
    estimates = [
        (uid, mid, model.predict(uid, mid).est)
        for uid, mid in tqdm(id_pairs, total=len(test_df), desc="Predicting ratings")
    ]
    # Build the frame once from the collected records.
    return pd.DataFrame(estimates, columns=['user_id', 'movie_id', 'predicted_rating'])


#test_subset= test_df.sample(n=1000)
#predictions_df = predict_ratings(test_subset,svd)

# Score every row of the test set with the fitted model; test_df and svd must already
# exist from the earlier loading and training cells.
predictions_df = predict_ratings(test_df, svd)


Predicting ratings: 100%|██████████| 5000019/5000019 [00:27<00:00, 183958.29it/s]


In [None]:

# Reshape the predictions into the two submission columns: an 'Id' of the form
# '<user_id>_<movie_id>' and the predicted 'rating'; the source columns are then dropped.
predictions_df = (
    predictions_df
    .assign(
        Id=lambda frame: frame['user_id'].astype(str) + '_' + frame['movie_id'].astype(str),
        rating=lambda frame: frame['predicted_rating'],
    )
    .drop(columns=['user_id', 'movie_id', 'predicted_rating'])
)
# Write the submission file without the index column and show its first rows.
predictions_df.to_csv('1_Submission.csv', index=False)
print(predictions_df.head())

       Id    rating
0  1_2011  3.371827
1  1_4144  4.090105
2  1_5767  3.572697
3  1_6711  4.257370
4  1_7318  2.748647


In [None]:
# google.colab can only be imported inside Colab; guard the import so the notebook still
# runs to completion elsewhere.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    # Hand the submission file to the Colab download helper.
    files.download('1_Submission.csv')
else:
    print("google.colab is not available; '1_Submission.csv' was written to the current working directory.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>