In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

ModuleNotFoundError: No module named 'surprise'

In [2]:
pip install surprise

Collecting surprise
  Obtaining dependency information for surprise from https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl.metadata
  Using cached surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Using cached scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies: started
  Installing build dependencies: still running...
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Using cached surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml): started
  Building wheel for scikit-surprise (pyproject.toml): f

  error: subprocess-exited-with-error
  
  Building wheel for scikit-surprise (pyproject.toml) did not run successfully.
  exit code: 1
  
  [117 lines of output]
  running bdist_wheel
  running build
  running build_py
  creating build
  creating build\lib.win-amd64-cpython-311
  creating build\lib.win-amd64-cpython-311\surprise
  copying surprise\accuracy.py -> build\lib.win-amd64-cpython-311\surprise
  copying surprise\builtin_datasets.py -> build\lib.win-amd64-cpython-311\surprise
  copying surprise\dataset.py -> build\lib.win-amd64-cpython-311\surprise
  copying surprise\dump.py -> build\lib.win-amd64-cpython-311\surprise
  copying surprise\reader.py -> build\lib.win-amd64-cpython-311\surprise
  copying surprise\trainset.py -> build\lib.win-amd64-cpython-311\surprise
  copying surprise\utils.py -> build\lib.win-amd64-cpython-311\surprise
  copying surprise\__init__.py -> build\lib.win-amd64-cpython-311\surprise
  copying surprise\__main__.py -> build\lib.win-amd64-cpython-311\surp

In [None]:
# Read every input CSV from the working directory into its own DataFrame.
# Files are read in the order listed, and each result is bound to the
# matching name on the left-hand side.
(
    train_df,
    test_df,
    movies_df,
    genome_scores_df,
    genome_tags_df,
    imdb_data_df,
    links_df,
    tags_df,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train.csv',
        'test.csv',
        'movies.csv',
        'genome_scores.csv',
        'genome_tags.csv',
        'imdb_data.csv',
        'links.csv',
        'tags.csv',
    )
)

In [None]:
# Build a per-movie feature table and join it onto the training ratings.
# NOTE: this code rebinds genome_scores_df and movies_df, so running it a
# second time re-applies the merges to already-merged frames; reload the CSVs
# before re-running.

# Attach each genome tag's name to its relevance score (joined on tagId)
genome_scores_df = genome_scores_df.merge(genome_tags_df, on='tagId')

# Reshape to one row per movieId and one column per tag name, holding relevance
genome_pivot_df = genome_scores_df.pivot(index='movieId', columns='tag', values='relevance').reset_index()

# Add the tag-relevance columns to the movies table (movies without scores get NaN)
movies_df = movies_df.merge(genome_pivot_df, on='movieId', how='left')

# Attach IMDb metadata to the link table.
# NOTE(review): assumes links_df has 'imdbId' and imdb_data_df has 'imdb_id' -- confirm against the CSV headers.
imdb_merged_df = links_df.merge(imdb_data_df, left_on='imdbId', right_on='imdb_id', how='left')

# Add the IMDb/link columns to the movies table
movies_df = movies_df.merge(imdb_merged_df, on='movieId', how='left')

# Concatenate all user tags of a movie into one space-separated string.
# Missing tags are dropped and the rest cast to str first: str.join raises
# TypeError on NaN or any other non-string value.
tags_agg_df = (
    tags_df.dropna(subset=['tag'])
    .assign(tag=lambda frame: frame['tag'].astype(str))
    .groupby('movieId')['tag']
    .agg(' '.join)
    .reset_index()
)

# Add the aggregated tag string to the movies table
movies_df = movies_df.merge(tags_agg_df, on='movieId', how='left')

# Expand the '|'-separated genres string into one indicator column per genre
genres_df = movies_df['genres'].str.get_dummies(sep='|')

# Append the genre indicator columns to the movies table
movies_df = pd.concat([movies_df, genres_df], axis=1)

# Drop identifier/text columns that are no longer needed (rebinding, not in-place)
movies_df = movies_df.drop(columns=['imdb_id', 'tmdbId', 'title', 'genres'])

# Join the movie features onto every training rating row
train_merged_df = train_df.merge(movies_df, on='movieId', how='left')

# Fixed seed so the hold-out split and the SVD initialisation are reproducible
RANDOM_SEED = 42

# Reader describing the rating scale passed to surprise
reader = Reader(rating_scale=(0.5, 5.0))

# Load the (userId, movieId, rating) triples into a surprise Dataset
data = Dataset.load_from_df(train_merged_df[['userId', 'movieId', 'rating']], reader)

# Hold out 20% of the ratings for validation
trainset, testset = train_test_split(data, test_size=0.2, random_state=RANDOM_SEED)

# Fit an SVD model with default hyper-parameters on the training split
svd = SVD(random_state=RANDOM_SEED)
svd.fit(trainset)

# Predict the held-out ratings
predictions = svd.test(testset)

# verbose=False keeps accuracy.rmse from printing its own "RMSE" line,
# which would duplicate the print below
rmse = accuracy.rmse(predictions, verbose=False)
print(f'RMSE: {rmse}')

# Look up a single estimated rating from a model
def predict_rating(user_id, movie_id, model):
    """Return the estimated rating for one (user, movie) pair.

    Calls ``model.predict(user_id, movie_id)`` and returns the ``est``
    attribute of the object it gives back.
    """
    prediction = model.predict(user_id, movie_id)
    return prediction.est

# Predict a rating for every (userId, movieId) pair in the test set.
# The two id columns are iterated directly instead of using
# DataFrame.apply(axis=1), which builds a Series object for every row and
# returns an empty DataFrame (not a column) when test_df has no rows.
test_df['rating'] = [
    predict_rating(user_id, movie_id, svd)
    for user_id, movie_id in zip(test_df['userId'], test_df['movieId'])
]

# Submission key has the form "<userId>_<movieId>"
test_df['Id'] = test_df['userId'].astype(str) + '_' + test_df['movieId'].astype(str)

# Keep only the two submission columns and write them without the index
submission_df = test_df[['Id', 'rating']]
submission_df.to_csv('submission.csv', index=False)