# Preparation

In [1]:
import shutil
shutil.unpack_archive('/content/Files (8).zip')

In [2]:
import pandas as pd
movies = pd.read_csv('/content/Files/movies.csv')
print(movies.columns)

Index(['mID', 'title', 'year', 'Doc', 'Com', 'Hor', 'Adv', 'Wes', 'Dra', 'Ani',
       'War', 'Chi', 'Cri', 'Thr', 'Sci', 'Mys', 'Rom', 'Fil', 'Fan', 'Act',
       'Mus'],
      dtype='object')


In [3]:
users = pd.read_csv('/content/Files/users.csv')
print(users.columns)

Index(['uID', 'gender', 'age', 'accupation', 'zip'], dtype='object')


## Load Train and Test Data

In [4]:
# Load train and test data
train_data = pd.read_csv('/content/Files/train.csv')
test_data = pd.read_csv('/content/Files/test.csv')

## Combine Data
Combine the 'movies', 'users', 'train_data', and 'test_data' dataframes to create a unified dataframe that includes information about users, movies, and their ratings.

In [5]:
# Merge movies and train_data on 'mID'
train_data = pd.merge(train_data, movies, on='mID', how='left')

# Merge users and train_data on 'uID'
train_data = pd.merge(train_data, users, on='uID', how='left')

# Merge movies and test_data on 'mID'
test_data = pd.merge(test_data, movies, on='mID', how='left')

# Merge users and test_data on 'uID'
test_data = pd.merge(test_data, users, on='uID', how='left')


## Create User-Item Matrix
Create a user-item matrix where rows represent users, columns represent movies, and the values represent ratings. This matrix will be used for matrix factorization.

In [6]:
# Create user-item matrix for training data
user_item_matrix_train = train_data.pivot(index='uID', columns='mID', values='rating').fillna(0)

# Create user-item matrix for test data
user_item_matrix_test = test_data.pivot(index='uID', columns='mID', values='rating').fillna(0)


## Apply Matrix Factorization

In [8]:
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error
from math import sqrt
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# Specify the number of latent factors
n_latent_factors = 10

# Initialize NMF model
nmf_model = NMF(n_components=n_latent_factors, init='random', random_state=42)
#nmf_model = NMF(n_components=n_latent_factors, init='random', max_iter=300, random_state=42)

# Fit the model to the user-item matrix for training data
W_train = nmf_model.fit_transform(user_item_matrix_train)
H_train = nmf_model.components_

# Predict the missing values in the user-item matrix for test data
predicted_ratings_test = np.dot(W_train, H_train)

# Get the non-zero elements for both predicted and actual arrays
nonzero_indices_test_rows, nonzero_indices_test_cols = np.where(user_item_matrix_test.values != 0)
predicted_ratings_nonzero_test = predicted_ratings_test[nonzero_indices_test_rows, nonzero_indices_test_cols]
actual_ratings_nonzero_test = user_item_matrix_test.values[nonzero_indices_test_rows, nonzero_indices_test_cols]

# Flatten the matrices for comparison
predicted_ratings_nonzero_test_flattened = predicted_ratings_nonzero_test.flatten()
actual_ratings_nonzero_test_flattened = actual_ratings_nonzero_test.flatten()

# Calculate RMSE for test data
rmse_test = sqrt(mean_squared_error(actual_ratings_nonzero_test_flattened, predicted_ratings_nonzero_test_flattened))

# Display RMSE for test data
print(f"RMSE for Test Data: {rmse_test}")

RMSE for Test Data: 3.4997546920962517


### Discuss the results and why sklearn's non-negative matrix facorization library did not work well compared to simple baseline or similarity-based methods we’ve done in Module 3. Can you suggest a way(s) to fix it

Answer: The relatively high RMSE value of 3.4997 indicates that the matrix factorization model did not perform well compared to simpler baseline or similarity-based methods from module 3.One way to potentially improve performance is to explore incorporating additional features or enhancing the model's hyperparameters.