In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movielens/movies.csv
/kaggle/input/movielens/ratings.csv
/kaggle/input/movielens/README.txt
/kaggle/input/movielens/tags.csv
/kaggle/input/movielens/links.csv


In [2]:
import pandas as pd

# Read the MovieLens ratings CSV from the Kaggle input directory
file_path = '/kaggle/input/movielens/ratings.csv'
movie_lens_data = pd.read_csv(file_path)

# Keep only the three columns used downstream (the timestamp column is not selected)
rating_columns = ['userId', 'movieId', 'rating']
ratings = movie_lens_data.loc[:, rating_columns]

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rating rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(ratings, test_size=0.2, random_state=42)
train_data, test_data = split_parts

# Report the (rows, columns) shape of each partition
for label, part in (("Train set shape:", train_data), ("Test set shape:", test_data)):
    print(label, part.shape)

Train set shape: (838860, 3)
Test set shape: (209715, 3)


In [4]:
# Reshape each long (userId, movieId, rating) table into a user x movie grid.
# Cells with no rating are set to 0 here.
pivot_kwargs = {'index': 'userId', 'columns': 'movieId', 'values': 'rating'}
training_data = train_data.pivot(**pivot_kwargs).fillna(0)
testing_data = test_data.pivot(**pivot_kwargs).fillna(0)

# Preview the top of the training grid
print("First few rows of the training data:")
training_preview = training_data.head(5)
print(training_preview)

First few rows of the training data:
movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           0.0     3.0     0.0     0.0     0.0     0.0     0.0     0.0   

movieId  9       10      ...  129235  129303  129350  129354  129428  129707  \
userId                   ...                                                   
1           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     4.0  ...     0.0 

In [5]:
# Rebuild the user x movie grid WITHOUT zero-filling, so unrated cells stay NaN
nan_pivot_kwargs = dict(index='userId', columns='movieId', values='rating')
training_data = train_data.pivot(**nan_pivot_kwargs)

In [6]:
# Flatten the user x movie grid into one long Series and average it
stacked_ratings = training_data.stack()
global_mean = stacked_ratings.mean()
print("Global Mean of Ratings in the Training Data:", global_mean)

Global Mean of Ratings in the Training Data: 3.5285226378656747


In [7]:
# Regularised user bias:
#     b_u = (sum of u's ratings - n_u * global_mean) / (0.99 + n_u)
# The extra 0.99 in the denominator shrinks the bias towards 0, most strongly
# for users with few ratings.
USER_BIAS_DAMPING = 0.99

# Row-wise aggregates over the user x movie grid, computed in one vectorised pass
# instead of a Python loop over every user. Both count() and sum() skip NaN
# (unrated) cells, which matches dropping them first.
user_rating_counts = training_data.count(axis=1)
user_rating_sums = training_data.sum(axis=1)

user_bias_series = (
    (user_rating_sums - user_rating_counts * global_mean)
    / (USER_BIAS_DAMPING + user_rating_counts)
)

# Keep the original dict interface: {userId: bias}
users_bias = user_bias_series.to_dict()

# Display the first few entries of users' bias
print("Users' Bias:")
print(list(users_bias.items())[:5])

Users' Bias:
[(1, 0.19155139157958415), (2, 0.366327091149271), (3, 0.5088666277028573), (4, 0.17739993655392602), (5, 0.7702509226553298)]


In [8]:
# Regularised item bias:
#     b_i = (sum of i's ratings - n_i * global_mean) / (0.99 + n_i)
# The extra 0.99 in the denominator shrinks the bias towards 0, most strongly
# for movies with few ratings.
ITEM_BIAS_DAMPING = 0.99

# Column-wise aggregates over the user x movie grid, computed in one vectorised
# pass instead of a Python loop over every movie. Both count() and sum() skip
# NaN (unrated) cells, which matches dropping them first.
item_rating_counts = training_data.count(axis=0)
item_rating_sums = training_data.sum(axis=0)

item_bias_series = (
    (item_rating_sums - item_rating_counts * global_mean)
    / (ITEM_BIAS_DAMPING + item_rating_counts)
)

# Keep the original dict interface: {movieId: bias}
items_bias = item_bias_series.to_dict()

# Display the first few entries of items' bias
print("Items' Bias:")
print(list(items_bias.items())[:5])

Items' Bias:
[(1, 0.4484826389628273), (2, -0.2784949032871909), (3, -0.3608503885668626), (4, -0.47228298434824195), (5, -0.42528761017005734)]


In [None]:
# Impute every unrated (user, movie) cell with the baseline estimate
#     global_mean + user bias + item bias
# and store the result in a new frame, `filled_training_data`.
#
# The bias dicts are keyed by userId / movieId LABELS, so they are looked up via
# the frame's index and columns. np.where() would give 0-based positions, which
# are not valid keys for those dicts or for label-based `.at`.
user_bias_vec = np.array(
    [users_bias.get(user_id, 0) for user_id in training_data.index], dtype=float
)
item_bias_vec = np.array(
    [items_bias.get(item_id, 0) for item_id in training_data.columns], dtype=float
)

# Column vector + row vector broadcasts to the full users x movies baseline grid
baseline = global_mean + user_bias_vec[:, np.newaxis] + item_bias_vec[np.newaxis, :]

# Observed ratings are kept; only NaN cells take the baseline value
observed = training_data.to_numpy(dtype=float)
missing_mask = np.isnan(observed)

filled_training_data = pd.DataFrame(
    np.where(missing_mask, baseline, observed),
    index=training_data.index,
    columns=training_data.columns,
)

# The SVD downstream cannot handle NaN, so fail loudly if any survived
assert not np.isnan(filled_training_data.to_numpy()).any(), "imputation left NaN cells"


In [None]:
import numpy as np

# Thin SVD of the filled training matrix.
# np.linalg.svd returns the right factor already transposed, so the third
# result (named V here) has one row per singular component.
svd_factors = np.linalg.svd(filled_training_data, full_matrices=False)
U, S, V = svd_factors

# Report the shape of each factor
for label, factor in (("Shape of U:", U), ("Shape of S:", S), ("Shape of V:", V)):
    print(label, factor.shape)

In [None]:
# Number of leading singular components to keep
Approx = 50  # Adjust this value based on your preference

# Reduce U, S, V to the first `Approx` components
U = U[:, :Approx]
V = V[:Approx, :]
# S is 1-D when it comes straight from np.linalg.svd. If this cell is re-run, S
# is already a diagonal matrix, and np.diag() on 2-D input would extract its
# diagonal instead of building one. Handle both so the cell is safe to re-run.
S = np.diag(S[:Approx]) if np.ndim(S) == 1 else S[:Approx, :Approx]

# Display the shapes of the reduced matrices
print("Shape of U (reduced):", U.shape)
print("Shape of S (reduced):", S.shape)
print("Shape of V (reduced):", V.shape)

In [None]:
# Calculate Z, the rank-reduced reconstruction of the decomposed matrix.
# The factors must be multiplied in the order U · S · V:
#   U is (users x k), S is the (k x k) diagonal matrix, V is (k x movies).
# Multiplying V by S first is a shape mismatch ((k x movies) @ (k x k)).
Z = U @ S @ V

# Display the shape of Z
print("Shape of Z:", Z.shape)

In [None]:
# Dense array that Z is compared against. It was never defined in the notebook,
# so it is built here from the matrix passed to the SVD.
# NOTE(review): assumes the intended reference is the bias-filled training
# matrix, not the raw ratings — confirm.
training_array = filled_training_data.to_numpy()

# Calculate MAE: mean absolute element-wise difference between the reference
# matrix and its reconstruction Z
mae = np.mean(np.abs(training_array - Z))
print("Mean Absolute Error (MAE):", mae)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.linalg import svd
from sklearn.metrics import mean_absolute_error

# Values of Approx (number of singular components kept) to consider
approx_values = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

# Dense array each reconstruction is compared against.
# NOTE(review): assumes the intended reference is the bias-filled training
# matrix that was decomposed — confirm.
training_array = filled_training_data.to_numpy()

# By this point S may be the 1-D vector returned by np.linalg.svd or the 2-D
# diagonal matrix built by the truncation cell. Recover the 1-D singular values
# either way, so slicing them per `approx` is well defined.
singular_values = np.diag(S) if np.ndim(S) == 2 else np.asarray(S)
assert max(approx_values) <= singular_values.size, (
    "approx_values asks for more components than U, S, V currently hold"
)

# Dictionary to store MAE for each configuration: {approx: mae}
mae_results = {}

# Redo the reduce -> reconstruct -> MAE steps for each value of Approx
for approx in approx_values:
    # Reduce U, S, V to the leading `approx` components
    U_approx = U[:, :approx]
    V_approx = V[:approx, :]
    S_approx = np.diag(singular_values[:approx])

    # Calculate Z; the factors must be multiplied in the order U · S · V
    Z_approx = U_approx @ S_approx @ V_approx

    # Calculate MAE between the reference matrix and this reconstruction
    mae = mean_absolute_error(training_array, Z_approx)

    # Store the MAE for the current configuration
    mae_results[approx] = mae

# Plot the bar graph. The dict views are materialised as lists so matplotlib
# receives plain sequences.
fig, ax = plt.subplots()
ax.bar(list(mae_results.keys()), list(mae_results.values()))
ax.set_xlabel('Approx')
ax.set_ylabel('MAE')
ax.set_title('MAE for Different Values of Approx')
plt.show()