### Imports

In [1]:
from fancyimpute import SoftImpute
import numpy as np
import pandas as pd
import random
from sklearn.metrics import mean_squared_error


### Helper Functions

In [8]:
def load_dataset(file_path, name):
    if name == "ml-100k":
        col_names = ["user_id", "item_id", "rating", "timestamp"]
        df = pd.read_csv(file_path, sep="\t", header=None, names=col_names)
    else:
        col_names = ["user_id", "item_id", "rating", "timestamp"]
        df = pd.read_csv(file_path, sep="::", header=None, names=col_names, engine="python")
    return df

def create_matrix(df):
    num_users = df["user_id"].nunique()
    num_items = df["item_id"].nunique()
    matrix = np.zeros((num_users, num_items))

    user_mapping = {user_id: idx for idx, user_id in enumerate(df["user_id"].unique())}
    item_mapping = {item_id: idx for idx, item_id in enumerate(df["item_id"].unique())}

    for row in df.itertuples():
        matrix[user_mapping[row[1]], item_mapping[row[2]]] = row[3]

    return matrix

def remove_half_entries(matrix):
    non_zero_indices = np.nonzero(~np.isnan(matrix))
    non_zero_count = len(non_zero_indices[0])
    to_remove_count = non_zero_count // 2

    removal_indices = random.sample(range(non_zero_count), to_remove_count)
    removed_entries = []

    for idx in removal_indices:
        row = non_zero_indices[0][idx]
        col = non_zero_indices[1][idx]
        removed_entries.append((row, col, matrix[row, col]))
        matrix[row, col] = np.nan

    return matrix, removed_entries

def soft_impute(matrix, max_iters=100, convergence_threshold=0.001):
    imputer = SoftImpute(max_iters=max_iters, convergence_threshold=convergence_threshold)
    return imputer.fit_transform(matrix)

def load_matrix_from_csv(file_path):
    df = pd.read_csv(file_path, header=0, index_col=0)
    df = df.replace("NA", np.nan)
    return df

def save_matrix_to_csv(matrix, file_path):
    matrix.to_csv(file_path)


### Main

In [3]:
CONFIG = {
    "max_iters": 100,
    "seed": 123,
    "dataset": 'ml-100k' # "ml-100k" or "ml-1m"
}

In [4]:
if CONFIG["dataset"] == 'ml-100k':
    data_file = "./ml-100k/u.data"
else:
    data_file = "./ml-1m/ratings.dat"
df = load_dataset(data_file, CONFIG["dataset"])
random.seed(CONFIG["seed"])
user_item_matrix = create_matrix(df)
masked_matrix, removed_entries = remove_half_entries(user_item_matrix.copy())

completed_matrix = soft_impute(masked_matrix, CONFIG["max_iters"])

ground_truth = np.array([entry[2] for entry in removed_entries])
imputed_values = np.array([completed_matrix[entry[0], entry[1]] for entry in removed_entries])

mse = mean_squared_error(ground_truth, imputed_values)
print("MSE between imputed missing half and ground truth missing half:", mse)

[SoftImpute] Max Singular Value of X_init = 322.477917
[SoftImpute] Iter 1: observed MAE=0.077896 rank=648
[SoftImpute] Iter 2: observed MAE=0.080451 rank=594
[SoftImpute] Iter 3: observed MAE=0.082215 rank=563
[SoftImpute] Iter 4: observed MAE=0.083527 rank=539
[SoftImpute] Iter 5: observed MAE=0.084540 rank=525
[SoftImpute] Iter 6: observed MAE=0.085383 rank=513
[SoftImpute] Iter 7: observed MAE=0.086076 rank=503
[SoftImpute] Iter 8: observed MAE=0.086656 rank=495
[SoftImpute] Iter 9: observed MAE=0.087150 rank=489
[SoftImpute] Iter 10: observed MAE=0.087572 rank=484
[SoftImpute] Iter 11: observed MAE=0.087931 rank=479
[SoftImpute] Iter 12: observed MAE=0.088248 rank=475
[SoftImpute] Iter 13: observed MAE=0.088523 rank=473
[SoftImpute] Iter 14: observed MAE=0.088762 rank=469
[SoftImpute] Iter 15: observed MAE=0.088969 rank=467
[SoftImpute] Iter 16: observed MAE=0.089152 rank=464
[SoftImpute] Iter 17: observed MAE=0.089308 rank=462
[SoftImpute] Iter 18: observed MAE=0.089449 rank=461


### Impute Feedback

In [9]:
matrix = load_matrix_from_csv('./Feedback.csv')
imputed_matrix = soft_impute(matrix)
imputed_matrix = np.round(imputed_matrix).astype(int)
completed_matrix =pd.DataFrame(imputed_matrix, index=matrix.index, columns=matrix.columns)
save_matrix_to_csv(completed_matrix, './imputedFeedback.csv')

[SoftImpute] Max Singular Value of X_init = 42.723211
[SoftImpute] Iter 1: observed MAE=0.144944 rank=15
[SoftImpute] Iter 2: observed MAE=0.144931 rank=15
[SoftImpute] Iter 3: observed MAE=0.144926 rank=15
[SoftImpute] Iter 4: observed MAE=0.144895 rank=15
[SoftImpute] Iter 5: observed MAE=0.144873 rank=15
[SoftImpute] Iter 6: observed MAE=0.144848 rank=15
[SoftImpute] Iter 7: observed MAE=0.144845 rank=15
[SoftImpute] Iter 8: observed MAE=0.144891 rank=15
[SoftImpute] Iter 9: observed MAE=0.144927 rank=15
[SoftImpute] Iter 10: observed MAE=0.144945 rank=15
[SoftImpute] Iter 11: observed MAE=0.144272 rank=14
[SoftImpute] Iter 12: observed MAE=0.143061 rank=14
[SoftImpute] Iter 13: observed MAE=0.142581 rank=14
[SoftImpute] Iter 14: observed MAE=0.142406 rank=14
[SoftImpute] Iter 15: observed MAE=0.142348 rank=14
[SoftImpute] Iter 16: observed MAE=0.142289 rank=14
[SoftImpute] Iter 17: observed MAE=0.142208 rank=14
[SoftImpute] Iter 18: observed MAE=0.142156 rank=14
[SoftImpute] Iter 1