# Task 7

## Imports

In [1]:
!pip install scikit-surprise
import pandas as pd
from surprise import accuracy, Dataset, SVD, SVDpp, NMF
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/154.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357257 sha256=981d9d69bdd5c1b99236ca57a6197401fad32ece4ffe281b17f39e00fd754147
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df

## Load Data

In [2]:
data = Dataset.load_builtin(name = 'ml-100k' , prompt = False)

Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


## Training

In [3]:
%%time
data = Dataset.load_builtin('ml-100k')
algorithms = {'SVD': SVD(), 'SVDpp': SVDpp(), 'NMF': NMF()}
cv = 5
measures = ['RMSE', 'MAE']

results = {}
check_results = {}
trained_models = {}

# cross-validation for each algorithm
for name, algorithm in algorithms.items():
    result = cross_validate(algorithm, data, measures=measures, cv=cv, verbose=True)
    results[name] = pd.DataFrame.from_dict(result).mean(axis=0)

    # Training and testing for each algorithm
    trainset, testset = train_test_split(data, test_size=0.25)
    algorithm.fit(trainset)
    predictions = algorithm.test(testset)
    rmse = accuracy.rmse(predictions, verbose=True)
    check_results[name] = rmse

    # Train the algorithms on the entire dataset
    algorithm.fit(data.build_full_trainset())
    trained_models[name] = algorithm


# Generate predictions for all user-item combinations
all_predictions = {}
for name, algorithm in trained_models.items():
    predictions = algorithm.test(data.build_full_trainset().build_testset())
    all_predictions[name] = predictions

# creating a final DataFrame to compare the results
result_df = pd.DataFrame(results)
check_results_df = pd.DataFrame.from_dict(check_results, orient='index', columns=['RMSE_Check'])

print(result_df)
print()
print(check_results_df)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9332  0.9370  0.9321  0.9411  0.9357  0.9358  0.0032  
MAE (testset)     0.7370  0.7419  0.7330  0.7408  0.7382  0.7382  0.0031  
Fit time          1.67    1.53    2.76    2.60    4.66    2.64    1.12    
Test time         0.23    0.37    0.28    0.82    0.28    0.39    0.22    
RMSE: 0.9379
Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9171  0.9266  0.9224  0.9120  0.9179  0.9192  0.0049  
MAE (testset)     0.7211  0.7253  0.7234  0.7179  0.7172  0.7210  0.0031  
Fit time          26.70   26.82   27.08   26.75   27.13   26.89   0.18    
Test time         5.46    4.44    5.32    4.67    5.01    4.98    0.38    
RMSE: 0.9232
Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Me

## Check recommendations

In [4]:
%%time
def load_movie_list(filename):
    with open(filename, encoding='ISO-8859-1') as file:
        movies = file.readlines()
    movie_names = [movie.strip().split(' ', 1)[1] for movie in movies]
    return movie_names


def make_recommendations_for_user(predictions, user_id, num_recommendations):
    user_predictions = [pred for pred in predictions if pred.uid == str(user_id)]
    user_predictions.sort(key=lambda x: x.est, reverse=True)
    top_predictions = user_predictions[:num_recommendations]
    recommendations = [(pred.iid, pred.est) for pred in top_predictions]
    return recommendations

def predict_movies_based_on_movie(predictions, movie_id, movie_names, num_recommendations=5):
    movie_predictions = [pred for pred in predictions if pred.iid == str(movie_id)]
    movie_predictions.sort(key=lambda x: x.est, reverse=True)
    top_predictions = movie_predictions[:num_recommendations]
    recommendations = [(movie_names[int(pred.uid) - 1], pred.est) for pred in top_predictions]
    return recommendations

movie_ids_file = 'movie_ids.txt'
movie_names = load_movie_list(movie_ids_file)

# Make recommendations for a specific user
user_id = 100
num_recommendations = 7
for name, algorithm in algorithms.items():
  user_recommendations = make_recommendations_for_user(all_predictions[name], user_id, num_recommendations)
  print(f"\nRecommendations for user {user_id} [{name}]:")
  for movie_id, estimated_rating in user_recommendations:
      movie_name = movie_names[int(movie_id) - 1]
      print(f"Movie: {movie_name}, ID: {movie_id}, Estimated Rating: {estimated_rating:.2f}")

# Predict movies based on a specific movie
movie_id = 316
movie_name_ = "As Good As It Gets (1997)"
# num_recommendations = 10
for name, algorithm in algorithms.items():
  movie_recommendations = predict_movies_based_on_movie(all_predictions[name], movie_id, movie_names, num_recommendations)
  print(f"\nMovies similar to '{movie_name_}' [{name}]:")
  for movie_name, estimated_rating in movie_recommendations:
      print(f"Movie: {movie_name}, Estimated Rating: {estimated_rating:.2f}")



Recommendations for user 100 [SVD]:
Movie: Titanic (1997), ID: 313, Estimated Rating: 4.37
Movie: Good Will Hunting (1997), ID: 272, Estimated Rating: 4.16
Movie: As Good As It Gets (1997), ID: 316, Estimated Rating: 4.01
Movie: L.A. Confidential (1997), ID: 302, Estimated Rating: 3.72
Movie: Game, The (1997), ID: 333, Estimated Rating: 3.71
Movie: Apt Pupil (1998), ID: 315, Estimated Rating: 3.61
Movie: Contact (1997), ID: 258, Estimated Rating: 3.60

Recommendations for user 100 [SVDpp]:
Movie: Good Will Hunting (1997), ID: 272, Estimated Rating: 4.29
Movie: Titanic (1997), ID: 313, Estimated Rating: 4.13
Movie: Contact (1997), ID: 258, Estimated Rating: 4.06
Movie: As Good As It Gets (1997), ID: 316, Estimated Rating: 3.95
Movie: L.A. Confidential (1997), ID: 302, Estimated Rating: 3.91
Movie: Seven Years in Tibet (1997), ID: 690, Estimated Rating: 3.86
Movie: Liar Liar (1997), ID: 294, Estimated Rating: 3.80

Recommendations for user 100 [NMF]:
Movie: As Good As It Gets (1997), ID

## Summary of Recommendation System Results

This analysis evaluated three recommendation system algorithms (SVD, SVD++, NMF) using the scikit-surprise library.


- SVD++ achieved the lowest RMSE and MAE, suggesting potentially better prediction accuracy. However, its training time is significantly higher compared to SVD and NMF.
- NMF had the highest RMSE and MAE, potentially indicating lower prediction accuracy.
- SVD offers a good balance between performance (decent RMSE and MAE) and training efficiency (fastest training time).

### Recommendations for Further Action

* **Additional User Testing:** While the metrics provide insights into algorithm performance, user testing is crucial, real users can check which recommendations they find most relevant and helpful.

* **Consider Application Needs:** If recommendation speed is a priority, SVD might be a good choice. If accuracy is critical and training time is less of a concern, SVD++ could be a potential candidate.

* **Explore Other Algorithms:** The scikit-surprise library offers other algorithms like ALS (Alternating Least Squares) and FunkSVD. Consider testing these to see if they outperform the evaluated ones in specific use cases.


## Part 2: Recomendation system from scratch

### Imports and functions

In [5]:
import numpy as np
import pandas as pd
from scipy.io import loadmat
from sklearn.metrics import mean_squared_error, mean_absolute_error

# def load_movie_list(filename):
#     with open(filename, encoding='ISO-8859-1') as file:
#         movies = file.readlines()
#     movie_names = [movie.strip().split(' ', 1)[1] for movie in movies]
#     return movie_names

def normalize_ratings(Y, R):
    Ymean = np.sum(Y, axis=1) / np.sum(R, axis=1)
    Ymean = np.nan_to_num(Ymean)  # Ensure no NaNs in Ymean
    Ynorm = Y - Ymean[:, None] * R
    return Ynorm, Ymean

def cofi_cost_func(params, Y, R, num_users, num_movies, num_features, lambda_=0.0):
    X = params[:num_movies * num_features].reshape(num_movies, num_features)
    Theta = params[num_movies * num_features:].reshape(num_users, num_features)

    J = (1 / 2) * np.sum((np.dot(X, Theta.T) * R - Y) ** 2)
    J += (lambda_ / 2) * (np.sum(Theta ** 2) + np.sum(X ** 2))

    X_grad = ((np.dot(X, Theta.T) * R - Y) @ Theta) + lambda_ * X
    Theta_grad = ((np.dot(X, Theta.T) * R - Y).T @ X) + lambda_ * Theta

    grad = np.concatenate([X_grad.ravel(), Theta_grad.ravel()])
    return J, grad

def gradient_descent(Y, R, num_users, num_movies, num_features, alpha=0.002, lambda_=0.02, iterations=1000):
    X = np.random.rand(num_movies, num_features)
    Theta = np.random.rand(num_users, num_features)
    params = np.concatenate([X.ravel(), Theta.ravel()])

    print('Gradient descent calculations:')
    for i in range(iterations):
        cost, grad = cofi_cost_func(params, Y, R, num_users, num_movies, num_features, lambda_)
        params -= alpha * grad
        if i % 100 == 0:
            print(f'Iteration {i}: cost = {cost}')

    X = params[:num_movies * num_features].reshape(num_movies, num_features)
    Theta = params[num_movies * num_features:].reshape(num_users, num_features)

    return X, Theta

def predict_ratings(X, Theta, Ymean):
    predictions = np.dot(X, Theta.T) + Ymean[:, None]
    # print(f"Predictions before clipping:\n{predictions}")
    return np.clip(predictions, 1, 5)


def make_recommendations(predicted_ratings, movie_names, user_id, num_recommendations):
    user_row = predicted_ratings[:, user_id - 1]
    sorted_indices = np.argsort(user_row)[::-1]
    top_indices = sorted_indices[:num_recommendations]
    recommendations = [(idx + 1, movie_names[idx], user_row[idx]) for idx in top_indices]
    return recommendations

def predict_movies(movie_name, movie_names, Y, R, num_recommendations=5):
    movie_index = movie_names.index(movie_name)
    movie_user_ratings = Y[movie_index]
    moviematrix = pd.DataFrame(Y, index=movie_names)
    similar_to_movie = moviematrix.T.corrwith(moviematrix.loc[movie_name])
    corr_movie = pd.DataFrame(similar_to_movie, columns=['correlation'])
    corr_movie.dropna(inplace=True)
    ratings_count = R.sum(axis=1)
    corr_movie['number of ratings'] = ratings_count
    predictions = corr_movie[corr_movie['number of ratings'] > 100].sort_values('correlation', ascending=False)
    return predictions.head(num_recommendations)

def calculate_rmse(Y, R, predicted_ratings):
    # Flatten arrays and filter only rated movies
    y_true = Y[R == 1]
    y_pred = predicted_ratings[R == 1]
    return np.sqrt(mean_squared_error(y_true, y_pred))

def calculate_mae(Y, R, predicted_ratings):
    # Flatten arrays and filter only rated movies
    y_true = Y[R == 1]
    y_pred = predicted_ratings[R == 1]
    return mean_absolute_error(y_true, y_pred)


## Training

In [6]:
%%time
# Load data
movie_ids_file = 'movie_ids.txt'
movie_names = load_movie_list(movie_ids_file)
movies_file = 'movies.mat'
data = loadmat(movies_file)
Y, R = data['Y'], data['R']

# Normalize ratings
Ynorm, Ymean = normalize_ratings(Y, R)

# Matrix Factorization
num_users, num_movies = Y.shape[1], Y.shape[0]
num_features = 10  # Number of latent features
X, Theta = gradient_descent(Ynorm, R, num_users, num_movies, num_features)

# Predict ratings
predicted_ratings = predict_ratings(X, Theta, Ymean)
predicted_ratings = np.clip(predicted_ratings, 1, 5)

# Calculate RMSE
rmse = calculate_rmse(Y, R, predicted_ratings)
print(f"RMSE of the predicted ratings: {rmse}")

# Calculate MAE
mae = calculate_mae(Y, R, predicted_ratings)
print(f"MAE of the predicted ratings: {mae}")


Gradient descent calculations:
Iteration 0: cost = 384220.4950939644
Iteration 100: cost = 30797.048345566127
Iteration 200: cost = 26936.25409685916
Iteration 300: cost = 25699.100499262022
Iteration 400: cost = 25078.00398556196
Iteration 500: cost = 24694.504631939253
Iteration 600: cost = 24435.451575076007
Iteration 700: cost = 24249.819101422163
Iteration 800: cost = 24109.60244047785
Iteration 900: cost = 23998.811863666397
RMSE of the predicted ratings: 0.6876052363190657
MAE of the predicted ratings: 0.528797810570718
CPU times: user 1min 21s, sys: 40.6 s, total: 2min 2s
Wall time: 1min 11s


## Check recommendations

In [7]:
user_id = 100
num_recommendations = 7
recommendations = make_recommendations(predicted_ratings, movie_names, user_id, num_recommendations)

# Display recommendations
print(f"\nRecommendations for user with id={user_id}:")
for position, movie_name, rating in recommendations:
    print(f"ID {position}: {movie_name}, Predicted Rating: {rating:.2f}")

# General movie recommendations based on a specific movie
movie_name_input = "As Good As It Gets (1997)"
movie_recommendations = predict_movies(movie_name_input, movie_names, Y, R, num_recommendations+1)
print(f"\nRecommendations based on the movie '{movie_name_input}':")
for idx, row in movie_recommendations.iloc[1:].iterrows():
    print(f"Movie: {idx}, Correlation: {row['correlation']:.2f}, Number of ratings: {row['number of ratings']:.0f}")



Recommendations for user with id=100:
ID 1315: Inventing the Abbotts (1997), Predicted Rating: 4.89
ID 1450: Golden Earrings (1947), Predicted Rating: 4.86
ID 721: Mallrats (1995), Predicted Rating: 4.77
ID 1209: Mixed Nuts (1994), Predicted Rating: 4.73
ID 1293: Star Kid (1997), Predicted Rating: 4.65
ID 61: Three Colors: White (1994), Predicted Rating: 4.61
ID 538: Anastasia (1997), Predicted Rating: 4.54

Recommendations based on the movie 'As Good As It Gets (1997)':
Movie: Apt Pupil (1998), Correlation: 0.59, Number of ratings: 160
Movie: Good Will Hunting (1997), Correlation: 0.50, Number of ratings: 198
Movie: Wag the Dog (1997), Correlation: 0.42, Number of ratings: 137
Movie: Titanic (1997), Correlation: 0.34, Number of ratings: 350
Movie: Tomorrow Never Dies (1997), Correlation: 0.32, Number of ratings: 180
Movie: Amistad (1997), Correlation: 0.30, Number of ratings: 124
Movie: L.A. Confidential (1997), Correlation: 0.29, Number of ratings: 297


## Summary for custom recommendation system:
*  The custom recommendation system works!

*  The custom recommendation system demonstrates a significantly lower RMSE and MAE compared to the Scikit-learn algorithms (SVD, SVDpp, and NMF), indicating a higher prediction accuracy.

* Trained faster (around 2 minutes) compared to scikit-learn's SVD++ (around 5 minutes). However, the custom system takes more time due to the iterative gradient descent process compared to other algirithms.

* The recommendations from the custom system also vary from those generated by the Scikit-learn algorithms, reflecting different underlying methods of user-item rating prediction.

* Calulated Estimated Rating in the custom recommendation system higher than in the scikit-learn

### Comparison Summary
The comparison between the two recommendation systems reveals the following:

1. **Overlap in Recommendations**:
   - Both systems recommend **Titanic (1997)** and **Good Will Hunting (1997)** for user 100.
   - The custom system includes unique movies such as "Inventing the Abbotts (1997)", "Golden Earrings (1947)", and "Mallrats (1995)" that are not found in the Surprise-based system's recommendations.

2. **Movies Similar to 'As Good As It Gets (1997)'**:
   - Common movies suggested across both systems include **Apt Pupil (1998)** and **Good Will Hunting (1997)**.
   - The Surprise-based system suggests classic and highly-rated older movies like **Streetcar Named Desire, A (1951)** and **Notorious (1946)**, while the custom system focuses more on correlation-based recommendations from the same era as "As Good As It Gets (1997)".

In summary, while there are overlaps in some recommendations, particularly for popular movies like "Titanic" and "Good Will Hunting", the custom recommendation system provides a unique set of movie suggestions for user 100, especially emphasizing movies that are less mainstream. Similarly, the recommendations based on similarity to "As Good As It Gets (1997)" highlight different methodologies: the Surprise-based system focuses on highly-rated classics, while the custom system suggests movies with strong correlation based on user ratings.