1. Import Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

from scipy.sparse.linalg import svds


2. Load the Dataset

In [None]:
# Ratings table: the input for every model in this notebook
ratings_path = 'ratings_small.csv'
ratings = pd.read_csv(ratings_path)

# Movie metadata, read so that movie IDs can be mapped to titles
movies_path = 'movies.csv'
movies = pd.read_csv(movies_path)


3. Data Cleaning

In [None]:
# Count the missing entries in every column of the ratings table
missing_per_column = ratings.isna().sum()
print(missing_per_column)

# No imputation is applied: the counts printed above are expected to be all
# zero for ratings_small.csv (confirm from the output). If a gap ever shows
# up, one option is to fill it with the column mean:
# ratings = ratings.fillna({'rating': ratings['rating'].mean()})


4. Exploratory Data Analysis (EDA)

In [None]:
# First few rows, as a quick look at the columns
print(ratings.head())

# How many ratings each user gave / each movie received
ratings_per_user = ratings.groupby('userId')['rating'].count()
ratings_per_movie = ratings.groupby('movieId')['rating'].count()

# One histogram per entry: (values, number of bins, title, x label, y label)
histogram_specs = [
    (ratings['rating'], 5, 'Distribution of Ratings', 'Rating', 'Count'),
    (ratings_per_user, 30, 'Number of Ratings per User', 'Number of Ratings', 'Count of Users'),
    (ratings_per_movie, 30, 'Number of Ratings per Movie', 'Number of Ratings', 'Count of Movies'),
]

for values, n_bins, plot_title, x_label, y_label in histogram_specs:
    sns.histplot(values, bins=n_bins, kde=False)
    plt.title(plot_title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()


5. Baseline Model

In [None]:

# Compute the global mean rating.
# NOTE(review): the mean is taken over every row of `ratings`, including the
# rows that are later held out for testing.
global_mean = ratings['rating'].mean()
print(f'Global Mean Rating: {global_mean}')

# Constant prediction for every rating. It is kept in its own Series rather
# than written into `ratings`, so the frame that is later split, pivoted and
# cross-validated is not modified by this cell.
baseline_pred = pd.Series(global_mean, index=ratings.index, name='baseline_pred')

# Calculate RMSE for the baseline model
baseline_rmse = np.sqrt(mean_squared_error(ratings['rating'], baseline_pred))
print(f'Baseline RMSE: {baseline_rmse}')


6. Train-Test Split

In [None]:
# Hold out 20% of the rating rows for testing; the fixed seed keeps the split repeatable
train_data, test_data = train_test_split(
    ratings,
    test_size=0.2,
    random_state=42,
)


7. K-NN Model

7.1 Prepare the Data

In [None]:
# User x movie table of the training ratings; pairs without a rating are NaN
user_item_matrix = train_data.pivot_table(
    values='rating',
    index='userId',
    columns='movieId',
)

# Zero-filled copy, used wherever distances between users are computed
user_item_matrix_filled = user_item_matrix.fillna(value=0)


7.2 Implement K-NN

In [None]:
# Brute-force cosine nearest-neighbour index over the users' zero-filled rating rows
knn_model = NearestNeighbors(algorithm='brute', metric='cosine').fit(user_item_matrix_filled)

# Function to predict ratings
def knn_predict(user_id, movie_id, k=5, model=None, matrix=None):
    """Predict a rating as the similarity-weighted mean of neighbours' ratings.

    Parameters
    ----------
    user_id, movie_id
        Labels looked up in the index / columns of ``matrix``.
    k : int
        Maximum number of neighbouring users to use.
    model : fitted NearestNeighbors, optional
        Defaults to the notebook-level ``knn_model``. Pass a different fitted
        model (together with the matrix it was fitted on) to score another
        training set, e.g. a cross-validation fold.
    matrix : DataFrame, optional
        Zero-filled user x movie matrix the model was fitted on. Defaults to
        the notebook-level ``user_item_matrix_filled``.

    Returns
    -------
    float
        The predicted rating, or ``global_mean`` when the user or movie is
        unknown, no neighbour rated the movie, or the neighbours' similarity
        weights do not add up to a positive number.
    """
    if model is None:
        model = knn_model
    if matrix is None:
        matrix = user_item_matrix_filled

    # Unknown movie or unknown user: nothing to base a prediction on
    if movie_id not in matrix.columns or user_id not in matrix.index:
        return global_mean

    # Ask for one extra neighbour because the query user is part of the fitted
    # data; never ask for more rows than the model holds.
    n_query = min(k + 1, matrix.shape[0])
    if n_query < 2:
        return global_mean

    user_pos = matrix.index.get_loc(user_id)
    query = matrix.iloc[user_pos].to_numpy().reshape(1, -1)
    distances, indices = model.kneighbors(query, n_neighbors=n_query)
    distances = distances.ravel()
    indices = indices.ravel()

    # Drop the query user by position instead of assuming it is returned
    # first: a user with an identical rating row ties at distance 0.
    keep = indices != user_pos
    similarities = (1 - distances[keep])[:k]
    neighbor_positions = indices[keep][:k]

    # 0 is the fill value for "not rated", so only positive entries count
    neighbor_ratings = matrix.iloc[neighbor_positions][movie_id].to_numpy()
    rated = neighbor_ratings > 0

    # An empty selection sums to 0, so this also covers "no neighbour rated it"
    weight_sum = similarities[rated].sum()
    if weight_sum <= 0:
        return global_mean

    return float(np.dot(similarities[rated], neighbor_ratings[rated]) / weight_sum)


7.3 Evaluate the K-NN Model

In [None]:
# (user, movie) pairs of the held-out ratings
test_users = test_data['userId'].values
test_movies = test_data['movieId'].values

# One K-NN prediction per held-out pair, in test-set order
knn_predictions = [
    knn_predict(test_user, test_movie, k=5)
    for test_user, test_movie in zip(test_users, test_movies)
]

# Root of the mean squared difference between actual and predicted ratings
knn_mse = mean_squared_error(test_data['rating'], knn_predictions)
knn_rmse = np.sqrt(knn_mse)
print(f'K-NN Model RMSE: {knn_rmse}')


8. SVD Model

8.1 Prepare the Data

In [None]:
# Dense user x movie array of the training ratings, with 0 standing in for "not rated"
R = (
    train_data
    .pivot_table(values='rating', index='userId', columns='movieId')
    .fillna(0)
    .values
)


8.2 Compute SVD

In [None]:
# Mean center the data using each user's observed ratings only.
# R holds 0 for "not rated"; averaging over those zeros would drag every user
# mean toward 0, so the centring would barely change the matrix and the
# factorisation would treat missing entries as very low ratings.
rated_mask = R != 0
ratings_per_row = np.maximum(rated_mask.sum(axis=1), 1)  # avoid 0/0 on an empty row
R_mean = R.sum(axis=1) / ratings_per_row

# Observed entries become deviations from the user's mean; unobserved entries
# stay at 0, i.e. "no deviation from the user's mean".
R_demeaned = np.where(rated_mask, R - R_mean.reshape(-1, 1), 0.0)

# Perform SVD with up to 50 latent factors. svds (default solver) needs
# k < min(R_demeaned.shape), so the count is clamped for small matrices.
n_factors = min(50, min(R_demeaned.shape) - 1)
U, sigma, Vt = svds(R_demeaned, k=n_factors)
sigma = np.diag(sigma)


8.3 Reconstruct the Ratings Matrix

In [None]:
# Low-rank approximation of the centred matrix, then add each user's mean back per row
low_rank = (U @ sigma) @ Vt
R_pred = low_rank + R_mean.reshape(-1, 1)


8.4 Evaluate the SVD Model

In [None]:
# Label the reconstructed array with the user IDs (rows) and movie IDs (columns)
# of the training pivot table so predictions can be looked up by ID
predictions_df = pd.DataFrame(
    data=R_pred,
    index=user_item_matrix.index,
    columns=user_item_matrix.columns,
)

def svd_predict(user_id, movie_id):
    """Return the SVD-reconstructed rating for a (user, movie) pair.

    Falls back to ``global_mean`` when the user or the movie is not present
    in ``predictions_df``. Any other problem (for example ``predictions_df``
    not having been built yet) is allowed to raise instead of being silently
    turned into a baseline prediction.
    """
    if user_id not in predictions_df.index or movie_id not in predictions_df.columns:
        return global_mean
    return predictions_df.loc[user_id, movie_id]

# One SVD prediction per held-out (user, movie) pair, in test-set order
svd_predictions = [
    svd_predict(test_user, test_movie)
    for test_user, test_movie in zip(test_users, test_movies)
]

# Root of the mean squared difference between actual and predicted ratings
svd_mse = mean_squared_error(test_data['rating'], svd_predictions)
svd_rmse = np.sqrt(svd_mse)
print(f'SVD Model RMSE: {svd_rmse}')


9. Cross-Validation and Hypothesis Testing

9.1 Cross-Validation

We'll perform 5-fold cross-validation on both models.

K-NN Cross-Validation

In [None]:
def _knn_fold_rmse(train_cv, test_cv, k=5):
    """Fit user-based K-NN on one training fold and return its RMSE on the test fold.

    The model, the user-item matrix and the fallback mean all come from
    ``train_cv`` only, so no rating from ``test_cv`` influences the predictions.

    Parameters
    ----------
    train_cv, test_cv : DataFrame
        Rating rows with 'userId', 'movieId' and 'rating' columns.
    k : int
        Maximum number of neighbouring users per prediction.
    """
    user_item_matrix_filled_cv = train_cv.pivot_table(
        index='userId', columns='movieId', values='rating').fillna(0)
    features = user_item_matrix_filled_cv.to_numpy()
    fold_mean = train_cv['rating'].mean()

    knn_model_cv = NearestNeighbors(metric='cosine', algorithm='brute')
    knn_model_cv.fit(features)

    # Neighbours of every training user, computed once per fold. One extra
    # neighbour is requested because each user is part of the fitted data;
    # the count is capped at the number of users in the fold.
    n_query = min(k + 1, features.shape[0])
    distances, indices = knn_model_cv.kneighbors(features, n_neighbors=n_query)

    user_pos = {user: pos for pos, user in enumerate(user_item_matrix_filled_cv.index)}
    movie_pos = {movie: pos for pos, movie in enumerate(user_item_matrix_filled_cv.columns)}

    predictions = []
    for user, movie in zip(test_cv['userId'].values, test_cv['movieId'].values):
        row = user_pos.get(user)
        col = movie_pos.get(movie)
        if row is None or col is None:
            # User or movie absent from this training fold
            predictions.append(fold_mean)
            continue

        # Remove the user itself by position (ties at distance 0 mean it is
        # not guaranteed to be listed first), then keep at most k neighbours.
        keep = indices[row] != row
        neighbor_rows = indices[row][keep][:k]
        similarities = (1 - distances[row][keep])[:k]

        # 0 is the fill value for "not rated"
        neighbor_ratings = features[neighbor_rows, col]
        rated = neighbor_ratings > 0

        # An empty selection sums to 0, so this also covers "nobody rated it"
        weight_sum = similarities[rated].sum()
        if weight_sum <= 0:
            predictions.append(fold_mean)
        else:
            predictions.append(
                np.dot(similarities[rated], neighbor_ratings[rated]) / weight_sum)

    return np.sqrt(mean_squared_error(test_cv['rating'], predictions))


kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One RMSE per fold, each from a model fitted on that fold's training rows
knn_rmse_scores = [
    _knn_fold_rmse(ratings.iloc[train_indices], ratings.iloc[test_indices], k=5)
    for train_indices, test_indices in kf.split(ratings)
]

print(f'K-NN Cross-Validation RMSE: {np.mean(knn_rmse_scores)} ± {np.std(knn_rmse_scores)}')


SVD Cross-Validation

In [None]:
def _svd_fold_rmse(train_cv, test_cv, n_factors=50):
    """Fit the SVD model on one training fold and return its RMSE on the test fold.

    The rating matrix, its row/column labels and the fallback mean all come
    from ``train_cv`` only.

    Parameters
    ----------
    train_cv, test_cv : DataFrame
        Rating rows with 'userId', 'movieId' and 'rating' columns.
    n_factors : int
        Requested number of latent factors; reduced when the fold's matrix is
        too small for that many.
    """
    matrix_cv = train_cv.pivot_table(index='userId', columns='movieId', values='rating')
    fold_mean = train_cv['rating'].mean()

    rated_mask = matrix_cv.notna().to_numpy()
    R_cv = matrix_cv.fillna(0).to_numpy()

    # Centre each user on the mean of the ratings they actually gave; the
    # zero-filled gaps are excluded from the mean and left at 0 afterwards
    # ("no deviation from the user's mean").
    R_mean_cv = R_cv.sum(axis=1) / np.maximum(rated_mask.sum(axis=1), 1)
    R_demeaned_cv = np.where(rated_mask, R_cv - R_mean_cv.reshape(-1, 1), 0.0)

    # svds (default solver) needs k < min(matrix shape)
    k = min(n_factors, min(R_demeaned_cv.shape) - 1)
    if k >= 1:
        U_cv, sigma_cv, Vt_cv = svds(R_demeaned_cv, k=k)
        low_rank = np.dot(np.dot(U_cv, np.diag(sigma_cv)), Vt_cv)
    else:
        # Matrix too small to factorise: predict each user's mean
        low_rank = np.zeros_like(R_demeaned_cv)
    R_pred_cv = low_rank + R_mean_cv.reshape(-1, 1)

    # Positions of this fold's own users and movies in R_pred_cv
    user_pos = {user: pos for pos, user in enumerate(matrix_cv.index)}
    movie_pos = {movie: pos for pos, movie in enumerate(matrix_cv.columns)}

    svd_predictions_cv = []
    for user, movie in zip(test_cv['userId'].values, test_cv['movieId'].values):
        row = user_pos.get(user)
        col = movie_pos.get(movie)
        if row is None or col is None:
            # User or movie absent from this training fold
            svd_predictions_cv.append(fold_mean)
        else:
            svd_predictions_cv.append(R_pred_cv[row, col])

    return np.sqrt(mean_squared_error(test_cv['rating'], svd_predictions_cv))


# One RMSE per fold; `kf` is the KFold splitter created in the K-NN
# cross-validation cell, so both models are scored on the same folds.
svd_rmse_scores = [
    _svd_fold_rmse(ratings.iloc[train_indices], ratings.iloc[test_indices])
    for train_indices, test_indices in kf.split(ratings)
]

print(f'SVD Cross-Validation RMSE: {np.mean(svd_rmse_scores)} ± {np.std(svd_rmse_scores)}')


9.2 Hypothesis Testing
We will perform a paired t-test to see if the advanced models significantly outperform the baseline.

In [None]:
from scipy.stats import ttest_rel

actual_ratings = test_data['rating']

# Baseline predictions on test set
baseline_predictions = np.full(len(test_data), global_mean)

# Per-rating squared errors; the three series are paired row by row because
# every prediction list was built in test-set order.
baseline_errors = (actual_ratings - baseline_predictions) ** 2
knn_errors = (actual_ratings - np.asarray(knn_predictions)) ** 2
svd_errors = (actual_ratings - np.asarray(svd_predictions)) ** 2

# The question asked is one-sided ("is the model's error lower than the
# baseline's?"), so test whether mean(baseline error - model error) > 0
# instead of using the default two-sided alternative.

# Paired t-test between baseline and K-NN
t_stat_knn, p_value_knn = ttest_rel(baseline_errors, knn_errors, alternative='greater')
print(f'K-NN vs Baseline t-statistic: {t_stat_knn}, p-value: {p_value_knn}')

# Paired t-test between baseline and SVD
t_stat_svd, p_value_svd = ttest_rel(baseline_errors, svd_errors, alternative='greater')
print(f'SVD vs Baseline t-statistic: {t_stat_svd}, p-value: {p_value_svd}')


Interpretation:

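Null Hypothesis: The model's mean squared error on the test set is the same as the baseline's.
Alternative Hypothesis: The model's mean squared error is lower than the baseline's. The test is run on paired per-rating squared errors (baseline minus model), so a positive t-statistic favours the model.
If the p-value is less than 0.05 and the t-statistic is positive, we reject the null hypothesis in favour of the model.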

10. Hybrid Recommendation System
When user data is limited, we'll recommend:

3 most popular movies
2 personalized recommendations

10.1 Most Popular Movies

In [None]:
# Get the top 3 most rated movies, most-rated first
popular_movies = (
    ratings.groupby('movieId').size().sort_values(ascending=False).head(3).index.tolist()
)

# Map movie IDs to titles one ID at a time so the titles keep the popularity
# order (an isin() filter would return them in movies.csv order). IDs with no
# row in `movies` are skipped.
title_by_id = movies.drop_duplicates(subset='movieId').set_index('movieId')['title']
popular_movie_titles = [
    title_by_id.loc[movie_id] for movie_id in popular_movies if movie_id in title_by_id.index
]

print('Top 3 Popular Movies:')
for title in popular_movie_titles:
    print(title)


10.2 Personalized Recommendations
The example below assumes a new user who has no ratings in the training data; such a user receives the popular-movie list.

In [None]:
def hybrid_recommendations(user_id, k=5):
    """Recommend movie titles for a user.

    Users that do not appear in ``user_item_matrix`` get the popular-movie
    titles. Known users get up to ``k`` titles ranked by their SVD-predicted
    rating, best first, skipping movies they already rated in the training
    data and movies that have no title in ``movies``.

    NOTE(review): the section text describes a blend of 3 popular and 2
    personalized titles for users with limited data; this function does not
    build that blend.

    Parameters
    ----------
    user_id
        User label as used in ``user_item_matrix`` / ``predictions_df``.
    k : int
        Maximum number of personalized titles to return.

    Returns
    -------
    list
        Movie titles.
    """
    # If user is new, recommend popular movies (a copy, so callers cannot
    # modify the shared list)
    if user_id not in user_item_matrix.index:
        return list(popular_movie_titles)

    if k <= 0:
        return []

    # Movies this user rated in the training data are not recommended again;
    # user_item_matrix holds NaN for every unrated movie.
    already_rated = user_item_matrix.loc[user_id].dropna().index
    user_predictions = (
        predictions_df.loc[user_id]
        .drop(index=already_rated, errors='ignore')
        .sort_values(ascending=False)
    )

    # Walk down the ranking so the titles keep their predicted-rating order
    # and missing titles do not shorten the list.
    title_by_id = movies.drop_duplicates(subset='movieId').set_index('movieId')['title']
    recommended_movies = []
    for movie_id in user_predictions.index:
        if movie_id in title_by_id.index:
            recommended_movies.append(title_by_id.loc[movie_id])
            if len(recommended_movies) == k:
                break
    return recommended_movies

# Demo: request recommendations for a user ID chosen to represent a new user
new_user_id = 9999
recommendations = hybrid_recommendations(new_user_id)

print('Recommendations for New User:')
for recommended_title in recommendations:
    print(recommended_title)


11. Analysis
Compare the performance of the models.

In [None]:
# Side-by-side RMSE summary of the three models
rmse_report = [
    f'Baseline RMSE: {baseline_rmse}',
    f'K-NN Model RMSE: {knn_rmse}',
    f'SVD Model RMSE: {svd_rmse}',
]
for report_line in rmse_report:
    print(report_line)


Observation: Check which model has the lowest RMSE.
Conclusion: Determine if the advanced models significantly outperform the baseline.

12. Conclusion
We have:

Implemented a baseline model and two advanced models (K-NN and SVD).
Evaluated their performance using RMSE.
Performed hypothesis testing to validate our models.
Built a hybrid recommendation system for users with limited data.
