# ML Project 3- Suraj Jayakumar

# 1) Collaborative Filtering on netflix dataset

## Step1: Import training ratings and create the correlation matrix

In [None]:
import numpy as np
import pandas as pd
df = pd.read_csv("TrainingRatings.txt", names=['movieID', 'userID', 'rating'])
df_filtered = df[df['rating'] != 0]
user_counts = df_filtered.groupby('userID').size().values
user_rating_sums = df.groupby('userID')['rating'].sum().values
average_ratings = np.divide(user_rating_sums, user_counts, out=np.zeros_like(user_rating_sums), where=user_counts!=0)

# Create a user-movie rating matrix
user_movie_matrix = df.pivot(index='userID', columns='movieID', values='rating').fillna(0).values

# Center the ratings by subtracting the average rating for each user
centered_ratings = user_movie_matrix - average_ratings[:, np.newaxis]
centered_ratings[user_movie_matrix==0]=0

#computing the denominator
squared_sum = np.einsum('ij,ij->i', centered_ratings, centered_ratings)
rooted_squared_sum = np.sqrt(squared_sum)
denominator=rooted_squared_sum*rooted_squared_sum.T
#computing the numerator
numerator = np.einsum('ij,kj->ik', centered_ratings, centered_ratings)
with np.errstate(divide='ignore', invalid='ignore'):
        similarity_matrix=numerator/denominator
        similarity_matrix[denominator == 0] = 0  # Set undefined correlations to 0

# Set diagonal to 0
np.fill_diagonal(similarity_matrix, 0)

## Step 2: create a prediction matrix using the user's average rating and summing it with the weighted ratings of other users for each movie

In [3]:
#removing undefined values from the correlation matrix
similarity_matrix=np.nan_to_num(similarity_matrix)
centered_ratings=np.nan_to_num(centered_ratings)

In [25]:
prediction_matrix = average_ratings[:, np.newaxis] + 1e-5*np.dot(similarity_matrix, centered_ratings) 
# 10^-5 multiplied for normalization: obtained empirically in this case via hyperparameter tuning

## Step 3: Find mean absolute error and root mean squared error by testing the predictions on user and movie combinations in the testing dataset 

In [27]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
df_test = pd.read_csv("TestingRatings.txt", names=['movieID', 'userID', 'rating'])
user_id_to_index = {user_id: index for index, user_id in enumerate(df.groupby('userID').size().index)}
# Get predictions for test ratings
predictions = []
actual_ratings = []
counter=0;

for _, row in df_test.iterrows():
    movie_id = row['movieID']
    user_id = row['userID']
    if user_id in user_id_to_index:
      user_index = user_id_to_index[user_id]
      movie_index = np.where(df['movieID'].unique() == movie_id)[0]
      if movie_index.size > 0:
          prediction = prediction_matrix[user_index, movie_index[0]]
          predictions.append(prediction)
          actual_ratings.append(row['rating'])



# Evaluate the predictions
mae=mean_absolute_error(actual_ratings, predictions)
mse = mean_squared_error(actual_ratings, predictions)
rmse = np.sqrt(mse)

print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)

Mean Absolute Error (MAE): 0.7883677257380779
Root Mean Squared Error (RMSE): 0.9856831548353928


# 2) Scikit-learn's SVM classifier on the MNIST Dataset

In [1]:
import numpy as np
from sklearn.datasets import fetch_openml
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
X = X / 255.
# rescale the data, use the traditional train/test split
# (60K: Train) and (10K: Test)
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]
#tuning hyperparameters which are kernel function and penalty

## 2.1) The regularization parameter C and the kernel type whether linear, gaussian or polynomial is varied. 
## 2.2) In the case of gaussian kernel we vary the gamma parameter which affects the curvature of the decision boundary

In [None]:
from sklearn.svm import SVC


for kernel in ['linear', 'rbf','poly']:
  for c in [0.1, 1, 10, 100]:
    #get error rate for every combination of parameters
    if kernel=='rbf' and c==1:
        for gamma in [0.01,1,100]:
            svc = SVC(kernel=kernel, C=c, gamma=gamma)
            svc.fit(X_train, y_train)
            y_pred = svc.predict(X_test)
            error_rate = np.mean(y_pred != y_test)
            print("Error rate for kernel {}, C {} and gamma {}: {}".format(kernel, c, gamma,error_rate))
        continue
    svc = SVC(kernel=kernel, C=c)
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    error_rate = np.mean(y_pred != y_test)
    print("Error rate for kernel {} and C {}: {}".format(kernel, c, error_rate))

Error rate for kernel linear and C 0.1: 0.0528
Error rate for kernel linear and C 1: 0.0596
Error rate for kernel linear and C 10: 0.069
Error rate for kernel linear and C 100: 0.0742
Error rate for kernel rbf and C 0.1: 0.0405
Error rate for kernel rbf, C 1 and gamma 0.01: 0.0231


# 3) Scikit-learn's KNN classifier on the MNIST Dataset

## 3.1) The number of neighbours considered is varied- 3, 10, 20.  
## 3.2) different distance metrics are used for calculating weights.
## 3.3) Algorithm used is varied whether it be brute force, Ball Tree, or KD- Tree
## 3.4) Extreme cases of n=1 and n=50

In [2]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
parameters = [
    #varying number of nearest neighbours taken into account with uniform weights
    {'n_neighbors': 3, 'weights': 'uniform'},
    {'n_neighbors': 10, 'weights': 'uniform'},
    {'n_neighbors': 20, 'weights': 'uniform'},

    #varying distance metric
    {'n_neighbors': 3, 'weights': 'distance', 'metric': 'cityblock'},
    {'n_neighbors': 3, 'weights': 'distance', 'metric': 'euclidean'},
    {'n_neighbors': 3, 'weights': 'distance', 'metric': 'cosine'},
    
    #varying algorithm used
    {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'brute' },
    {'n_neighbors': 3, 'weights': 'distance', 'algorithm': 'ball_tree'},

    #trying extremely high and low nearest neighbour values
    {'n_neighbors': 1, 'weights': 'uniform'},
    {'n_neighbors': 50, 'weights': 'uniform'},
    
    
]


for param in parameters:
  knn = KNeighborsClassifier(**param)
  knn.fit(X_train, y_train)
  y_pred = knn.predict(X_test)
  error_rate = np.mean(y_pred != y_test)
  print(f"Error rate for parameters {param}: {error_rate}")

Error rate for parameters {'n_neighbors': 1, 'weights': 'uniform'}: 0.0309
Error rate for parameters {'n_neighbors': 50, 'weights': 'uniform'}: 0.0466
