# Collaborative Filtering Project
## Intro to Machine Learning
### Thomas Cazort
---

## Setup:

In [157]:
from collections import defaultdict
from scipy.stats import pearsonr
import numpy as np
import math
import statistics
from sklearn.metrics import mean_squared_error

Store similarities in dictionary SIM (This is like a sparse-matrix where we only store non-zero values)

Store ratings in ITM:

In [145]:
SIM = defaultdict(dict)
ITM = defaultdict(dict)

ITM[*m*][*u*] stores rating score for movie *m* and user *u*

SIM[*m1*][*m2*] stores similarity score between movie *m* and *m1*

In [146]:
ifile = open("netflix-small/ratings-train.txt")
for l in ifile:
    parts = l.strip().split(",")
    ITM[int(parts[0])][int(parts[1])] = float(parts[2])
ifile.close()

### Similarity Computation:

Compute similarity between *i* and *j* and store this value in SIM[*i*][*j*]

I will be using the correlation-coefficient formula described in class.

In [147]:
for i in ITM.keys():
    for j in ITM.keys():
        if i==j:
            continue
        # riBar and rjBar:
        riBar = statistics.mean(list(ITM[i].values()))
        rjBar = statistics.mean(list(ITM[j].values()))
        # SUM u e U:
        numer = denomP1 = denomP2 = 0
        for ui in ITM[i].keys():
            for uj in ITM[j].keys():
                if ui != uj:
                    continue
                rui, ruj = ITM[i][ui], ITM[j][uj]
                # Compute the Numerator of the Equation:
                numer += (rui - riBar) * (ruj - rjBar)
                # First part of Denominator:
                denomP1 += (rui - riBar)**2
                # Second part:
                denomP2 += (ruj - rjBar)**2
        # Compute simmilarity:
        denom = math.sqrt(denomP1) * math.sqrt(denomP2)
        
        wij = round((numer / denom), 4) if denom != 0 else 0
        # Add to SIM:
        SIM[i][j] = wij

## Testing:

### K Neighbors:

Find K Neighbors for movie from the weights store in SIM:

In [155]:
KNN5 = {}
testMovies = defaultdict(dict)
ifile = open("netflix-small/ratings-test.txt")
for l in ifile:
    parts = l.strip().split(",")
    movie = int(parts[0])
    user = int(parts[1])
    truerating = float(parts[2])
    testMovies[movie].update({user: truerating})
    KNN5[movie] = sorted(SIM[movie], key=SIM[movie].get, reverse=True)[:5]
ifile.close()


### Prediction:

Predict the rating by user using user's ratings for the K neighbors:

In [171]:
predRatings = defaultdict(dict)
numer = denom = 0
for i in testMovies.keys():
    for a in testMovies[i].keys():
        for j in KNN5[i]:
            if a not in ITM[j]:
                continue
            raj = ITM[j][a]
            numer += raj + SIM[i][j]
            denom += abs(SIM[i][j])
        pai = round((numer/denom), 4) if denom != 0 else 0
        predRatings[i].update({a: pai})
# predRatings
# testMovies

### MSE:

Compute the Mean-Squared Error between the true and predicted ratings:

In [173]:
mse = []
for i in predRatings.keys():
    y_true, y_pred = list(testMovies[i].values()), list(predRatings[i].values())
    mse += [mean_squared_error(y_true, y_pred)]
sum(mse) / len(mse)

2.384974692636075

In [174]:
KNN10 = {}
testMovies = defaultdict(dict)
ifile = open("netflix-small/ratings-test.txt")
for l in ifile:
    parts = l.strip().split(",")
    movie = int(parts[0])
    user = int(parts[1])
    truerating = float(parts[2])
    testMovies[movie].update({user: truerating})
    KNN10[movie] = sorted(SIM[movie], key=SIM[movie].get, reverse=True)[:10]
ifile.close()

predRatings = defaultdict(dict)
numer = denom = 0
for i in testMovies.keys():
    for a in testMovies[i].keys():
        for j in KNN10[i]:
            if a not in ITM[j]:
                continue
            raj = ITM[j][a]
            numer += raj + SIM[i][j]
            denom += abs(SIM[i][j])
        pai = round((numer/denom), 4) if denom != 0 else 0
        predRatings[i].update({a: pai})
        
mse = []
for i in predRatings.keys():
    y_true, y_pred = list(testMovies[i].values()), list(predRatings[i].values())
    mse += [mean_squared_error(y_true, y_pred)]
sum(mse) / len(mse)

2.7330447871764125

In [None]:
KNN15 = {}
testMovies = defaultdict(dict)
ifile = open("netflix-small/ratings-test.txt")
for l in ifile:
    parts = l.strip().split(",")
    movie = int(parts[0])
    user = int(parts[1])
    truerating = float(parts[2])
    testMovies[movie].update({user: truerating})
    KNN15[movie] = sorted(SIM[movie], key=SIM[movie].get, reverse=True)[:10]
ifile.close()

predRatings = defaultdict(dict)
numer = denom = 0
for i in testMovies.keys():
    for a in testMovies[i].keys():
        for j in KNN10[i]:
            if a not in ITM[j]:
                continue
            raj = ITM[j][a]
            numer += raj + SIM[i][j]
            denom += abs(SIM[i][j])
        pai = round((numer/denom), 4) if denom != 0 else 0
        predRatings[i].update({a: pai})
        
mse = []
for i in predRatings.keys():
    y_true, y_pred = list(testMovies[i].values()), list(predRatings[i].values())
    mse += [mean_squared_error(y_true, y_pred)]
sum(mse) / len(mse)