# Collaborative Filtering Project
## Intro to Machine Learning
### Thomas Cazort
---

## Setup:

In [215]:
from collections import defaultdict
from scipy.stats import pearsonr
import numpy as np
import math
import statistics
from sklearn.metrics import mean_squared_error
import pandas as pd

Store similarities in dictionary SIM (This is like a sparse-matrix where we only store non-zero values)

Store ratings in ITM:

In [216]:
SIM = defaultdict(dict)
ITM = defaultdict(dict)
userMovies = defaultdict(list)

ITM[*m*][*u*] stores rating score for movie *m* and user *u*

SIM[*m1*][*m2*] stores similarity score between movie *m* and *m1*

userMovies[*u*] lists all movies reviewed by user *u*

In [228]:
ifile = open("netflix-small/ratings-train.txt")
for l in ifile:
    parts = l.strip().split(",")
    ITM[int(parts[0])][int(parts[1])] = float(parts[2])
    userMovies[int(parts[1])] += [int(parts[0])]
ifile.close()

### Similarity Computation:

Compute similarity between *i* and *j* and store this value in SIM[*i*][*j*]

I will be using the correlation-coefficient formula described in class.

In [219]:
for i in ITM.keys():
    for j in ITM.keys():
        if i==j:
            continue
        # riBar and rjBar:
        riBar = statistics.mean(list(ITM[i].values()))
        rjBar = statistics.mean(list(ITM[j].values()))
        # SUM u e U:
        numer = denomP1 = denomP2 = 0
        for ui in ITM[i].keys():
            for uj in ITM[j].keys():
                if ui != uj:
                    continue
                rui, ruj = ITM[i][ui], ITM[j][uj]
                # Compute the Numerator of the Equation:
                numer += (rui - riBar) * (ruj - rjBar)
                # First part of Denominator:
                denomP1 += (rui - riBar)**2
                # Second part:
                denomP2 += (ruj - rjBar)**2
        # Compute simmilarity:
        denom = math.sqrt(denomP1) * math.sqrt(denomP2)
        
        wij = round((numer / denom), 4) if denom != 0 else 0
        # Add to SIM:
        SIM[i][j] = wij
SIMdf = pd.DataFrame.from_dict(SIM)
SIMdf
ITMdf = pd.DataFrame.from_dict(ITM)
ITMdf
m = SIMdf.columns[ITMdf.isin([788]).any()]
# df.columns[df.isin(['APPLE']).any()]
# SIMdf

## Testing:

### K Neighbors:

Find K Neighbors for movie from the weights store in SIM:

In [252]:
KNN5 = {}
testMovies = defaultdict(dict)
ifile = open("netflix-small/ratings-test.txt")
for l in ifile:
    parts = l.strip().split(",")
    movie = int(parts[0])
    user = int(parts[1])
    truerating = float(parts[2])
    testMovies[movie].update({user: truerating})
    KNN5[movie] = sorted(SIM[movie], key=SIM[movie].get, reverse=True)[:5]
ifile.close()
testMovies[33]
# SIM
# userMovies

{1864: 4.0,
 1172: 4.0,
 1756: 5.0,
 1249: 4.0,
 450: 5.0,
 2246: 5.0,
 86: 3.0,
 2271: 3.0,
 63: 4.0,
 1901: 5.0,
 1939: 4.0,
 1004: 3.0,
 1552: 4.0,
 240: 3.0,
 1753: 4.0,
 512: 5.0,
 1844: 4.0,
 1731: 5.0,
 860: 4.0,
 2237: 4.0,
 116: 5.0}

### Prediction:

Predict the rating by user using user's ratings for the K neighbors:

In [244]:
predRatings = defaultdict(dict)
numer = denom = 0
for i in testMovies.keys():
    tempMovies = {}
    for a in testMovies[i].keys():
        # Intersect movies that user a has seen with movies similar
        keep = [value for value in list(SIM[i].keys()) if value in userMovies[a]]
        SIMa = {k: SIM[k] for k in keep}
        print(SIMa.keys())
        print(i)
        if i not in SIMa.keys():
            continue
        KNN5[i] = sorted(SIMa[i], key=SIMa[i].get, reverse=True)[:5]
        print(i, KNN5[i])
        for j in KNN5[i]:
            raj = ITM[j][a]
            numer += raj + SIM[i][j]
            denom += abs(SIM[i][j])
        pai = round((numer/denom), 4) if denom != 0 else 0
        predRatings[i].update({a: pai})
# predRatings

dict_keys([9, 11, 1, 8, 5, 7, 23, 31, 34, 21, 2, 14, 17, 3, 24, 6, 27, 29, 15, 10, 19, 30, 0, 16])
25
dict_keys([])
25
dict_keys([])
25
dict_keys([])
25
dict_keys([])
25
dict_keys([33, 18, 12, 9, 11, 8, 5, 7, 32, 23, 31, 26, 34, 21, 13, 2, 3, 4, 6, 27, 29, 22, 15, 10, 19, 30, 28, 16])
25
dict_keys([])
33
dict_keys([26, 13, 10, 0])
33
dict_keys([32, 30])
33
dict_keys([])
33
dict_keys([])
33
dict_keys([])
33
dict_keys([13, 17])
33
dict_keys([])
33
dict_keys([])
33
dict_keys([])
33
dict_keys([])
33
dict_keys([26, 17, 27, 22, 10, 19, 28])
33
dict_keys([])
33
dict_keys([])
33
dict_keys([])
33
dict_keys([])
33
dict_keys([])
33
dict_keys([])
33
dict_keys([])
33
dict_keys([])
33
dict_keys([])
33
dict_keys([])
18
dict_keys([])
18
dict_keys([])
18
dict_keys([])
18
dict_keys([])
18
dict_keys([9, 11, 1, 8, 5, 7, 23, 31, 34, 21, 2, 14, 17, 3, 24, 6, 27, 29, 15, 10, 19, 30, 0, 16])
18
dict_keys([])
18
dict_keys([])
18
dict_keys([])
18
dict_keys([])
18
dict_keys([])
18
dict_keys([8, 15])
18
dict_keys

### MSE:

Compute the Mean-Squared Error between the true and predicted ratings:

In [177]:
mse = []
for i in predRatings.keys():
    y_true, y_pred = list(testMovies[i].values()), list(predRatings[i].values())
    mse += [mean_squared_error(y_true, y_pred)]
sum(mse) / len(mse)

2.384974692636075