# Collaborative Filtering Project
## Intro to Machine Learning
### Thomas Cazort
---

## Setup:

In [58]:
from collections import defaultdict
from scipy.stats import pearsonr
import numpy as np
import math
import statistics

Store similarities in dictionary SIM (This is like a sparse-matrix where we only store non-zero values)

Store ratings in ITM:

In [59]:
SIM = defaultdict(dict)
ITM = defaultdict(dict)

ITM[*m*][*u*] stores rating score for movie *m* and user *u*

SIM[*m1*][*m2*] stores similarity score between movie *m* and *m1*

In [60]:
ifile = open("netflix-small/ratings-train.txt")
for l in ifile:
    parts = l.strip().split(",")
    ITM[int(parts[0])][int(parts[1])] = float(parts[2])
ifile.close()

### Similarity Computation:

Compute similarity between *i* and *j* and store this value in SIM[*i*][*j*]

I will be using the correlation-coefficient formula described in class.

In [71]:
for i in ITM.keys():
    for j in ITM.keys():
        if i==j:
            continue
        # riBar and rjBar:
        riBar = statistics.mean(list(ITM[i].values()))
        rjBar = statistics.mean(list(ITM[j].values()))
        # SUM u e U:
        numer = denomP1 = denomP2 = 0
        for ui in ITM[i].keys():
            for uj in ITM[j].keys():
                if ui != uj:
                    continue
                rui, ruj = ITM[i][ui], ITM[j][uj]
                # Compute the Numerator of the Equation:
                numer += (rui - riBar) * (ruj - rjBar)
                # First part of Denominator:
                denomP1 += (rui - riBar)**2
                # Second part:
                denomP2 += (ruj - rjBar)**2
        # Compute simmilarity:
        denom = math.sqrt(denomP1) * math.sqrt(denomP2)
        
        wij = round((numer / denom), 4) if denom != 0 else 0
        # Add to SIM:
        SIM[i][j] = wij

## Testing:

### K Neighbors:

Find K Neighbors for movie from the weights store in SIM:

In [92]:
KNN5 = {}
testMovies = defaultdict(dict)
ifile = open("netflix-small/ratings-test.txt")
for l in ifile:
    parts = l.strip().split(",")
    movie = int(parts[0])
    user = int(parts[1])
    truerating = float(parts[2])
    testMovies[movie].update({user: truerating})
    KNN5[movie] = sorted(SIM[movie], key=SIM[movie].get, reverse=True)[:5]
ifile.close()

### Prediction:

Predict the rating by user using user's ratings for the K neighbors:

In [88]:
predRatings = defaultdict(dict)
numer = denom = 0
for i in KNN5.keys():
    for a in testMovies[i].keys():
        for j in KNN5[i]:
            print(j)
            raj = ITM[j][a]
            numer += raj + SIM[i][j]
            denom += abs(SIM[i][j])
        pai = round((numer/denom), 4) if denom != 0 else 0
        predRatings[i].update({a: pai})
predRatings    

9
7
31
29
10
9


KeyError: 395

### MSE:

Compute the Mean-Squared Error between the true and predicted ratings:

In [90]:
ITM[9]

{1511: 2.0,
 2146: 3.0,
 1235: 3.0,
 2160: 1.0,
 10: 1.0,
 982: 2.0,
 570: 4.0,
 2043: 1.0,
 176: 2.0,
 2569: 1.0,
 1139: 2.0,
 1819: 2.0,
 851: 3.0,
 2486: 1.0,
 1466: 1.0,
 1473: 1.0,
 2530: 2.0,
 1091: 1.0,
 1084: 1.0,
 1400: 2.0,
 1631: 2.0,
 2536: 1.0,
 2351: 1.0,
 1981: 1.0,
 615: 2.0,
 1888: 2.0,
 2223: 1.0,
 1547: 3.0,
 630: 2.0,
 1592: 2.0,
 345: 2.0,
 1198: 2.0,
 584: 1.0,
 1263: 1.0,
 989: 2.0,
 2031: 1.0,
 328: 2.0,
 762: 2.0,
 956: 2.0,
 1907: 1.0,
 1123: 2.0,
 2495: 3.0,
 2353: 1.0,
 1758: 5.0,
 1377: 4.0,
 1386: 1.0,
 150: 3.0,
 1252: 2.0,
 2457: 1.0,
 252: 1.0,
 640: 1.0,
 674: 1.0,
 1893: 1.0,
 2564: 1.0,
 1603: 1.0,
 1710: 1.0,
 743: 2.0,
 1488: 1.0,
 123: 3.0,
 2507: 3.0,
 2492: 2.0,
 2082: 1.0,
 2035: 1.0,
 1191: 2.0,
 1141: 2.0,
 1082: 2.0,
 2: 2.0,
 837: 1.0,
 2505: 2.0,
 1340: 1.0}