# Collaborative Filtering Project
## Intro to Machine Learning
### Thomas Cazort
---

## Setup:

In [190]:
from collections import defaultdict
from scipy.stats import pearsonr
import numpy as np
import math
import statistics
from sklearn.metrics import mean_squared_error

Store similarities in dictionary SIM (This is like a sparse-matrix where we only store non-zero values)

Store ratings in ITM:

In [191]:
# Nested-dictionary stores shared by every later cell. Missing keys are
# created on first access, so no explicit initialisation per movie/user is needed.

# ITM[movie][user] -> rating given to `movie` by `user`
ITM = defaultdict(dict)

# SIM[movie_a][movie_b] -> similarity weight between the two movies
SIM = defaultdict(dict)

# userMovies[user] -> list of the movies that `user` has rated
userMovies = defaultdict(list)

ITM[*m*][*u*] stores rating score for movie *m* and user *u*

SIM[*m1*][*m2*] stores the similarity score between movies *m1* and *m2*

userMovies[*u*] lists all movies reviewed by user *u*



In [192]:
# Load the training ratings. Each line of the file is "movie,user,rating".
# The `with` block guarantees the file is closed even if a line fails to parse.
with open("exampledataset/data.txt") as ifile:
    for line in ifile:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            continue
        parts = line.split(",")
        movie, user, rating = int(parts[0]), int(parts[1]), float(parts[2])
        # Index the rating by movie, and remember that this user rated the movie.
        ITM[movie][user] = rating
        userMovies[user].append(movie)

### Similarity Computation:

Compute similarity between *i* and *j* and store this value in SIM[*i*][*j*]

I will be using the correlation-coefficient formula described in class.

In [193]:
# Mean rating of every movie (over all users who rated it), computed once up
# front instead of once for every (i, j) pair.
meanRating = {m: statistics.mean(list(ITM[m].values())) for m in ITM}

for i in ITM:
    ratingsI = ITM[i]
    riBar = meanRating[i]
    for j in ITM:
        if i == j:
            # A movie is never stored as its own neighbour.
            continue
        ratingsJ = ITM[j]
        rjBar = meanRating[j]

        # Sum over the users who rated BOTH movies. A dict membership test
        # finds the co-raters directly, replacing a nested scan over both
        # user sets; users are visited in the same order as before, so the
        # floating-point sums are unchanged.
        numer = denomP1 = denomP2 = 0
        for u, rui in ratingsI.items():
            if u not in ratingsJ:
                continue
            ruj = ratingsJ[u]
            # Numerator: co-deviation of the two ratings from their movie means.
            numer += (rui - riBar) * (ruj - rjBar)
            # Denominator parts: squared deviation for movie i and movie j.
            denomP1 += (rui - riBar)**2
            denomP2 += (ruj - rjBar)**2

        # Compute similarity; fall back to 0 when there are no co-raters or
        # when every co-rater's rating equals the movie mean (zero denominator).
        denom = math.sqrt(denomP1) * math.sqrt(denomP2)
        SIM[i][j] = round((numer / denom), 2) if denom != 0 else 0

## Testing:

### K Neighbors:

Load the test ratings into testMovies. The K neighbors of each test movie are then selected from the weights stored in SIM (in the prediction step below):

In [194]:
# Load the ratings to be predicted. Each line is "movie,user,true rating".
# testMovies[movie][user] -> true rating, used later to score the predictions.
testMovies = defaultdict(dict)
# The `with` block guarantees the file is closed even if a line fails to parse.
with open("exampledataset/predictions.txt") as ifile:
    for line in ifile:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            continue
        parts = line.split(",")
        movie, user, truerating = int(parts[0]), int(parts[1]), float(parts[2])
        testMovies[movie][user] = truerating

### Prediction:

Predict each user's rating of a test movie from that user's own ratings of the movie's K most similar neighbors:

In [197]:
# Maximum number of neighbour movies used for one prediction.
K = 30
# Predictions are clamped into this range.
MIN_RATING, MAX_RATING = 0, 5

# predRatings[movie][user] -> predicted rating
predRatings = defaultdict(dict)

# Movie i to be predicted
for i in testMovies:
    # .get avoids inserting an empty entry into SIM for a movie with no
    # similarity data; such a movie simply ends up with no neighbours.
    weights = SIM.get(i, {})
    # Candidate neighbours, most similar first. This ranking does not depend
    # on the user, so it is computed once per movie.
    ranked = sorted(weights, key=weights.get, reverse=True)

    # User a
    for a in testMovies[i]:
        # Movies this user rated in the training data. A set gives constant-time
        # membership tests, and .get avoids adding unknown users to userMovies.
        ratedByA = set(userMovies.get(a, ()))
        # The K most similar movies that user a has actually rated.
        neighbors = [m for m in ranked if m in ratedByA][:K]

        # Similarity-weighted average of the user's ratings of the neighbours.
        # Accumulators are local to each prediction, so nothing leaks between
        # iterations.
        numer = denom = 0
        for j in neighbors:
            numer += ITM[j][a] * weights[j]
            denom += abs(weights[j])

        # No usable neighbours (or all weights zero) -> predict 0.
        pai = round((numer / denom), 2) if denom != 0 else 0
        predRatings[i][a] = min(max(pai, MIN_RATING), MAX_RATING)

predRatings

defaultdict(dict,
            {1: {2: 0.64, 4: 0, 5: 0, 7: 0, 8: 0, 10: 0.04, 12: 0},
             2: {1: 0, 2: 0, 5: 1.3, 6: 0, 8: 1.79, 9: 0},
             3: {3: 0, 6: 1.73, 8: 0, 12: 0},
             4: {1: 0, 4: 2.02, 6: 0, 7: 0.56, 9: 0, 10: 0, 12: 3.98},
             5: {1: 0, 2: 0, 7: 0.34, 8: 1.78, 9: 0, 10: 0},
             6: {2: 1.95, 4: 0, 6: 2.11, 7: 0.01, 9: 4.51, 10: 0.86, 12: 0}})

### MSE:

Compute the Mean-Squared Error between the true and predicted ratings:

In [198]:
# One mean-squared error per test movie; the cell's result is the average of
# those per-movie errors.
mse = []
for i in predRatings:
    # Pair true and predicted ratings by user id rather than relying on the two
    # dictionaries happening to share the same insertion order.
    users = list(predRatings[i])
    y_true = [testMovies[i][u] for u in users]
    y_pred = [predRatings[i][u] for u in users]
    mse.append(mean_squared_error(y_true, y_pred))
sum(mse) / len(mse)

0.0