# Collaborative Filtering Project
## Intro to Machine Learning
### Thomas Cazort
---

## Setup:

In [363]:
from collections import defaultdict
from scipy.stats import pearsonr
import numpy as np
import math
import statistics
from sklearn.metrics import mean_squared_error

Store similarities in dictionary SIM (This is like a sparse-matrix where we only store non-zero values)

Store ratings in ITM:

In [364]:
# Lookup tables shared by every later cell. All three auto-create missing entries:
#   SIM        movie -> {other movie -> similarity weight}
#   ITM        movie -> {user -> rating}
#   userMovies user  -> [movies that user has rated]
SIM, ITM = defaultdict(dict), defaultdict(dict)
userMovies = defaultdict(list)

ITM[*m*][*u*] stores rating score for movie *m* and user *u*

SIM[*m1*][*m2*] stores the similarity score between movie *m1* and movie *m2*

userMovies[*u*] lists all movies reviewed by user *u*



In [365]:
# Load the training ratings. Each line is parsed as "movie,user,rating".
# The `with` block guarantees the file is closed even if a line fails to parse.
with open("netflix-small/ratings-train.txt") as ifile:
    for l in ifile:
        l = l.strip()
        if not l:
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            continue
        parts = l.split(",")
        movie, user, rating = int(parts[0]), int(parts[1]), float(parts[2])
        ITM[movie][user] = rating
        userMovies[user].append(movie)

### Similarity Computation:

Compute similarity between *i* and *j* and store this value in SIM[*i*][*j*]

I will be using the correlation-coefficient formula described in class.

In [366]:
# Mean rating of every movie, taken over ALL of that movie's ratings.
# Computed once here instead of once per (i, j) pair.
movieMeans = {m: statistics.mean(list(ITM[m].values())) for m in ITM}

for i in ITM.keys():
    riBar = movieMeans[i]
    for j in ITM.keys():
        if i == j:
            continue
        rjBar = movieMeans[j]
        # SUM u e U: only users who rated both i and j contribute.
        numer = denomP1 = denomP2 = 0
        # Walk i's users in order and test membership in j's ratings directly;
        # this visits the same co-rating users in the same order as comparing
        # every pair of users, without the quadratic scan.
        for u in ITM[i]:
            if u not in ITM[j]:
                continue
            rui, ruj = ITM[i][u], ITM[j][u]
            # Compute the Numerator of the Equation:
            numer += (rui - riBar) * (ruj - rjBar)
            # First part of Denominator:
            denomP1 += (rui - riBar)**2
            # Second part:
            denomP2 += (ruj - rjBar)**2
        # Compute similarity:
        denom = math.sqrt(denomP1) * math.sqrt(denomP2)

        # No common users (or zero variance among them) -> similarity 0.
        wij = round((numer / denom), 2) if denom != 0 else 0
        # Add to SIM:
        SIM[i][j] = wij
# SIM

## Testing:

### K Neighbors:

Find the K nearest neighbors for a movie from the weights stored in SIM:

In [367]:
# testMovies[movie][user] holds the true rating for each held-out (movie, user) pair.
testMovies = defaultdict(dict)
# The `with` block guarantees the file is closed even if a line fails to parse.
with open("netflix-small/ratings-test.txt") as ifile:
    for l in ifile:
        l = l.strip()
        if not l:
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            continue
        parts = l.split(",")
        movie = int(parts[0])
        user = int(parts[1])
        truerating = float(parts[2])
        testMovies[movie][user] = truerating
# testMovies

### Prediction:

Predict the rating by user using user's ratings for the K neighbors:

In [368]:
# predRatings[movie][user] holds the predicted rating, or -1 when the user has
# no training ratings and therefore no prediction can be made.
predRatings = defaultdict(dict)
k = 5  # number of nearest neighbours used per prediction

# Set view of each user's training movies so membership tests are O(1).
ratedBy = {user: set(movies) for user, movies in userMovies.items()}

# Movie i to be predicted
for i in testMovies.keys():
    # Candidate neighbours of i, most similar first. This ordering does not
    # depend on the user, so it is computed once per movie. `.get` avoids
    # inserting an empty entry into SIM for movies absent from training.
    simToI = SIM.get(i, {})
    rankedNeighbors = sorted(simToI, key=simToI.get, reverse=True)

    # User a
    for a in testMovies[i].keys():
        ratedByA = ratedBy.get(a)
        if not ratedByA:
            predRatings[i][a] = -1
            continue

        # The k most similar movies that user a has actually rated.
        neighbors = []
        for m in rankedNeighbors:
            if m in ratedByA:
                neighbors.append(m)
                if len(neighbors) >= k:
                    break

        # Similarity-weighted average of a's ratings on the neighbours.
        numer = denom = 0
        for j in neighbors:
            raj = ITM[j][a]
            numer += raj * simToI[j]
            denom += abs(simToI[j])

        # Fall back to 2.5 when there is no usable neighbour weight.
        pai = round((numer/denom), 2) if denom != 0 else 2.5
        # Clamp into [0, 5].
        # NOTE(review): the lower bound is 0 although the analysis describes
        # ratings as 1-5 -- confirm the intended minimum.
        pai = 0 if pai < 0 else pai
        pai = 5 if pai > 5 else pai
        predRatings[i][a] = pai

### MSE:

Compute the Mean-Squared Error between the true and predicted ratings:

In [369]:
mse = []
# Pair every usable prediction with its true rating; -1 marks "no prediction"
# and is left out of the error computation.
pairs = [
    (testMovies[movieId][userId], predicted)
    for movieId, byUser in predRatings.items()
    for userId, predicted in byUser.items()
    if predicted != -1
]
y_true = [truth for truth, predicted in pairs]
y_pred = [predicted for truth, predicted in pairs]
print("K = 5 MSE:", mean_squared_error(y_true, y_pred))

K = 5 MSE: 1.7648415492957747


### Analysis:

This MSE is pretty high, but it was very low on the small example dataset. This leads me to believe that because the range of values is so low (1-5) and the true results are only ever integers, this result isn't actually that bad.

This was tested with K = 5. Let's test with some more values of K.

### K = 10:

In [370]:
# predRatings[movie][user] holds the predicted rating, or -1 when the user has
# no training ratings and therefore no prediction can be made.
predRatings = defaultdict(dict)
k = 10  # number of nearest neighbours used per prediction

# Set view of each user's training movies so membership tests are O(1).
ratedBy = {user: set(movies) for user, movies in userMovies.items()}

# Movie i to be predicted
for i in testMovies.keys():
    # Candidate neighbours of i, most similar first. This ordering does not
    # depend on the user, so it is computed once per movie. `.get` avoids
    # inserting an empty entry into SIM for movies absent from training.
    simToI = SIM.get(i, {})
    rankedNeighbors = sorted(simToI, key=simToI.get, reverse=True)

    # User a
    for a in testMovies[i].keys():
        ratedByA = ratedBy.get(a)
        if not ratedByA:
            predRatings[i][a] = -1
            continue

        # The k most similar movies that user a has actually rated.
        neighbors = []
        for m in rankedNeighbors:
            if m in ratedByA:
                neighbors.append(m)
                if len(neighbors) >= k:
                    break

        # Similarity-weighted average of a's ratings on the neighbours.
        numer = denom = 0
        for j in neighbors:
            raj = ITM[j][a]
            numer += raj * simToI[j]
            denom += abs(simToI[j])

        # Fall back to 2.5 when there is no usable neighbour weight.
        pai = round((numer/denom), 2) if denom != 0 else 2.5
        # Clamp into [0, 5].
        # NOTE(review): the lower bound is 0 although the analysis describes
        # ratings as 1-5 -- confirm the intended minimum.
        pai = 0 if pai < 0 else pai
        pai = 5 if pai > 5 else pai
        predRatings[i][a] = pai


# Score only the pairs that received a prediction (-1 marks "no prediction").
y_true, y_pred = [], []
for i, byUser in predRatings.items():
    for a, pai in byUser.items():
        if pai != -1:
            y_true.append(testMovies[i][a])
            y_pred.append(pai)
print("K = 10 MSE:", mean_squared_error(y_true, y_pred))

K = 10 MSE: 1.7447676056338028


### K = 15:

In [371]:
# predRatings[movie][user] holds the predicted rating, or -1 when the user has
# no training ratings and therefore no prediction can be made.
predRatings = defaultdict(dict)
k = 15  # number of nearest neighbours used per prediction

# Set view of each user's training movies so membership tests are O(1).
ratedBy = {user: set(movies) for user, movies in userMovies.items()}

# Movie i to be predicted
for i in testMovies.keys():
    # Candidate neighbours of i, most similar first. This ordering does not
    # depend on the user, so it is computed once per movie. `.get` avoids
    # inserting an empty entry into SIM for movies absent from training.
    simToI = SIM.get(i, {})
    rankedNeighbors = sorted(simToI, key=simToI.get, reverse=True)

    # User a
    for a in testMovies[i].keys():
        ratedByA = ratedBy.get(a)
        if not ratedByA:
            predRatings[i][a] = -1
            continue

        # The k most similar movies that user a has actually rated.
        neighbors = []
        for m in rankedNeighbors:
            if m in ratedByA:
                neighbors.append(m)
                if len(neighbors) >= k:
                    break

        # Similarity-weighted average of a's ratings on the neighbours.
        numer = denom = 0
        for j in neighbors:
            raj = ITM[j][a]
            numer += raj * simToI[j]
            denom += abs(simToI[j])

        # Fall back to 2.5 when there is no usable neighbour weight.
        pai = round((numer/denom), 2) if denom != 0 else 2.5
        # Clamp into [0, 5].
        # NOTE(review): the lower bound is 0 although the analysis describes
        # ratings as 1-5 -- confirm the intended minimum.
        pai = 0 if pai < 0 else pai
        pai = 5 if pai > 5 else pai
        predRatings[i][a] = pai


# Score only the pairs that received a prediction (-1 marks "no prediction").
y_true, y_pred = [], []
for i, byUser in predRatings.items():
    for a, pai in byUser.items():
        if pai != -1:
            y_true.append(testMovies[i][a])
            y_pred.append(pai)
print("K = 15 MSE:", mean_squared_error(y_true, y_pred))

K = 15 MSE: 1.7465457746478872


### K = 25:

In [372]:
# predRatings[movie][user] holds the predicted rating, or -1 when the user has
# no training ratings and therefore no prediction can be made.
predRatings = defaultdict(dict)
k = 25  # number of nearest neighbours used per prediction

# Set view of each user's training movies so membership tests are O(1).
ratedBy = {user: set(movies) for user, movies in userMovies.items()}

# Movie i to be predicted
for i in testMovies.keys():
    # Candidate neighbours of i, most similar first. This ordering does not
    # depend on the user, so it is computed once per movie. `.get` avoids
    # inserting an empty entry into SIM for movies absent from training.
    simToI = SIM.get(i, {})
    rankedNeighbors = sorted(simToI, key=simToI.get, reverse=True)

    # User a
    for a in testMovies[i].keys():
        ratedByA = ratedBy.get(a)
        if not ratedByA:
            predRatings[i][a] = -1
            continue

        # The k most similar movies that user a has actually rated.
        neighbors = []
        for m in rankedNeighbors:
            if m in ratedByA:
                neighbors.append(m)
                if len(neighbors) >= k:
                    break

        # Similarity-weighted average of a's ratings on the neighbours.
        numer = denom = 0
        for j in neighbors:
            raj = ITM[j][a]
            numer += raj * simToI[j]
            denom += abs(simToI[j])

        # Fall back to 2.5 when there is no usable neighbour weight.
        pai = round((numer/denom), 2) if denom != 0 else 2.5
        # Clamp into [0, 5].
        # NOTE(review): the lower bound is 0 although the analysis describes
        # ratings as 1-5 -- confirm the intended minimum.
        pai = 0 if pai < 0 else pai
        pai = 5 if pai > 5 else pai
        predRatings[i][a] = pai


# Score only the pairs that received a prediction (-1 marks "no prediction").
y_true, y_pred = [], []
for i, byUser in predRatings.items():
    for a, pai in byUser.items():
        if pai != -1:
            y_true.append(testMovies[i][a])
            y_pred.append(pai)
print("K = 25 MSE:", mean_squared_error(y_true, y_pred))

K = 25 MSE: 1.7506288732394366
