## Baseline Model

In [None]:
# FUNCTIONS FOR BASELINE PREDICTION, STORING AND SUBMISSION

import numpy as np
import pandas as pd

def run_gradient_descent(n_epochs, batch_size, data, eta,
                         num_users=None, num_movies=None):
    """Fit the baseline model ``rating ~ mu + a[user] + b[movie]``.

    Runs mini-batch gradient descent. For each mini-batch, predictions are
    computed with the current parameters, then ``mu``, ``a`` and ``b`` are
    each moved by ``eta * (rating - prediction)`` for every row of the batch.

    Parameters
    ----------
    n_epochs : int
        Number of passes over ``data``.
    batch_size : int
        Number of rows per mini-batch (the last batch absorbs the remainder).
    data : pandas.DataFrame
        Must have the columns 'userId', 'movieId' and 'rating'.
        NOTE: its index is reset in place, as in the original implementation.
    eta : float
        Learning rate.
    num_users, num_movies : int, optional
        Lengths of the ``a`` and ``b`` vectors. When omitted, the
        module-level ``n_users`` / ``n_movies`` values are used.

    Returns
    -------
    tuple
        ``(a, b, mu)``: user offsets, movie offsets and the global offset.

    Raises
    ------
    ValueError
        If ``data`` is empty or ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')
    if len(data) == 0:
        raise ValueError('data must contain at least one rating')

    # Fall back to the notebook-level sizes when none are given explicitly
    if num_users is None:
        num_users = n_users
    if num_movies is None:
        num_movies = n_movies

    data.reset_index(inplace=True, drop=True)

    # Pull the columns out once; all work below is positional array work
    users = np.asarray(data['userId']).astype(int)
    movies = np.asarray(data['movieId']).astype(int)
    ratings = np.asarray(data['rating'], dtype=float)
    n_samples = len(ratings)

    # Unknown parameters
    a = np.zeros(num_users)   # User variations
    b = np.zeros(num_movies)  # Movie variations
    mu = np.mean(ratings)
    predictions = np.zeros(n_samples)

    # At least one batch, even when there are fewer rows than batch_size
    n_batches = max(1, n_samples // batch_size)
    # Half-open batch boundaries: [0, batch_size, 2*batch_size, ..., n_samples]
    edges = [batch_size * j for j in range(n_batches)] + [n_samples]
    # Report roughly every 10% of an epoch; never 0, which would break the modulo
    report_every = max(1, n_batches // 10)

    # Gradient Descent Begins
    print('Batch Size - ', batch_size, ', Eta - ', eta)

    for epoch in range(n_epochs):

        # Display for every epoch - Epoch number and error
        print('Epoch-', (epoch+1), '/', n_epochs)
        err = np.mean(np.abs(ratings - predictions))
        print('Error =', err)

        diverged = False

        # For every mini-batch
        for batch_no in range(n_batches):
            rows = slice(edges[batch_no], edges[batch_no + 1])

            # Display after every ~10% completion
            if (batch_no + 1) % report_every == 0:
                print('Completetion - ', np.ceil((batch_no+1)/n_batches*100), '\b%')
                # Only rows that already have a prediction take part in the
                # running error; with none yet, there is nothing to check.
                seen = predictions != 0
                if seen.any():
                    curr_err = np.mean(np.abs(predictions[seen] - ratings[seen]))
                    print(curr_err)
                    if not np.isfinite(curr_err):
                        diverged = True
                        print('Breaking Out...')
                        break

            batch_users = users[rows]
            batch_movies = movies[rows]

            # Predictions for the selected mini-batch using the current a and b
            predictions[rows] = mu + a[batch_users] + b[batch_movies]

            # Updating mu, a and b using the predictions on the current mini-batch
            step = eta * (ratings[rows] - predictions[rows])
            mu = mu + step.sum()
            # np.add.at accumulates correctly when an id repeats within a batch
            np.add.at(a, batch_users, step)
            np.add.at(b, batch_movies, step)

        if diverged:
            print('Very large error - Moving to next search')
            break

    return (a, b, mu)

# To predict using baseline model
def baseline_predict(data, a, b, mu):
    """Predict ratings with the baseline model ``mu + a[user] + b[movie]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have the columns 'userId' and 'movieId'.
        NOTE: its index is reset in place, as in the original implementation.
    a, b : array-like
        User and movie offsets, indexed by integer user / movie id.
    mu : float
        Global offset.

    Returns
    -------
    dict
        Maps each row position (0 .. len(data)-1) to its predicted rating.
    """
    data.reset_index(inplace=True, drop=True)

    # Look every row up in one vectorised step instead of iterating rows
    users = np.asarray(data['userId']).astype(int)
    movies = np.asarray(data['movieId']).astype(int)
    values = mu + np.asarray(a)[users] + np.asarray(b)[movies]

    # After the reset the index is 0..n-1, so enumerate reproduces the keys
    return dict(enumerate(values))

# To store baseline parameters for future use
def store_baseline_params(a, b, mu, param_filename):
    """Write the baseline parameters to a CSV file.

    The file has the columns 'a', 'b' and 'mu'. 'mu' holds the scalar in its
    first row and NaN below it; a shorter 'a' or 'b' is likewise padded with
    NaN so that the two vectors need not have the same length.

    Parameters
    ----------
    a, b : array-like
        User and movie offsets.
    mu : float
        Global offset.
    param_filename : str
        Destination CSV path.
    """
    # Series of unequal length are aligned on their index and NaN-padded
    params = pd.DataFrame({
        'a': pd.Series(np.asarray(a, dtype=float)),
        'b': pd.Series(np.asarray(b, dtype=float)),
        'mu': pd.Series([mu], dtype=float),
    })
    params.to_csv(param_filename, index=False)
    
# Convert predictions to CSV
def submit(prediction, file_name):
    """Write predictions to a submission CSV with the columns 'Id', 'Prediction'.

    Parameters
    ----------
    prediction : dict
        Maps a row id to its predicted rating.
    file_name : str
        Destination CSV path.

    Predictions are clamped to the range [0.5, 5] before being written.
    """
    # Build from the DataFrame class itself rather than through whichever
    # DataFrame instance happens to be bound to a global name.
    submission = pd.DataFrame.from_dict(prediction, orient='index', columns=['Prediction'])
    submission['Prediction'] = submission['Prediction'].clip(lower=0.5, upper=5)
    submission.to_csv(file_name, index_label='Id')

In [None]:
# BASELINE HYPERPARAMETER SEARCH

# Import train files
filename = 'data/train.csv'
data = pd.read_csv(filename)

# Splitting into train and CV (80% / 20%) via a random permutation of row positions
indices = np.random.permutation(len(data))
train_length = int(len(data)*0.8)
train_data = data.iloc[indices[:train_length]]
# The hold-out slice starts exactly at train_length so that no row is dropped
test_data = data.iloc[indices[train_length:]]

# Lengths of the user / movie parameter vectors (read by run_gradient_descent)
n_movies = 10000
n_users = 10000

# Gradient Descent Hyperparameters
n_epochs = 3

min_err = np.inf
selection = None  # stays None if no combination yields a finite error

for batch_size in [500, 750, 1000, 2000]:
    for eta in [0.0001, 0.0005, 0.001]:
        (a, b, mu) = run_gradient_descent(n_epochs, batch_size, train_data, eta)

        # baseline_predict returns a dict keyed by row position, in row order
        test_pred = baseline_predict(test_data, a, b, mu)
        pred_values = np.array(list(test_pred.values()))

        # Mean absolute error on the hold-out set, compared positionally
        error = np.mean(abs(test_data['rating'].values - pred_values))
        if error < min_err:
            min_err = error
            selection = (batch_size, eta)

print(selection)

### Hyperparameter Choice
**Batch Size** = 750  
**Learning Rate** = 0.001

In [None]:
# BASELINE TRAINING, PREDICTION AND SUBMISSION

# Chosen (batch size, learning rate) from the hyperparameter search
selection = (750, 0.001)

# Gradient Descent with 10 epochs
(a, b, mu) = run_gradient_descent(10, selection[0], train_data, selection[1])

# Store a, b, mu for future use
store_baseline_params(a, b, mu, 'Storage/Baseline_Params.csv')

# Import test files
filename = 'data/test.csv'
test = pd.read_csv(filename)

# Get predictions (the prediction helper defined above is baseline_predict)
baseline_prediction = baseline_predict(test, a, b, mu)

# SUBMISSION
submit(baseline_prediction, 'Submissions/Baseline.csv')

## Neighborhood Model

In [95]:
# NEIGHBORHOOD MODEL RATING MATRIX AND SIMILARITY MATRIX

# Hyperparameters for Neighborhood model
lambda2 = 100  # shrinkage term: coefficient is scaled by n / (n + lambda2)
k = 100        # neighbourhood size passed to neighborhood_predict

# Rating Matrix for faster operation: A[user, movie] = rating, 0 where unrated
A = np.zeros([n_users, n_movies])
rating_users = data['userId'].values.astype(int)
rating_movies = data['movieId'].values.astype(int)
rating_values = data['rating'].values
# Iterating plain arrays keeps the row order (a later duplicate overwrites an
# earlier one) while avoiding the per-row Series that iterrows builds.
for ind, user, movie, rating in zip(data.index, rating_users, rating_movies, rating_values):
    A[user, movie] = rating

    # Display Status
    if(ind%500000 == 0):
        print(int(ind*100/len(data)), '\b%')

# Similarity Matrix of Shrunk Coefficients
S = np.zeros([n_movies, n_movies])
i = 1
for m1 in range(n_movies):
    col1 = A[:, m1]
    for m2 in range(m1):
        col2 = A[:, m2]

        # Users who rated both m1 and m2 (a rating of 0 means "not rated")
        common = np.multiply(col1, col2) != 0
        n = int(np.count_nonzero(common))

        # If less than 2 common elements, Pearson's correlation coefficient not defined
        if n > 1:
            x = col1[common]
            y = col2[common]

            # Correlation Coefficient; it is NaN when either vector is constant
            with np.errstate(invalid='ignore', divide='ignore'):
                p = np.corrcoef(x, y)[0, 1]

            # Leave the entry at 0 instead of storing an undefined correlation,
            # which would otherwise turn later predictions into NaN.
            if np.isfinite(p):
                # Shrunk Coefficient - Symmetric Matrix
                shrunk = n/(n + lambda2)*p
                S[m2, m1] = shrunk
                S[m1, m2] = shrunk

        # Display Status
        if (i % 10000) == 0:
            print(200*i/(n_movies*n_movies), '\b%')
        i = i + 1

# Store S for future use
def store_S(S, S_filename):
    """Write the similarity matrix ``S`` to ``S_filename`` as a CSV file.

    Rows and columns are labelled '0', '1', ... according to the shape of
    ``S``; the row labels are written as the first CSV column.
    """
    S = np.asarray(S)
    # Derive the labels from S itself so that any matrix size can be stored
    row_labels = [str(e) for e in range(S.shape[0])]
    col_labels = [str(e) for e in range(S.shape[1])]
    S_df = pd.DataFrame(S, columns=col_labels, index=row_labels)
    # Honour the requested path (it was previously ignored)
    S_df.to_csv(S_filename)
    
# Persist the similarity matrix computed above
store_S(S=S, S_filename='Storage/shrunk.csv')

0%
9%
18%
28%
37%
47%
56%
66%
75%
85%
94%


In [115]:
# This cell will be deleted before submission

# baseline_params = pd.read_excel('BaselineParameters.xlsx', index_col=None, header=None)
# a = baseline_params[0]
# b = baseline_params[1]
# mu = baseline_params.loc[0, 2]

# Convert predictions to CSV
def submit(prediction, file_name):
    """Write predictions to a submission CSV with the columns 'Id', 'Prediction'.

    Parameters
    ----------
    prediction : dict
        Maps a row id to its predicted rating.
    file_name : str
        Destination CSV path.

    Predictions are clamped to the range [0.5, 5] before being written.
    """
    # Build from the DataFrame class itself rather than through whichever
    # DataFrame instance happens to be bound to a global name.
    submission = pd.DataFrame.from_dict(prediction, orient='index', columns=['Prediction'])
    submission['Prediction'] = submission['Prediction'].clip(lower=0.5, upper=5)
    submission.to_csv(file_name, index_label='Id')

In [111]:
def neighborhood_predict(data, test, A, S, k, a, b, mu):
    """Predict ratings with an item-based neighbourhood model on top of the baseline.

    For a (user, movie) pair, the prediction is the baseline
    ``mu + a[user] + b[movie]`` plus a similarity-weighted average of the
    user's baseline residuals on the (at most ``k``) rated movies most
    similar to ``movie``. The weights are normalised by the sum of their
    absolute values. The plain baseline is returned when the user has rated
    nothing or when every usable similarity is zero.

    Parameters
    ----------
    data : pandas.DataFrame
        Kept for interface compatibility; not used by the computation.
    test : pandas.DataFrame
        Must have the columns 'userId' and 'movieId'.
    A : numpy.ndarray
        Rating matrix indexed ``[user, movie]``; 0 marks "not rated".
    S : numpy.ndarray
        Movie-movie similarity matrix.
    k : int
        Maximum number of neighbours.
    a, b : array-like
        Baseline user and movie offsets.
    mu : float
        Baseline global offset.

    Returns
    -------
    dict
        Maps each index label of ``test`` to its predicted rating.
    """
    prediction = {}

    a = np.asarray(a)
    b = np.asarray(b)

    # Cast ids once so they are valid array indices even if stored as floats
    users = np.asarray(test['userId']).astype(int)
    movies = np.asarray(test['movieId']).astype(int)

    # Predict for test
    for index, user, movie in zip(test.index, users, movies):

        base = mu + a[user] + b[movie]

        # Movies rated by this user
        rated = A[user, :] != 0
        n_rated = np.count_nonzero(rated)

        if n_rated == 0:
            # New user: nothing to build a neighbourhood from
            prediction[index] = base
        else:
            # Similarity to the movies the user rated; unrated movies and
            # undefined (non-finite) similarities carry zero weight.
            sim_row = S[movie, :]
            sims = np.where(rated & np.isfinite(sim_row), sim_row, 0.0)

            # Check if at least k movies have been rated by user - if not select all movies rated
            kk = min(k, n_rated)
            k_ind = np.argpartition(-np.abs(sims), (kk - 1))[:kk]

            # Use the masked weights, so an unrated movie picked on a tie
            # cannot contribute a spurious (0 - baseline) residual.
            weights = sims[k_ind]
            weight_total = np.sum(np.abs(weights))

            if weight_total != 0:
                neighbour_baselines = mu + a[user] + b[k_ind]
                residual = np.sum(weights * (A[user, k_ind] - neighbour_baselines))
                prediction[index] = base + residual / weight_total
            else:
                # New movie (no usable neighbours)
                prediction[index] = base

        # Display Status
        if (index % 10000) == 0:
            print(index/len(test)*100,'\b%')

    return prediction

# Run the neighborhood model over the test set
neighborhood_pred = neighborhood_predict(
    data=data, test=test, A=A, S=S, k=k, a=a, b=b, mu=mu
)

# Write the predictions out as the submission file
submit(prediction=neighborhood_pred, file_name='Submissions/Neighborhood2.csv')

0.0%
0.43384173800470976%
0.8676834760094195%
1.3015252140141294%
1.735366952018839%
2.1692086900235488%
2.6030504280282587%
3.036892166032968%
3.470733904037678%
3.9045756420423885%
4.3384173800470975%
4.7722591180518075%
5.206100856056517%
5.639942594061227%
6.073784332065936%
6.507626070070646%
6.941467808075356%
7.375309546080066%
7.809151284084777%
8.242993022089486%
8.676834760094195%
9.110676498098906%
9.544518236103615%
9.978359974108326%
10.412201712113035%
10.846043450117744%
11.279885188122455%
11.713726926127164%
12.147568664131873%
12.581410402136584%
13.015252140141293%
13.449093878146003%
13.882935616150712%
14.316777354155422%
14.750619092160132%
15.184460830164841%
15.618302568169554%
16.052144306174263%
16.485986044178972%
16.91982778218368%
17.35366952018839%
17.7875112581931%
18.221352996197812%
18.65519473420252%
19.08903647220723%
19.522878210211942%
19.95671994821665%
20.39056168622136%
20.82440342422607%
21.25824516223078%
21.692086900235488%
22.1259286382402%
2