In [34]:
import pdb;

def run_gradient_descent(n_epochs, batch_size, data, eta):
    
    data.reset_index(inplace = True, drop = True)
    
    # Unknown parameters
    a = np.zeros(n_users)  # User variations
    b = np.zeros(n_movies) # Movie variations
    
    mu = np.mean(data['rating'])
    predictions = np.zeros(len(data))
    n_batches = int(len(data)/batch_size)
    
    # Create indices for every batch
    # Of the form [-1, batch_size, 2*batchsize, ... len(data)]
    indices = [batch_size*x for x in range(n_batches)]
    indices[0] = -1
    indices.append(len(data) - 1)
            
    # Gradient Descent Begins
    print('Batch Size - ', batch_size, ', Eta - ', eta)
    exit = 0
    
    for epoch in range(n_epochs):
    
        # Display for every epoch - Epoch number and error
        print('Epoch-',(epoch+1),'/',n_epochs)
        err = np.mean(abs(data['rating'] - predictions));
        print('Error =',err)

        # For every mini-batch
        for batch_no in range(n_batches):

            # Select mini-batch
            batch = data.loc[indices[batch_no]+1:indices[batch_no+1]]

            # Display after every 10% completetion
            if (batch_no+1) % (int(n_batches/10)) == 0:
                print('Completetion - ', np.ceil((batch_no+1)/n_batches*100), '\b%')
                curr_err = (np.mean(abs(predictions[predictions!=0] - data.loc[(predictions!=0), 'rating'])))
                print(curr_err)
                if (curr_err == np.inf) or np.isnan(curr_err):
                    exit = 1
                    print('Breaking Out...')
                    break
       
            # Predictions for the selected mini-batch using the current a and b
            for index, x in batch.iterrows():
                predictions[index] = mu + a[int(x['userId'])] + b[int(x['movieId'])]

            # Updating a and b using the predictions on the current mini-batch
            for index, x in batch.iterrows():
                mu = mu + eta * (x['rating'] - predictions[index])
                a[int(x['userId'])] = a[int(x['userId'])] + eta * (x['rating'] - predictions[index])
                b[int(x['movieId'])] = b[int(x['movieId'])] + eta * (x['rating'] - predictions[index])
            
        if exit == 1:
            exit = 0
            print('Very large error - Moving to next search')
            break

    return (a, b, mu)

def predict(data, a, b, mu):
    
    data.reset_index(inplace = True, drop = True)
    
    predictions = {}
    
    for index, point in data.iterrows():
        predictions[index] = mu + a[int(point['userId'])] + b[int(point['movieId'])]
        
    return predictions

In [27]:
import numpy as np
import pandas as pd

# Import files
filename = 'data/train.csv'
data = pd.read_csv(filename)

n_movies = 10000
n_users = 10000

# Gradiet Descent Hyperparameters
n_epochs = 3
min_err = np.inf

indices = np.random.permutation(len(data))
train_length = int(len(data)*0.8)
train_data = data.loc[indices[0:train_length]]
test_data = data.loc[indices[(train_length+1):]]

for batch_size in [500, 750, 1000, 2000]:
    for eta in [0.0001, 0.0005, 0.001]:
        (a, b, mu) = run_gradient_descent(n_epochs, batch_size, train_data, eta)
        test_pred = predict(test_data, a, b, mu)
        error = np.mean(abs(test_data['rating'] - test_pred.values))
        if error < min_err:
            min_err = error
            selection = (batch_size, eta)

print(selection)

Batch Size -  500 , Eta -  0.0001
Epoch- 1 / 3
Error = 3.360016151137798
Completetion -  10.0%
0.8558554606872634
Completetion -  20.0%
0.8530078491152735
Completetion -  30.0%
0.8506255943626195
Completetion -  40.0%
0.848195778902555
Completetion -  50.0%
0.8461298544773216
Completetion -  60.0%
0.8442279201591683
Completetion -  70.0%
0.8421543494309883
Completetion -  80.0%
0.8401361897429368
Completetion -  90.0%
0.8381675058149144
Completetion -  100.0%
0.8361117265107368
Epoch- 2 / 3
Error = 0.8360992467930765
Completetion -  10.0%
0.8320931088647207
Completetion -  20.0%
0.8282022622893224
Completetion -  30.0%
0.8244473382770704
Completetion -  40.0%
0.8208003755394985
Completetion -  50.0%
0.8172649321512699
Completetion -  60.0%
0.8138180266615703
Completetion -  70.0%
0.810502669535986
Completetion -  80.0%
0.807252061251293
Completetion -  90.0%
0.8041233052410224
Completetion -  100.0%
0.8010597234928368
Epoch- 3 / 3
Error = 0.8010450615292942
Completetion -  10.0%
0.7980

Completetion -  90.0%
0.6646704872570074
Completetion -  100.0%
0.6634824212260757
Batch Size -  1000 , Eta -  0.0001
Epoch- 1 / 3
Error = 3.360016151137798
Completetion -  10.0%
0.8558302010005897
Completetion -  20.0%
0.8530203135068435
Completetion -  30.0%
0.8506034169600557
Completetion -  40.0%
0.8482095467181517
Completetion -  50.0%
0.8461413866690588
Completetion -  60.0%
0.8442221515202505
Completetion -  70.0%
0.8421558859799095
Completetion -  80.0%
0.840140921737226
Completetion -  90.0%
0.8381711761410443
Completetion -  100.0%
0.8361134702845511
Epoch- 2 / 3
Error = 0.8361002673133113
Completetion -  10.0%
0.8320985244382835
Completetion -  20.0%
0.8282069851392657
Completetion -  30.0%
0.8244532541241785
Completetion -  40.0%
0.8208053537298162
Completetion -  50.0%
0.8172694981671808
Completetion -  60.0%
0.8138233514794263
Completetion -  70.0%
0.8105077678728378
Completetion -  80.0%
0.8072555824026993
Completetion -  90.0%
0.8041275471018916
Completetion -  100.0%
0

KeyboardInterrupt: 

In [46]:
selection = (750, 0.001)
(a, b, mu) = run_gradient_descent(10, selection[0], train_data, selection[1])

filename = 'data/test.csv'
test = pd.read_csv(filename)

prediction = predict(test, a, b, mu)
submission = test.from_dict(prediction, orient='index', columns=['Prediction'])
submission.to_csv('Baseline.csv', index_label = 'Id')

Batch Size -  750 , Eta -  0.001
Epoch- 1 / 10
Error = 3.360016151137798
Completetion -  10.0%
0.8369136361329713
Completetion -  20.0%
0.8189683200594117
Completetion -  30.0%
0.8044623068577861
Completetion -  40.0%
0.7923228123527694
Completetion -  50.0%
0.7822991191228907
Completetion -  60.0%
0.7738377412571612
Completetion -  70.0%
0.7663887441753489
Completetion -  80.0%
0.7597777412361253
Completetion -  90.0%
0.7539694561109184
Completetion -  100.0%
0.748559157088695
Epoch- 2 / 10
Error = 0.7484999516583284
Completetion -  10.0%
0.7345008130307561
Completetion -  20.0%
0.7235502464413303
Completetion -  30.0%
0.714808811718086
Completetion -  40.0%
0.7076256258688673
Completetion -  50.0%
0.7015683255942867
Completetion -  60.0%
0.6964187607291727
Completetion -  70.0%
0.6919737162443028
Completetion -  80.0%
0.6881051189670206
Completetion -  90.0%
0.6846857820854279
Completetion -  100.0%
0.6816219834829985
Epoch- 3 / 10
Error = 0.681589743437053
Completetion -  10.0%
0.67

TypeError: to_csv() got an unexpected keyword argument 'path'

In [58]:
import xlsxwriter
# Create a workbook and add a worksheet.
workbook = xlsxwriter.Workbook('BaselineParameters.xlsx')
worksheet = workbook.add_worksheet()

# Start from the first cell. Rows and columns are zero indexed.
row = 0
col = 0
# Iterate over the data and write it out row by row.
for a_item in a:
    worksheet.write(row, col, a_item)
    row += 1
    
col = 1
row = 0
for b_item in b:
    worksheet.write(row, col, b_item)
    row += 1

col = 2
row = 0
worksheet.write(row, col, mu)

workbook.close()

0.051271298186832986

In [44]:
test.from_dict(prediction, orient='index', columns=['rating'])

Unnamed: 0,rating
0,3.738977
1,3.302347
2,2.818280
3,3.769383
4,3.348808
5,3.010602
6,3.764318
7,3.223920
8,3.598159
9,2.800680
