In [48]:
import pandas as pd
import numpy as np
import sys
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.svm import SVR
# Number of consecutive genome-score rows that make up one movie; also the
# width of the per-movie feature vector used when fitting the models.
ATTRIBUTE_SZ = 1128
# Ratings CSV locations. Only the SPLIT_* files are read by the cells shown in
# this notebook; TRAIN_FILE and TEST_FILE are not referenced in the visible cells.
TRAIN_FILE = '../data/train.csv'
SPLIT_TRAIN = '../data/split_train.csv'
TEST_FILE = '../data/test.csv'
SPLIT_TEST = '../data/split_test.csv'
# Despite the name, this is the path of a CSV file (read by process_genome_scores),
# not a directory.
ATTR_DIR = '../data/genome_scores.csv'

In [50]:
def process_genome_scores():
    """Load the genome-score table and return one score vector per movie.

    The CSV at ``ATTR_DIR`` is cut into consecutive chunks of ``ATTRIBUTE_SZ``
    rows. For every chunk, the value found in column 0 of its first row is
    used as the key and the chunk's column-2 values become the vector.
    The rows belonging to one movie are therefore assumed to be contiguous
    and exactly ``ATTRIBUTE_SZ`` long.

    Returns:
        dict: key from column 0 -> 1-D numpy array of length ``ATTRIBUTE_SZ``.

    Raises:
        ValueError: if the number of rows is not a multiple of
            ``ATTRIBUTE_SZ`` (the chunking above would be misaligned).
    """
    print("Processing movie attribute data...")
    genome_data = pd.read_csv(ATTR_DIR)

    # Integer arithmetic: the chunk count must be a whole number, and a
    # remainder means the file does not match the assumed layout.
    movie_count, leftover = divmod(genome_data.shape[0], ATTRIBUTE_SZ)
    if leftover:
        raise ValueError(
            "genome score file has %d rows, which is not a multiple of ATTRIBUTE_SZ=%d"
            % (genome_data.shape[0], ATTRIBUTE_SZ)
        )

    # Pull the two needed columns out once instead of re-indexing the frame
    # on every iteration.
    keys = genome_data.iloc[:, 0].values
    scores = genome_data.iloc[:, 2].values
    del genome_data

    data = {}
    for i in range(movie_count):
        start = i * ATTRIBUTE_SZ
        data[keys[start]] = scores[start:start + ATTRIBUTE_SZ]

    print("Movie data processed!")
    return data

def group_train_data(train_file):
    """Read a ratings CSV and split its rows by user.

    Args:
        train_file: path of a CSV file that has a ``userId`` column.

    Returns:
        dict: user id -> 2-D numpy array holding every row of that user
        (all columns of the file, including ``userId``, in file order).
    """
    ratings = pd.read_csv(train_file)
    # Group by the column *name*, not by a one-element list: with a list,
    # the type of the group keys (scalar vs. 1-tuple) depends on the pandas
    # version, and later cells look users up by their scalar id.
    return {
        user_id: user_rows.values
        for user_id, user_rows in ratings.groupby("userId")
    }

def get_mean_ratings(train_file):
    """Compute per-movie, per-user and global mean ratings from a ratings CSV.

    The file is read positionally: column 0 is taken as the user id,
    column 1 as the movie id and column 2 as the rating.

    Args:
        train_file: path of the ratings CSV.

    Returns:
        tuple: ``(movie_mean_rating, user_mean_rating, mean_rating)`` where
        the two dicts map an id to a float array ``[mean, count]`` and
        ``mean_rating`` is the mean of the whole rating column.
    """
    ratings = pd.read_csv(train_file).values
    movie_mean_rating = {}
    user_mean_rating = {}

    for row in ratings:
        rating = row[2]
        for table, key in ((user_mean_rating, row[0]), (movie_mean_rating, row[1])):
            if key in table:
                table[key][0] += rating
                table[key][1] += 1
            else:
                # Force a float accumulator: if every column of the CSV parses
                # as integers, an inferred int array would silently truncate
                # the mean when it is written back into slot 0 below.
                table[key] = np.array([rating, 1], dtype=float)

    mean_rating = np.mean(ratings[:, 2])

    # Slot 0 held the running sum so far; replace it with sum / count.
    for table in (movie_mean_rating, user_mean_rating):
        for stats in table.values():
            stats[0] = stats[0] / stats[1]

    return movie_mean_rating, user_mean_rating, mean_rating

# Build the lookup tables used by the later cells.
# Movie key -> genome-score vector, read from ATTR_DIR.
genome_scores = process_genome_scores()
# User id -> array of that user's rows from the training split.
train_data = group_train_data(SPLIT_TRAIN)
# Per-movie and per-user [mean, count] arrays plus the global mean rating.
# Note: this reads SPLIT_TRAIN from disk a second time.
movie_mean_rating, user_mean_rating, mean_rating = get_mean_ratings(SPLIT_TRAIN)

Processing movie attribute data...
Movie data processed!


In [None]:
def process_train_data(train_data, genome_scores):
    """Turn one user's rating rows into a feature matrix and a target vector.

    Column 1 of ``train_data`` is used as the movie id and column 2 as the
    rating. Each row's features are the movie's entry in ``genome_scores``;
    rows whose movie id is not a key of ``genome_scores`` keep an all-zero
    feature vector.

    Returns:
        tuple: ``(X, y)`` with ``X`` of shape ``(rows, ATTRIBUTE_SZ)`` and
        ``y`` of shape ``(rows,)``.
    """
    row_count = train_data.shape[0]
    features = np.zeros((row_count, ATTRIBUTE_SZ))
    targets = np.zeros((row_count,))

    # Keep only the (movie id, rating) pair of every row.
    movie_and_rating = train_data[:, 1:3]
    for row_idx, pair in enumerate(movie_and_rating):
        movie_key = pair[0]
        if movie_key in genome_scores:
            features[row_idx, :] = genome_scores[movie_key]
        targets[row_idx] = pair[1]

    return features, targets

def train_model(train_data, genome_scores, kernel = 'none', type = 'LR'):
    """Fit a single regression model on one user's ratings.

    Args:
        train_data: 2-D array of one user's rows, passed to
            ``process_train_data`` to build the features and targets.
        genome_scores: mapping of movie key -> feature vector.
        kernel: accepted for backward compatibility; currently ignored.
        type: which estimator to fit: ``'LR'`` (LinearRegression),
            ``'Ridge'`` (RidgeCV over alphas 0.05, 0.5, 1.0) or
            ``'SVR'`` (SVR with ``gamma='scale'``).

    Returns:
        The fitted estimator.

    Raises:
        ValueError: if ``type`` is not one of the supported names.
    """
    estimator_factories = {
        'Ridge': lambda: RidgeCV(alphas = [0.05, 0.5, 1.0]),
        'LR': lambda: LinearRegression(),
        'SVR': lambda: SVR(gamma = 'scale'),
    }
    # Reject an unknown model name before doing any work; previously this
    # surfaced as an unbound local variable after the features were built.
    if type not in estimator_factories:
        raise ValueError(
            "Unknown model type %r; expected one of %s"
            % (type, sorted(estimator_factories))
        )

    X, y = process_train_data(train_data, genome_scores)
    regr = estimator_factories[type]()
    regr.fit(X, y)
    return regr

def train(train_data, genome_scores):
    """Fit one SVR model per user and return them keyed by user.

    Args:
        train_data: mapping of user -> that user's rating rows.
        genome_scores: mapping of movie key -> feature vector.

    Returns:
        dict: user -> fitted model, one entry per key of ``train_data``.

    Progress is printed while training: a "..." marker every 100 users and a
    count line every 1000 users.
    """
    def _report_progress(done):
        # `done` is the number of users fitted before the current one.
        if done % 100 == 0:
            print("...", end ="")
        if done > 0 and done % 1000 == 0:
            print("\n %d users trained" % (done))
            print("Still training",end="")

    print("Training linear regression models...", end ="")
    models = {}
    for done, user in enumerate(train_data):
        models[user] = train_model(train_data[user], genome_scores, type = 'SVR')
        _report_progress(done)

    return models

# Fit one model per user in train_data (train() always requests type='SVR').
models = train(train_data, genome_scores)

Training linear regression models..................

In [45]:
def _fallback_rating(userid, movieid, movie_mean_rating, user_mean_rating, mean_rating):
    """Blend the global, per-movie and per-user mean ratings for one pair."""
    movie_known = movieid in movie_mean_rating
    user_known = userid in user_mean_rating
    if movie_known and user_known:
        return 0.1*mean_rating + 0.2*movie_mean_rating[movieid][0] + 0.7*user_mean_rating[userid][0]
    if movie_known:
        return 0.6*mean_rating + 0.4*movie_mean_rating[movieid][0]
    if user_known:
        return 0.4*mean_rating + 0.6*user_mean_rating[userid][0]
    return mean_rating


def predict_rating(test_file, models, genome_scores, mean, test = False):
    """Predict a rating for every row of a CSV, or score the predictions.

    Column 0 of the file is used as the user id and column 1 as the movie id;
    when ``test`` is set, column 2 is used as the true rating.

    For each row the user's model is applied to the movie's genome vector.
    When that is not possible (the user has no model, or the movie has no
    genome vector) the prediction falls back to a weighted blend of the
    global, per-movie and per-user mean ratings. Predictions are rounded to
    one decimal and clamped to the range [0.5, 5.0].

    Args:
        test_file: path of the CSV to predict for.
        models: mapping of user id -> fitted model exposing ``predict``.
        genome_scores: mapping of movie id -> 1-D feature vector.
        mean: sequence ``(movie_mean_rating, user_mean_rating, mean_rating)``
            where the two dicts map an id to an array whose slot 0 is the mean.
        test: when true, return the mean squared error against column 2
            instead of writing predictions.

    Returns:
        The mean squared error when ``test`` is true; otherwise ``None``
        after writing ``Id``/``Prediction`` columns to ``data/results.csv``.
    """
    test_data = pd.read_csv(test_file).values
    print("Evaluating model...",end="")
    movie_mean_rating, user_mean_rating, mean_rating = mean
    N = test_data.shape[0]
    ids = np.arange(0, N, 1)
    pred = np.zeros(N)

    for i in range(N):
        userid = test_data[i,0]
        movieid = test_data[i,1]

        if userid in models and movieid in genome_scores:
            phi = genome_scores[movieid].reshape(1, -1)
            # predict() returns an array; take its single element explicitly
            # rather than relying on implicit array-to-scalar conversion.
            rating = np.ravel(models[userid].predict(phi))[0]
        else:
            # Also covers users without a model, which previously were left
            # at 0 and then clamped to 0.5.
            rating = _fallback_rating(userid, movieid, movie_mean_rating,
                                      user_mean_rating, mean_rating)

        rating = round(float(rating), 1)
        if rating > 5.0:
            rating = 5.0
        elif rating < 0.5:
            rating = 0.5

        if test:
            # In evaluation mode the slot stores the squared error instead.
            pred[i] = (rating - test_data[i,2])**2
        else:
            pred[i] = rating

        if i % 500000 == 0:
            print("...", end="")

    print("\nDone!")

    if test:
        return np.sum(pred)/len(pred)

    df = pd.DataFrame(ids, columns = ['Id'])
    df['Prediction'] = pred
    # NOTE(review): this path is relative to the working directory, unlike the
    # '../data/...' input paths used elsewhere in the notebook - confirm.
    df.to_csv('data/results.csv', index=False)

In [38]:
# Bundle the three mean-rating objects in the order predict_rating unpacks them.
mean = [movie_mean_rating, user_mean_rating, mean_rating]
# NOTE(review): this scores the models on SPLIT_TRAIN, the same file they were
# fitted on, so the value is a training error; SPLIT_TEST is defined but not
# used here - confirm this is intended.
mse = predict_rating(SPLIT_TRAIN, models, genome_scores, mean, test = True)

Evaluating model...........................
Done!


0.026369160672829423
