In [1]:
import pandas as pd
import numpy as np
import time
import math
import matplotlib.pyplot as plt
import sys

# Input CSV locations. Despite the *_DIR suffix each constant is a file path.
# ATTR_DIR is read by process_genome_scores().
ATTR_DIR = 'data/genome_scores.csv'
# TRAIN_DIR is read by train_model().
TRAIN_DIR = 'data/train.csv'
# TEST_DIR is read by predictions().
TEST_DIR = 'data/test.csv'
# GENRE_DIR is read by process_genres().
GENRE_DIR = 'data/movies.csv'
# Number of consecutive genome-score rows that form one movie's feature vector;
# also the length of each per-user weight vector in train_user_model().
ATTRIBUTE_SZ = 1128
# NOTE(review): not referenced by any code visible in this notebook;
# presumably the number of distinct users -- TODO confirm or remove.
USER_SZ = 7632
# NOTE(review): not referenced by visible code; convert_genres() hard-codes
# the same value (19) as the length of its genre vector.
GENRE_SZ = 19

In [2]:
def process_genome_scores():
    """Load the genome relevance scores and build one feature vector per movie.

    Reads the CSV at ATTR_DIR. The file is consumed in consecutive chunks of
    ATTRIBUTE_SZ rows; for each chunk the movie id is taken from column 0 of
    the chunk's first row and the scores from column 2.

    Returns:
        dict: movie id -> numpy array of shape (ATTRIBUTE_SZ, 1).

    Raises:
        ValueError: if the number of rows is not a multiple of ATTRIBUTE_SZ
            (the file cannot be split into whole feature vectors).
    """
    print("Processing movie attribute data...")
    genome = pd.read_csv(ATTR_DIR)
    n_rows = genome.shape[0]
    # Fail early with a clear message instead of a late reshape error on a
    # trailing partial chunk.
    if n_rows % ATTRIBUTE_SZ != 0:
        raise ValueError(
            "%s has %d rows, which is not a multiple of ATTRIBUTE_SZ=%d"
            % (ATTR_DIR, n_rows, ATTRIBUTE_SZ)
        )

    # Pull the two needed columns out once rather than going through iloc
    # on every iteration.
    movie_ids = genome.iloc[:, 0].values
    scores = genome.iloc[:, 2].values

    data = {}
    for start in range(0, n_rows, ATTRIBUTE_SZ):
        chunk = scores[start:start + ATTRIBUTE_SZ]
        data[movie_ids[start]] = chunk.reshape(ATTRIBUTE_SZ, 1)

    print("Movie data processed!")
    return data

def process_genres():
    """Read the movie table at GENRE_DIR and index its genres three ways.

    Column 0 of the table is used as the movie id and column 2 as a
    '|'-separated genre string.

    Returns:
        tuple: (genre_movieid, movie_genres, genre_data) where
            genre_movieid maps genre name -> list of movie ids,
            movie_genres maps movie id -> list of genre names,
            genre_data maps movie id -> vector built by convert_genres().
    """
    genre_data = {}      # movie id -> genre vector
    genre_movieid = {}   # genre name -> movie ids carrying that genre
    movie_genres = {}    # movie id -> genre names

    print("Retrieving genres for movies...")
    table = pd.read_csv(GENRE_DIR)
    id_column = table.iloc[:, 0].values
    genre_column = table.iloc[:, 2].values

    for movieid, genre_string in zip(id_column, genre_column):
        genres = genre_string.split("|")
        genre_data[movieid] = convert_genres(genres)
        movie_genres[movieid] = genres
        for genre in genres:
            genre_movieid.setdefault(genre, []).append(movieid)

    return genre_movieid, movie_genres, genre_data

def get_genres_mean(genres, genre_mean, genre_movieid):
    """Average the per-genre mean ratings over the given genres.

    Args:
        genres: sequence of genre names.
        genre_mean: mapping genre name -> [rating sum, rating count].
        genre_movieid: accepted for signature compatibility; not used here.

    Returns:
        The arithmetic mean of (sum / count) across `genres`.

    Raises:
        KeyError: if a genre has no entry in `genre_mean`.
        ZeroDivisionError: if `genres` is empty.
    """
    running = 0
    for name in genres:
        rating_sum, rating_count = genre_mean[name][0], genre_mean[name][1]
        running = running + rating_sum / rating_count
    return running / len(genres)
        
def convert_genres(genres):
    """Encode a list of genre names as a 19x1 multi-hot column vector.

    Slots 0-17 correspond to the named genres below, in order. Any name
    that is not in that list sets slot 18.

    Args:
        genres: sequence of genre name strings.

    Returns:
        numpy array of shape (19, 1) holding 0.0 / 1.0 flags.
    """
    known = (
        'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
        'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    )
    genre_vec = np.zeros((19, 1))
    for name in genres:
        # unrecognised names all share the final catch-all slot
        slot = known.index(name) if name in known else 18
        genre_vec[slot] = 1
    return genre_vec

In [3]:
def train_user_model(data, W, aux_data, eta, lam):
    """Apply one gradient step for a single rating and update rating statistics.

    Everything is updated in place; nothing is returned.

    Args:
        data: (userid, movieid, rating) triple.
        W: dict userid -> weight column vector; a zero vector of length
            ATTRIBUTE_SZ is created for users not seen before.
        aux_data: six-element sequence
            (movie_data, genre_movieid, ratings_mean, movie_genres,
            genre_mean, mean). Only movie_data, ratings_mean, movie_genres
            and genre_mean are used here.
        eta: step size.
        lam: L2 shrinkage coefficient applied to the weights.

    Raises:
        KeyError: if `movieid` has no entry in movie_genres.
    """
    userid, movieid, rating = data
    movie_data, genre_movieid, ratings_mean, movie_genres, genre_mean, mean = aux_data

    def _accumulate(stats, key):
        # stats[key] is a two-element array: [sum of ratings, number of ratings]
        if key in stats:
            stats[key][0] += rating
            stats[key][1] += 1
        else:
            stats[key] = np.array([rating, 1])

    # first time this user is seen: start from all-zero weights
    if userid not in W:
        W[userid] = np.zeros((ATTRIBUTE_SZ, 1))

    if movieid in movie_data:
        # unit-length feature vector for the movie
        raw_features = movie_data[movieid]
        phi = raw_features / np.linalg.norm(raw_features)
    else:
        # no genome scores: a zero feature vector leaves only the lam*w term
        phi = np.zeros((ATTRIBUTE_SZ, 1))

    w = W[userid]
    residual = rating - (w.T).dot(phi)
    W[userid] = w + eta * (residual * phi - lam * w)

    # running totals used later to compute per-movie and per-genre means
    _accumulate(ratings_mean, movieid)
    for genre in movie_genres[movieid]:
        _accumulate(genre_mean, genre)
            
def train_model(aux_data, eta, epochs, lam = 0):
    """Train per-user regression weights over the ratings in TRAIN_DIR.

    Args:
        aux_data: three-element sequence
            (movie_data, genre_movieid, movie_genres).
        eta: step size forwarded to train_user_model().
        epochs: number of passes over the training file.
        lam: L2 shrinkage coefficient forwarded to train_user_model().

    Returns:
        tuple: (W, aux_data) where W maps userid -> weight vector and
        aux_data is the six-element list
        [movie_data, genre_movieid, ratings_mean, movie_genres,
        genre_mean, mean] filled in during training.
    """
    movie_data, genre_movieid, movie_genres = aux_data

    W = {}             # userid -> regression weights
    ratings_mean = {}  # movieid -> [rating sum, rating count]
    genre_mean = {}    # genre -> [rating sum, rating count]

    samples = pd.read_csv(TRAIN_DIR).values
    n_samples = samples.shape[0]
    # mean of the third column (the rating) over the whole training file
    mean = np.mean(samples[:, 2])
    aux_data = [movie_data, genre_movieid, ratings_mean, movie_genres, genre_mean, mean]

    print("Training model:")
    epoch = 0
    while epoch < epochs:
        print("Epoch No: %d" % (epoch))
        for i, sample in enumerate(samples):
            train_user_model(sample, W, aux_data, eta, lam)
            if i % 400000 == 0:
                print("Progress: %.2f percent" % (i / n_samples * 100))
        epoch += 1

    return W, aux_data

In [4]:
def predict_rating(userid, movieid, W, aux_data):
    """Predict a rating for (userid, movieid), clamped to [0.5, 5.0].

    Strategy, in order:
      * user has no trained weights: blend the global mean with the genre
        mean (and the movie's own mean rating when it was seen in training);
      * user has weights but the movie has no feature vector: the same kind
        of blend with different coefficients;
      * otherwise: dot product of the user's weights with the movie's
        unit-normalised feature vector.

    Args:
        userid: user identifier (key into W).
        movieid: movie identifier.
        W: dict userid -> weight column vector.
        aux_data: six-element sequence
            (movie_data, genre_movieid, ratings_mean, movie_genres,
            genre_mean, mean) as returned by train_model().

    Returns:
        float: prediction rounded to one decimal place and clamped to
        the range [0.5, 5.0].

    Raises:
        KeyError: if `movieid` is missing from movie_genres.
    """
    movie_data, genre_movieid, ratings_mean, movie_genres, genre_mean, mean = aux_data
    genres = movie_genres[movieid]

    if userid not in W:
        # user never seen during training
        if movieid not in ratings_mean:
            # movie never rated in training: global mean + genre mean
            pred = (0.7*mean + 0.3*get_genres_mean(genres, genre_mean, genre_movieid))
        else:
            # movie rated in training: global mean + genre mean + movie mean
            pred = 0.3*mean + 0.1*get_genres_mean(genres, genre_mean, genre_movieid) + 0.6*ratings_mean[movieid][0]/ratings_mean[movieid][1]
    else:
        w = W[userid]
        if movieid not in movie_data:
            # no feature vector for this movie: fall back to blended means
            if movieid in ratings_mean:
                pred = 0.1*mean + 0.3*get_genres_mean(genres, genre_mean, genre_movieid) + 0.6*ratings_mean[movieid][0]/ratings_mean[movieid][1]
            else:
                pred = 0.2*mean + 0.8*get_genres_mean(genres, genre_mean, genre_movieid)
        else:
            phi = movie_data[movieid]
            phi = phi/np.linalg.norm(phi)
            # The product of column vectors is a 1x1 array. Passing that to
            # float() is deprecated in NumPy >= 1.25, so extract the scalar
            # explicitly.
            pred = (w.T).dot(phi).item()

    pred = round(float(pred), 1)

    if pred > 5.0:
        pred = 5.0
    elif pred < 0.5:
        pred = 0.5

    return pred

def predictions(W, aux_data):
    """Predict a rating for every row of TEST_DIR and write data/results.csv.

    The first two columns of each test row are passed to predict_rating()
    as (userid, movieid). The output file has an 'Id' column (0-based row
    number) and a 'Prediction' column.

    Args:
        W: dict userid -> weight vector.
        aux_data: auxiliary data list as returned by train_model().
    """
    print("Beginning test data evaluation...")
    rows = pd.read_csv(TEST_DIR).values
    n_rows = rows.shape[0]

    pred = np.zeros((n_rows, 1))
    for i, ids in enumerate(rows):
        pred[i] = predict_rating(ids[0], ids[1], W, aux_data)
        if i % 500000 == 0:
            print("Progress: %.2f percent" % (i / n_rows * 100))

    df = pd.DataFrame({'Id': np.arange(0, n_rows, 1)})
    df['Prediction'] = pred
    df.to_csv('data/results.csv', index=False)

def split_pred(W, aux_data):
    """Return the mean squared error of predictions on data/split_test.csv.

    Each row supplies (userid, movieid, rating) in its first three columns.

    Args:
        W: dict userid -> weight vector.
        aux_data: auxiliary data list as returned by train_model().

    Returns:
        Sum of squared (prediction - rating) differences divided by the
        number of rows.

    Raises:
        ZeroDivisionError: if the file contains no rows.
    """
    print("Beginning test data evaluation...")
    rows = pd.read_csv('data/split_test.csv').values
    n_rows = rows.shape[0]

    squared_error = 0
    for i, ids in enumerate(rows):
        residual = predict_rating(ids[0], ids[1], W, aux_data) - ids[2]
        squared_error += residual**2
        if i % 400000 == 0:
            print("Progress: %.2f percent" % (i / n_rows * 100))

    return squared_error / n_rows

In [None]:
def RSA_training(data, W, nvar, eta=0.5, lam=0, features=None):
    """Apply one gradient step using only the first `nvar` movie attributes.

    Updates W in place; nothing is returned.

    Args:
        data: (userid, movieid, rating) triple.
        W: dict userid -> weight column vector of shape (nvar, 1); a zero
            vector is created for users not seen before.
        nvar: number of leading attributes of each feature vector to use.
        eta: step size. Defaults to 0.5, the value the notebook passes to
            train_model().
        lam: L2 shrinkage coefficient. Defaults to 0, matching train_model().
        features: mapping movieid -> column vector of attribute scores.
            When omitted, the notebook-level `movie_data` table is used.
    """
    userid, movieid, rating = data
    if features is None:
        # fall back to the genome feature table built at notebook level
        features = movie_data

    # first time this user is seen: start from all-zero weights
    if userid not in W:
        W[userid] = np.zeros((nvar, 1))

    if movieid not in features:
        # no genome scores: a zero feature vector leaves only the lam*w term
        phi = np.zeros((nvar, 1))
    else:
        # Slice with 0:1 so phi stays a column of shape (nvar, 1). A 1-D phi
        # would broadcast against the (nvar, 1) weights into an
        # (nvar, nvar) matrix.
        phi = features[movieid][0:nvar, 0:1]
        phi = phi/np.linalg.norm(phi)

    w = W[userid]
    w = w + eta*((rating - (w.T).dot(phi))*phi - lam*w)
    W[userid] = w

def calculate_SSR(test_file, train_file):
    """Unfinished stub; the original cell held only the signature.

    NOTE(review): the name suggests a sum of squared residuals computed
    from the two files -- confirm the intended behaviour before implementing.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("calculate_SSR is not implemented yet")


def r_squared_analysis():
    """Unfinished stub; the original cell held only the signature.

    NOTE(review): presumably meant to drive an R-squared analysis on top of
    RSA_training / calculate_SSR -- confirm before implementing.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("r_squared_analysis is not implemented yet")

In [5]:
# Build the per-movie genome feature vectors and the genre lookup tables.
movie_data = process_genome_scores()
genre_movieid, movie_genres, genre_data = process_genres()

# train_model() takes the three lookup tables and returns the per-user
# weights together with an extended six-element aux_data list.
aux_data = [movie_data, genre_movieid, movie_genres]
W, aux_data = train_model(aux_data, eta=0.5, epochs=2)

Processing movie attribute data...
Movie data processed!
Retrieving genres for movies...
Training model:
Epoch No: 0
Progress: 0.00 percent
Progress: 7.60 percent
Progress: 15.20 percent
Progress: 22.79 percent
Progress: 30.39 percent
Progress: 37.99 percent
Progress: 45.59 percent
Progress: 53.19 percent
Progress: 60.79 percent
Progress: 68.38 percent
Progress: 75.98 percent
Progress: 83.58 percent
Progress: 91.18 percent
Progress: 98.78 percent
Epoch No: 1
Progress: 0.00 percent
Progress: 7.60 percent
Progress: 15.20 percent
Progress: 22.79 percent
Progress: 30.39 percent
Progress: 37.99 percent
Progress: 45.59 percent
Progress: 53.19 percent
Progress: 60.79 percent
Progress: 68.38 percent
Progress: 75.98 percent
Progress: 83.58 percent
Progress: 91.18 percent
Progress: 98.78 percent


In [7]:
# Write the test-set predictions to data/results.csv.
# To score a held-out split instead, uncomment:
#print("MSE = %f" % (split_pred(W, aux_data)))
predictions(W=W, aux_data=aux_data)

Beginning test data evaluation...
Progress: 0.00 percent
Progress: 21.69 percent
Progress: 43.38 percent
Progress: 65.08 percent
Progress: 86.77 percent
