In [1]:
__author__ = 'MegEllis'

import json
import collections
import numpy as np
import math

import os
import sys

os.environ['SPARK_HOME']= "/Users/MegEllis/Desktop/spark-1.6.0-bin-hadoop2.6_2"

sys.path.append("/Users/MegEllis/Desktop/spark-1.6.0-bin-hadoop2.6_2/python/")

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SQLContext, Row
from pyspark.sql import functions
conf = (SparkConf().setMaster("local").setAppName("My app").set("spark.executor.memory", "1g"))
sc = SparkContext(conf = conf)
sqlContext = SQLContext(sc)

we create a default dictionary that maps users to each game they've played and for how long over their lifetime

In [2]:
users_open = open('/Users/MegEllis/Desktop/aml_proj/userData2.txt', 'r+')


def nest_dict(filename):
    listed_dict = []
    for line in filename:
        json_lines = json.loads(line)
        rec_dict = collections.defaultdict(dict)
        user = json_lines['user']
        in_response = json_lines['ownedGames']['response']
        if 'games' in in_response:
            for i in in_response['games']:
                rec_dict[user][i['name']] = i['playtime_forever']
            listed_dict.append(rec_dict)
    return listed_dict


final_list = nest_dict(users_open)

we then transform this list of nested dictionaries into an RDD to more efficiently and quickly format the data and find relevent information. For example, we need to standardize the playtime for a given time since some games can only be played for a certain amount of time. 

In [3]:
final_RDD = sc.parallelize(final_list)

in order to run statisical analysis on the data, it must be placed in a tuple which is then transformed into an RDD

In [4]:
tup_list = []
for i in range(len(final_list)):
    tup_list.extend(final_list[i].values()[0].items())

In [5]:
time_count_RDD = sc.parallelize(tup_list)

From this tuple we can calculate the average number of hours played per game

In [6]:
tottime_per_game = time_count_RDD.filter(lambda x: x[1] != 0).mapValues(lambda x: (x, 1))\
                .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))

In [7]:
avg_per_game = tottime_per_game.map(lambda x: (x[0], float(x[1][0])/float(x[1][1])))

In [8]:
full_list = []
for i in range(len(final_list)):
    user = final_list[i].keys()[0]
    tup_list = final_list[i].values()[0].items()
    for j in range(len(tup_list)):
        full_list.extend([(tup_list[j][0], user, tup_list[j][1])])

        
full_list_RDD = sc.parallelize(full_list).filter(lambda x: x[2] != 0)

Standardizing: Now that we have the average playtime for each game, we get the proportion of the number of hours played per player for a given game over the game's average. To do this, we use spark sql to join the table that contains the information of users and and their playtime to the table that contains information of each game and its average playtime. 

In [9]:
users = full_list_RDD.map(lambda x: Row(game = x[0], user = x[1], playtime = x[2]))

In [10]:
Users = sqlContext.createDataFrame(users)
Users.registerTempTable("users")

In [11]:
avg_times = avg_per_game.map(lambda x: Row(game = x[0], avg_time = x[1]))

In [12]:
AvgTimes = sqlContext.createDataFrame(avg_times)
AvgTimes.registerTempTable("AvgTimes")

In [13]:
jointest=sqlContext.sql('''SELECT users.game, users.user, users.playtime, AvgTimes.avg_time
                    FROM users LEFT JOIN AvgTimes
                    ON users.game = AvgTimes.game
                    ''')

Make this dataframe that resulted from the join into an RDD to easily extract info

In [14]:
useful = jointest.select(jointest.game, jointest.user, functions.log(jointest.playtime), \
                         functions.log(jointest.avg_time))

In [15]:
full_rdd = useful.rdd

In [16]:
to_get_prop = full_rdd.map(lambda x: (x[0], x[1], x[2], x[3]))

In [17]:
get_prop = to_get_prop.map(lambda x: (x[0], (x[1], (x[2] + 1)/(x[3] +1))))

In [18]:
exit_dict = get_prop.collect()

recreate the default dictionary so that we can perform the similarity, rating, and overall RMSE and mean error functions

In [19]:
user_test = collections.defaultdict(dict)
game_test = collections.defaultdict(dict)
user_key = collections.defaultdict(dict)
game_key = collections.defaultdict(dict)
n = 0
for i in range(len(exit_dict)):
    if n % 100 == 0:
        user_test[exit_dict[i][1][0]][exit_dict[i][0]] = exit_dict[i][1][1]
        game_test[exit_dict[i][0]][exit_dict[i][1][0]] = exit_dict[i][1][1]
    else:
        user_key[exit_dict[i][1][0]][exit_dict[i][0]] = exit_dict[i][1][1]
        game_key[exit_dict[i][0]][exit_dict[i][1][0]] = exit_dict[i][1][1]
    
    n += 1

going to test both splitting into evently sized bins and just testing the ratios on their own.
First, make sure we can attain similarity and rating for a certain number of users - treated as the test 

In [20]:
all_ratings = []
for u in user_key:
    all_ratings.extend(user_key[u].values())
all_ratings.sort()

median = np.median(all_ratings)

In [21]:
def similarity(i, j, dicto):
    i_rating_avg = np.mean(dicto[i].values())
    j_rating_avg = np.mean(dicto[j].values())
    k = list(set(dicto[i].keys()) & set(dicto[j].keys()))
    if k == []:
        similar = 1
        exit
    else:
        num = 0
        denom1 = 0
        denom2 = 0
        for t in range(len(k)):
            num = num + ((dicto[i][k[t]] - i_rating_avg)) * ((dicto[j][k[t]] - j_rating_avg))
            denom1 = denom1 + (((dicto[i][k[t]] - i_rating_avg)**2))
            denom2 = denom2 + ((dicto[j][k[t]] - j_rating_avg)**2)
        denom = (denom1 * denom2)
        denom_sqrt = math.sqrt(denom)
        similar = (num+1)/(denom_sqrt + 1)
    return similar

In [22]:
def get_rating(user, game, game_key, user_key):
    user_avg_rating = np.mean(user_key[user].values())
    game_user_list = game_key[game].keys()
    term_1 = 0
    term_2 = 0
    if len(game_user_list) > 0:
        for m in game_user_list:
            sim = similarity(user, m, user_key)
            term_1 = term_1 + abs(sim)
            term_2 = term_2 + (sim * ((game_key[game][m]) - np.mean(user_key[m].values())))
    rating = user_avg_rating + ((1/float(term_1)) * term_2)     
    return rating

For missing data

In [23]:
def missing_user(game):
    rating = np.median(game_key[game].values())
    return rating 

def missing_game(user):
    rating = np.median(user_key[user].values())
    return rating


TRAINING

In [24]:
user_list = user_key.keys()

In [25]:
# diffies = []
# diffies2 = []
# for u in user_list:
#     for g in user_key[u]:
#         actual = user_key[u][g]
#         new = get_rating(u, g, game_key, user_key)
#         diffies.append(abs(new - actual))
#         diffies2.append((new - actual)**2)        

In [26]:
# mean_abs_error = np.mean(diffies)
# mean_abs_error

In [27]:
# RMSD = math.sqrt(sum(diffies2)/len(diffies2))
# RMSD

TEST

In [28]:
user_test_list = user_test.keys()

In [29]:
diffies = []
diffies2 = []
for u in user_test:
    for g in user_test[u]:
        if u in user_key.keys():
            if g in game_key.keys():
                new = get_rating(u, g, game_key, user_key)
            elif g not in game_key.keys():
                new = missing_game(u)
        elif u not in user_key.keys():
            if g in game_key.keys():
                new = missing_user(g)
            else:
                new = median
        actual = user_test[u][g]
        diffies.append(abs(new - actual))
        diffies2.append((new - actual)**2) 

In [30]:
mean_abs_error = np.mean(diffies)
mean_abs_error

0.1785941273501544

In [31]:
RMSD = math.sqrt(sum(diffies2)/len(diffies2))
RMSD

0.2353086214019304

INTEGER RATING METHOD

In [32]:
useful_int = jointest.select(jointest.game, jointest.user, jointest.playtime, jointest.avg_time)

In [33]:
full_rdd_int = useful_int.rdd

In [34]:
int_last_rdd = full_rdd.map(lambda x: (x[0], x[1], (x[2]+1)/(x[3]+1)))

In [35]:
int_exit_dict = int_last_rdd.collect()

In [36]:
user_test = collections.defaultdict(dict)
game_test = collections.defaultdict(dict)
user_key = collections.defaultdict(dict)
game_key = collections.defaultdict(dict)
n = 0
for i in range(len(int_exit_dict)):
    if n % 100 == 0:
        user_test[int_exit_dict[i][1]][int_exit_dict[i][0]] = int_exit_dict[i][2]
        game_test[int_exit_dict[i][0]][int_exit_dict[i][1]] = int_exit_dict[i][2]
    else:
        user_key[int_exit_dict[i][1]][int_exit_dict[i][0]] = int_exit_dict[i][2]
        game_key[int_exit_dict[i][0]][int_exit_dict[i][1]] = int_exit_dict[i][2]
    
    n += 1

In [37]:
all_ratings = []
for u in user_key:
    all_ratings.extend(user_key[u].values())
all_ratings.sort()


In [38]:
int_dict = collections.defaultdict(dict)
conv = np.percentile(all_ratings, [20, 40, 60, 80, 100])
for u in user_key:
    for g in user_key[u]:
        rating = user_key[u][g]
        if rating < conv[0]:
            num = 1
        elif rating < conv[1]:
            num = 2
        elif rating < conv[2]:
            num = 3
        elif rating < conv[3]:
            num = 4
        elif rating < conv[4]:
            num = 5

        int_dict[u][g] = num

In [39]:
all_ratings = []
for u in int_dict:
    all_ratings.extend(int_dict[u].values())
all_ratings.sort()

median = np.median(all_ratings)
median

3.0

In [40]:
diffies = []
diffies2 = []
for u in user_test:
    for g in user_test[u]:
        if u in user_key.keys():
            if g in game_key.keys():
                new = get_rating(u, g, game_key, int_dict)
            elif g not in game_key.keys():
                new = missing_game(u)
        elif u not in user_key.keys():
            if g in game_key.keys():
                new = missing_user(g)
            else:
                new = median
        actual = user_test[u][g]
        diffies.append(abs(new - actual))
        diffies2.append((new - actual)**2) 

In [41]:
mean_abs_error = np.mean(diffies)
mean_abs_error

0.92737725287382533

In [42]:
RMSD = math.sqrt(sum(diffies2)/len(diffies2))
RMSD

1.1173056924795912

WITH INTEGER AND LOG

In [43]:
useful_int = jointest.select(jointest.game, jointest.user, functions.log(jointest.playtime), \
                             functions.log(jointest.avg_time))

In [44]:
full_rdd_int = useful_int.rdd

In [45]:
int_last_rdd = full_rdd.map(lambda x: (x[0], x[1], (x[2]+1)/(x[3]+1)))

In [46]:
int_log_exit_dict = int_last_rdd.collect()

In [47]:
user_test = collections.defaultdict(dict)
game_test = collections.defaultdict(dict)
user_key = collections.defaultdict(dict)
game_key = collections.defaultdict(dict)
n = 0
for i in range(len(int_log_exit_dict)):
    if n % 100 == 0:
        user_test[int_log_exit_dict[i][1]][int_log_exit_dict[i][0]] = int_log_exit_dict[i][2]
        game_test[int_log_exit_dict[i][0]][int_log_exit_dict[i][1]] = int_log_exit_dict[i][2]
    else:
        user_key[int_log_exit_dict[i][1]][int_log_exit_dict[i][0]] = int_log_exit_dict[i][2]
        game_key[int_log_exit_dict[i][0]][int_log_exit_dict[i][1]] = int_log_exit_dict[i][2]
    
    n += 1

In [48]:
all_ratings = []
for u in user_key:
    all_ratings.extend(user_key[u].values())
all_ratings.sort()



In [49]:
int_log_dict = collections.defaultdict(dict)
conv = np.percentile(all_ratings, [20, 40, 60, 80, 100])
for u in user_key:
    for g in user_key[u]:
        rating = user_key[u][g]
        if rating < conv[0]:
            num = 1
        elif rating < conv[1]:
            num = 2
        elif rating < conv[2]:
            num = 3
        elif rating < conv[3]:
            num = 4
        else:
            num = 5
        int_log_dict[u][g] = num

In [50]:
all_ratings = []
for u in int_log_dict:
    all_ratings.extend(int_log_dict[u].values())
all_ratings.sort()

median = np.median(all_ratings)

In [51]:
diffies = []
diffies2 = []
for u in user_test:
    for g in user_test[u]:
        if u in user_key.keys():
            if g in game_key.keys():
                new = get_rating(u, g, game_key, int_log_dict)
            elif g not in game_key.keys():
                new = missing_game(u)
        elif u not in user_key.keys():
            if g in game_key.keys():
                new = missing_user(g)
            else:
                new = median
        actual = user_test[u][g]
        diffies.append(abs(new - actual))
        diffies2.append((new - actual)**2) 

In [52]:
mean_abs_error = np.mean(diffies)
mean_abs_error

0.92737567689880651

In [53]:
RMSD = math.sqrt(sum(diffies2)/len(diffies2))
RMSD

1.1173036738329407

GETTING THE RECOS

In [54]:
temp_user_key = collections.defaultdict(dict)
game_key = collections.defaultdict(dict)

for i in range(len(int_exit_dict)):
    temp_user_key[int_exit_dict[i][1]][int_exit_dict[i][0]] = int_exit_dict[i][2]
    game_key[int_exit_dict[i][0]][int_exit_dict[i][1]] = int_exit_dict[i][2]

In [55]:
all_ratings = []
for u in temp_user_key:
    all_ratings.extend(temp_user_key[u].values())
all_ratings.sort()

median = np.median(all_ratings)

In [56]:
user_key = collections.defaultdict(dict)
game_key = collections.defaultdict(dict)
conv = np.percentile(all_ratings, [20, 40, 60, 80, 100])
for u in temp_user_key:
    for g in temp_user_key[u]:
        rating = temp_user_key[u][g]
        if rating < conv[0]:
            num = 1
        elif rating < conv[1]:
            num = 2
        elif rating < conv[2]:
            num = 3
        elif rating < conv[3]:
            num = 4
        else:
            num = 5
 
        user_key[u][g] = num
        game_key[g][u] = num

In [57]:
def get_recs(user):
    new_game_rates = dict()
    users_games = user_key[user].keys()
    copy_game = game_key.keys()[:]
    for i in copy_game:
        if i in users_games:
            copy_game.remove(i)
    for g in copy_game:
        rate = get_rating(user, g, game_key, user_key)
        new_game_rates[g] = rate
    sorted_rates = sorted(new_game_rates.items(), key=lambda x:x[1], reverse = True)
    recommendations = sorted_t[:40]
    games = [i[0] for i in recommendations]
    return games