In [1]:
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from scipy import sparse
from implicit.als import AlternatingLeastSquares
import numpy as np
from sklearn import metrics

In [2]:
# CSV file that holds the data set; a relative path, so it is resolved against the working directory
fileName = "jw_player_play_data.csv"

In [3]:
# Read the CSV into a DataFrame; header=0 means the first row of the file supplies the column names
table = pd.read_csv(fileName, header=0)

In [4]:
# Cast every categorical column to the pandas "category" dtype so that each
# distinct value gets a stable integer position in `.cat.categories`.
CATEGORICAL_COLUMNS = [
    'publisher_category',
    'device_type',
    'country_code',
    'operating_system',
    'browser',
    'publisher',
    'viewer_id',
    'media_id',
]
for column in CATEGORICAL_COLUMNS:
    table[column] = table[column].astype("category")


def _build_lookups(column):
    """Return (id -> category, category -> id) dictionaries for one categorical column of `table`."""
    id_to_value = dict(enumerate(table[column].cat.categories))
    # .items() works on both Python 2 and Python 3 (.iteritems() is Python 2 only)
    value_to_id = {value: idx for idx, value in id_to_value.items()}
    return id_to_value, value_to_id


# Store the id <-> category mapping as two different lookup tables per column.
# Inspiration from here:
# https://medium.com/towards-data-science/recommending-github-repositories-with-google-bigquery-and-the-implicit-library-e6cce666c77
id_pub_cat, pub_cat_ids = _build_lookups('publisher_category')
id_device_type, device_type_ids = _build_lookups('device_type')
id_country_code, country_code_ids = _build_lookups('country_code')
id_operating_system, operating_system_ids = _build_lookups('operating_system')
id_browser, browser_ids = _build_lookups('browser')
id_publisher, publisher_ids = _build_lookups('publisher')
id_media_id, media_id_ids = _build_lookups('media_id')
id_viewer_id, viewer_id_ids = _build_lookups('viewer_id')

# The continent data below is not actually used, as it turned out not to be an
# important feature. The idea was that there might be too much variability in
# the country codes, so the information was condensed down to continents.
# The continents are ordered by what I thought might be the most similar in taste.
country_code_to_continent = {}
# NOTE: the misspelled name is kept so that any existing reference keeps working;
# `continent_mapping` is a correctly spelled alias for the same dictionary.
cotinent_mapping = {
    "AF" : 0,
    "AS" : 1,
    "OC" : 2,
    "EU" : 3,
    "NA" : 4,
    "SA" : 5,
    "AN" : 6
}
continent_mapping = cotinent_mapping

# Each line of the file is read as comma-separated fields; the first field is
# stored as the key and the second as the value (presumably country code and
# continent code -- confirm against the file).
with open("country_codes_to_continent") as f:
    for line in f:
        line = line.strip()
        if not line:
            # a blank line has no second field and used to raise IndexError
            continue
        split = line.split(",")
        country_code_to_continent[split[0]] = split[1]

In [6]:
# Count-based features: the implicit collaborative-filtering package didn't work for me, so I tried to come up
# with features revolving around counts instead.

# As I couldn't get the collab filtering package to recommend clips (I didn't want the recommendations themselves,
# just the scores), the idea was to see how many of the clips similar to the one we need to predict user engagement
# for the user had already watched.
# Again, I got inspiration from here:
# https://medium.com/towards-data-science/recommending-github-repositories-with-google-bigquery-and-the-implicit-library-e6cce666c77
# Not used
def getSimilarClips(trainingDF):
    """Map every media id to the set of media ids the ALS model rates as similar.

    trainingDF must contain 'viewer_id' and 'media_id' columns. The number of
    rows per (viewer, media) pair is used as the play count, an item-by-viewer
    sparse matrix is built from those counts (rows: media codes, columns:
    viewer codes) and an AlternatingLeastSquares model is fitted on it.

    Returns a dict {media_id: set of up to 100 similar media_ids}, with one
    entry for every key of the module-level `media_id_ids` lookup.

    NOTE(review): matrix rows are indexed by this frame's own category codes
    while the lookups go through the module-level `media_id_ids` /
    `id_media_id` tables -- this assumes both use the same coding; confirm.
    """
    pair_counts = (trainingDF[['viewer_id', 'media_id']]
                   .groupby(['viewer_id', 'media_id'])
                   .size()
                   .reset_index())
    pair_counts.columns = ['viewer_id', 'media_id', 'counts']
    for key in ('viewer_id', 'media_id'):
        pair_counts[key] = pair_counts[key].astype("category")

    weights = pair_counts["counts"].astype("float")
    item_codes = pair_counts["media_id"].cat.codes.copy()
    viewer_codes = pair_counts["viewer_id"].cat.codes.copy()
    plays = sparse.csr_matrix((weights, (item_codes, viewer_codes)))

    collab_model = AlternatingLeastSquares(factors=100, iterations=50)
    # the play counts are scaled by 40 before fitting (presumably a confidence weight -- confirm)
    collab_model.fit(40 * plays)

    return {
        media_id: {id_media_id[neighbour]
                   for neighbour, _score in collab_model.similar_items(media_id_ids[media_id], N=100)}
        for media_id in media_id_ids
    }

# For each video a user has watched, get the number of times the user has watched it
# Not used
def getUserVideoCounts(trainingDF):
    """Count how many rows each viewer has for each video.

    trainingDF must contain 'viewer_id' and 'media_id' columns.
    Returns a nested dict {viewer_id: {media_id: row count}}.
    """
    pair_sizes = trainingDF[['viewer_id', 'media_id']].groupby(['viewer_id', 'media_id']).size()
    pair_counts = pair_sizes.reset_index()
    pair_counts.columns = ['viewer_id', 'media_id', 'counts']

    counts_by_user = {}
    for record in pair_counts.to_dict(orient='records'):
        # create the viewer's inner dict on first sight, then record the count
        per_video = counts_by_user.setdefault(record["viewer_id"], {})
        per_video[record["media_id"]] = record["counts"]

    return counts_by_user

# For each video, get the number of times it has been watched
# Used
def getVideoCounts(trainingDF):
    """Count the rows per video.

    trainingDF must contain 'viewer_id' and 'media_id' columns.
    Returns a dict {media_id: number of rows for that media_id}.
    """
    per_video = trainingDF[['viewer_id', 'media_id']].groupby(['media_id']).size().reset_index()
    per_video.columns = ['media_id', 'counts']

    return {record["media_id"]: record["counts"]
            for record in per_video.to_dict(orient='records')}

# For each user get the number of videos the user has watched
# Not used
def getUserCounts(trainingDF):
    """Count the rows per viewer.

    trainingDF must contain 'viewer_id' and 'media_id' columns.
    Returns a dict {viewer_id: number of rows for that viewer_id}.
    """
    per_user = trainingDF[['viewer_id', 'media_id']].groupby(['viewer_id']).size().reset_index()
    per_user.columns = ['viewer_id', 'counts']

    return {record["viewer_id"]: record["counts"]
            for record in per_user.to_dict(orient='records')}

In [8]:
# The idea here was to use some combinations of the mean, std and median watched_pct for each video, in order to help
# the model
# These turned out to be good
def getVideoPercetageStats(trainingDF):
    """Compute mean, standard deviation and median of watched_pct per video.

    trainingDF must contain 'media_id' and 'watched_pct' columns.
    Returns three dicts keyed by media_id, in the order (average, std, median).
    A statistic that is NaN (e.g. the std of a single observation) or falsy
    is stored as the integer 0.
    """
    grouped = trainingDF[["media_id", "watched_pct"]].groupby(["media_id"])

    def to_lookup(aggregated, label):
        # flatten one aggregated frame into {media_id: value}, replacing NaN / falsy values by 0
        frame = aggregated.reset_index()
        frame.columns = ['media_id', label]
        lookup = {}
        for record in frame.to_dict(orient='records'):
            value = record[label]
            usable = bool(value) and str(value) != 'nan'
            lookup[record['media_id']] = value if usable else 0
        return lookup

    return (to_lookup(grouped.mean(), 'avg'),
            to_lookup(grouped.std(), 'std'),
            to_lookup(grouped.median(), 'median'))

In [9]:
# The idea here was to use some combinations of the mean, std and median watched_pct for each user, in order to help
# the model
# These turned out to be good
def getUserPercetageStats(trainingDF):
    """Compute mean, standard deviation and median of watched_pct per viewer.

    The statistics are computed from `trainingDF` itself. (The previous
    version ignored this argument and read the global `table_train`, so it
    failed or silently used stale data when called outside the training loop.)

    trainingDF must contain 'viewer_id' and 'watched_pct' columns.
    Returns three dicts keyed by viewer_id, in the order (average, std, median).
    A statistic that is NaN (e.g. the std of a single observation) or zero
    is stored as the integer 0.
    """
    user_percents = trainingDF[["viewer_id", "watched_pct"]]
    grouped = user_percents.groupby(["viewer_id"])

    def _stat_to_dict(aggregated, label):
        # flatten one aggregated frame into {viewer_id: value}, replacing NaN / zero by 0
        frame = aggregated.reset_index()
        frame.columns = ['viewer_id', label]
        cleaned = {}
        for row in frame.to_dict(orient='records'):
            value = row[label]
            if pd.isnull(value) or not value:
                cleaned[row['viewer_id']] = 0
            else:
                cleaned[row['viewer_id']] = value
        return cleaned

    user_percents_average_dict = _stat_to_dict(grouped.mean(), 'avg')
    user_percents_std_dict = _stat_to_dict(grouped.std(), 'std')
    user_percents_median_dict = _stat_to_dict(grouped.median(), 'median')

    return user_percents_average_dict, user_percents_std_dict, user_percents_median_dict

In [16]:
# Function that actually creates the features to be used in the model to predict watched_pct
def createFeature(row, video_counts_dic, video_percents_average_dict, video_percents_std_dict,
                  user_percents_average_dict, user_percents_std_dict):
    """Build the feature vector used to predict watched_pct for one row.

    row is a dict with at least the keys 'country_code', 'publisher',
    'media_id' and 'viewer_id'. The remaining arguments are lookup dicts
    keyed by media_id (video_*) or viewer_id (user_*).

    Returns a list of 7 values, in this order:
      0. integer id of the country code (-1 if unknown)
      1. integer id of the publisher (-1 if unknown)
      2. number of times the video was watched (0 if unknown)
      3. coefficient of variation (std / mean) of watched_pct for the viewer
      4. coefficient of variation (std / mean) of watched_pct for the video
      5. integer id of the video (-1 if unknown)
      6. integer id of the viewer (-1 if unknown)

    Every lookup falls back to a default instead of raising KeyError, so a
    viewer or video that is missing from the statistics dicts (for example one
    that only occurs in the test split) still yields a feature vector.
    """
    viewer = row["viewer_id"]
    video = row["media_id"]

    feature = []
#     the country code turned out to be okay
    feature.append(country_code_ids.get(row["country_code"], -1))
#     the publisher info
    feature.append(publisher_ids.get(row["publisher"], -1))
#     the number of times the video had been watched
    feature.append(video_counts_dic.get(video, 0))

#     coefficient of variation of watched_pct for the user
    user_avg = user_percents_average_dict.get(viewer, 0)
    if user_avg > 0:
        feature.append(user_percents_std_dict.get(viewer, 0) / user_avg)
    else:
        # no usable mean: avoid dividing by zero
        feature.append(0)

#     coefficient of variation of watched_pct for the video
    video_avg = video_percents_average_dict.get(video, 0)
    if video_avg > 0:
        feature.append(video_percents_std_dict.get(video, 0) / video_avg)
    else:
        # no usable mean: avoid dividing by zero
        feature.append(0)

#     the actual video id
    feature.append(media_id_ids.get(video, -1))

#     the actual viewer id
    feature.append(viewer_id_ids.get(viewer, -1))

    return feature

In [17]:
results = []

# Seed NumPy's global random generator once so that the random splits below are
# reproducible from run to run. scikit-learn estimators created without an explicit
# random_state draw from the same global generator, so the forests become
# reproducible as well.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Repeat a random split / fit / score cycle 10 times and collect the test scores
for i in range(10):
#     each row independently lands in the training set with probability 0.8
    msk = np.random.rand(len(table)) < 0.8
    table_train = table[msk]
    table_test = table[~msk]

#     only using the training data here
    video_counts_dic = getVideoCounts(table_train)

    video_percents_average_dict, video_percents_std_dict, video_percents_median_dict = getVideoPercetageStats(table_train)

    user_percents_average_dict, user_percents_std_dict, user_percents_median_dict = getUserPercetageStats(table_train)

    X_train = []
    y_train = []
    for row in table_train.to_dict(orient='records'):
        X_train.append(createFeature(row, video_counts_dic, video_percents_average_dict, video_percents_std_dict,
                                    user_percents_average_dict, user_percents_std_dict))
        y_train.append(row["watched_pct"])

#     the test features reuse the aggregates computed from the training rows
    X_test = []
    y_test = []
    for row in table_test.to_dict(orient='records'):
        X_test.append(createFeature(row, video_counts_dic, video_percents_average_dict, video_percents_std_dict,
                                    user_percents_average_dict, user_percents_std_dict))
        y_test.append(row["watched_pct"])

#     I tried regular regression, SVR, AdaBoostRegressor and RandomForestRegressor
#     RandomForestRegressor turned out to be the best
    model = RandomForestRegressor(n_estimators = 50)
    model.fit(X_train, y_train)
#     score() is evaluated on the held-out rows
    results.append(model.score(X_test, y_test))
#     progress indicator; the call form prints the same on Python 2 and Python 3
    print(i)

0
1
2
3
4
5
6
7
8
9


In [18]:
# Report the mean of the collected scores; the call form of print produces the
# same output on Python 2 and Python 3.
print("R^2 for the model is: ")
print(sum(results) / float(len(results)))

R^2 for the model is: 
0.192897994484
