In [1]:
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from scipy import sparse
from implicit.als import AlternatingLeastSquares
import numpy as np
from sklearn import metrics
import ipdb

In [2]:
# Path of the CSV file that is passed to pd.read_csv in the following cell.
fileName = "jw_player_play_data.csv"

In [3]:
# Read the CSV into a DataFrame; header=0 takes the column names from the file's first row.
table = pd.read_csv(fileName, header=0)

In [57]:
# Convert every categorical variable to pandas' "category" dtype so that each
# distinct value is backed by an integer code.
CATEGORICAL_COLUMNS = [
    'publisher_category',
    'device_type',
    'country_code',
    'operating_system',
    'browser',
    'publisher',
    'viewer_id',
    'media_id',
]
for _column in CATEGORICAL_COLUMNS:
    table[_column] = table[_column].astype("category")


def _buildLookups(column):
    """Return two dicts for a categorical column of `table`.

    The first maps integer position -> category value, the second is its
    inverse (category value -> integer position).
    """
    id_to_value = dict(enumerate(table[column].cat.categories))
    # .items() (rather than the Python 2-only .iteritems()) runs on Python 2 and 3.
    value_to_id = {value: code for code, value in id_to_value.items()}
    return id_to_value, value_to_id


# Store the id <-> category mappings as two lookup tables per column.
# Inspiration from here:
# https://medium.com/towards-data-science/recommending-github-repositories-with-google-bigquery-and-the-implicit-library-e6cce666c77
id_pub_cat, pub_cat_ids = _buildLookups('publisher_category')
id_device_type, device_type_ids = _buildLookups('device_type')
id_country_code, country_code_ids = _buildLookups('country_code')
id_operating_system, operating_system_ids = _buildLookups('operating_system')
id_browser, browser_ids = _buildLookups('browser')
id_publisher, publisher_ids = _buildLookups('publisher')
id_media_id, media_id_ids = _buildLookups('media_id')
id_viewer_id, viewer_id_ids = _buildLookups('viewer_id')

# The idea was to combat the variability in the country codes by condensing
# them down to continents. The continents are ordered by what I thought might
# be the most similar in taste.
country_code_to_continent = {}
# NOTE: the misspelled name is kept because other cells reference it.
cotinent_mapping = {
    "AF" : 0,
    "AS" : 1,
    "OC" : 2,
    "EU" : 3,
    "NA" : 4,
    "SA" : 5,
    "AN" : 6
}
# Each non-empty line is read as "<country code>,<continent code>[,...]".
with open("country_codes_to_continent") as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line carries no mapping; skip it
            # instead of failing on the missing second field.
            continue
        split = line.split(",")
        country_code_to_continent[split[0]] = split[1]




In [6]:
# Count-based features: the implicit collaborative-filtering package didn't work for me, so I tried to come up with
# features revolving around counts instead.

# I couldn't get the collaborative-filtering package to score specific clips (I didn't want the recommendations,
# just the scores), so the idea was to see how many of the clips similar to the one whose user engagement we need
# to predict the user had already watched.
# Again, I got the inspiration from here:
# https://medium.com/towards-data-science/recommending-github-repositories-with-google-bigquery-and-the-implicit-library-e6cce666c77
def getSimilarClips(trainingDF, factors=100, iterations=50, confidence=40, n_similar=100):
    """Fit an ALS model on play counts and collect similar clips for every media id.

    Parameters
    ----------
    trainingDF : DataFrame with 'viewer_id' and 'media_id' columns.
        NOTE(review): the category codes of these columns are used as matrix
        indices and are then looked up through the notebook-level
        `media_id_ids` / `id_media_id` tables, so the columns are presumably
        expected to carry the same categories as `table` -- confirm.
    factors, iterations : passed to AlternatingLeastSquares.
    confidence : scalar the play-count matrix is multiplied by before fitting.
    n_similar : value passed as `N` to `similar_items`.

    Returns
    -------
    dict mapping each key of `media_id_ids` to the set of media ids returned
    by the model's `similar_items` call for it.
    """
    user_video = trainingDF[['viewer_id', 'media_id']]
    user_video_counts = user_video.groupby(['viewer_id', 'media_id']).size().reset_index()
    user_video_counts.columns = ['viewer_id', 'media_id', 'counts']
    user_video_counts['viewer_id'] = user_video_counts['viewer_id'].astype("category")
    user_video_counts['media_id'] = user_video_counts['media_id'].astype("category")

    media = user_video_counts["media_id"].cat
    viewers = user_video_counts["viewer_id"].cat

    # Rows are media codes, columns are viewer codes. The shape is given
    # explicitly so that every category has a row/column even when the
    # highest codes do not occur in this training split; otherwise the matrix
    # would be sized by the largest code present and later lookups by code
    # could fall outside it.
    plays = sparse.csr_matrix(
        (user_video_counts["counts"].astype("float"),
         (media.codes.copy(), viewers.codes.copy())),
        shape=(len(media.categories), len(viewers.categories)))

    collab_model = AlternatingLeastSquares(factors=factors, iterations=iterations)
    collab_model.fit(confidence * plays)

    similar_items = {}
    for media_id, item_code in media_id_ids.items():
        neighbours = collab_model.similar_items(item_code, N=n_similar)
        similar_items[media_id] = set(id_media_id[v] for v, s in neighbours)

    return similar_items

# For each video a user has watched, get the number of times the user has watched it
def getUserVideoCounts(trainingDF):
    """Count the rows of `trainingDF` for every (viewer_id, media_id) pair.

    Returns a nested dict of the form {viewer_id: {media_id: count}}.
    """
    pair_counts = (trainingDF[['viewer_id', 'media_id']]
                   .groupby(['viewer_id', 'media_id'])
                   .size()
                   .reset_index(name='counts'))

    counts_by_user = {}
    for record in pair_counts.to_dict(orient='records'):
        # setdefault creates the per-viewer dict the first time a viewer is seen.
        per_video = counts_by_user.setdefault(record["viewer_id"], {})
        per_video[record["media_id"]] = record["counts"]

    return counts_by_user

# For each video, get the number of times it has been watched
def getVideoCounts(trainingDF):
    """Return {media_id: number of rows of `trainingDF` with that media_id}."""
    per_video = (trainingDF[['viewer_id', 'media_id']]
                 .groupby(['media_id'])
                 .size()
                 .reset_index(name='counts'))

    return {record["media_id"]: record["counts"]
            for record in per_video.to_dict(orient='records')}

# For each user get the number of videos the user has watched
def getUserCounts(trainingDF):
    """Return {viewer_id: number of rows of `trainingDF` with that viewer_id}."""
    per_user = (trainingDF[['viewer_id', 'media_id']]
                .groupby(['viewer_id'])
                .size()
                .reset_index(name='counts'))

    return {record["viewer_id"]: record["counts"]
            for record in per_user.to_dict(orient='records')}

In [42]:
# The idea here was to use some combinations of the mean, std and median watched_pct for each video, in order to help
# the model
# Turned out that std was the most useful
def getVideoPercetageStats(trainingDF):
    """Return {media_id: standard deviation of watched_pct over that video's rows}.

    A standard deviation that is zero or NaN is stored as 0.
    """
    std_by_video = (trainingDF[["media_id", "watched_pct"]]
                    .groupby(["media_id"])
                    .std()
                    .reset_index())
    std_by_video.columns = ['media_id', 'std']

    result = {}
    for record in std_by_video.to_dict(orient='records'):
        spread = record['std']
        # Falsy (zero) and NaN values both collapse to 0.
        unusable = (not spread) or str(spread) == 'nan'
        result[record['media_id']] = 0 if unusable else spread

    return result

In [43]:
# The idea here was to use some combinations of the mean, std and median watched_pct for each user, in order to help
# the model
# Turned out that std was the most useful
def getUserPercetageStats(trainingDF):
    """Return {viewer_id: standard deviation of watched_pct over that viewer's rows}.

    The statistics are computed from the `trainingDF` argument (not from any
    notebook-level frame). A NaN standard deviation is stored as 0.
    """
    user_percents = trainingDF[["viewer_id", "watched_pct"]]

    user_percents_std = user_percents.groupby(["viewer_id"]).std().reset_index()
    user_percents_std.columns = ['viewer_id', 'std']

    user_percents_std_dict = {}
    for row in user_percents_std.to_dict(orient='records'):
        std = row['std']
        # pd.isnull replaces the string comparison against 'nan'.
        user_percents_std_dict[row['viewer_id']] = 0 if pd.isnull(std) else std

    return user_percents_std_dict

In [59]:
def intersectingWatchHistoryWithSimilarItems(user_id, media_id, user_video_counts_dic, similar_items):
    """Fraction of the clips similar to `media_id` that `user_id` has watched.

    Parameters
    ----------
    user_id, media_id : keys looked up in the two dicts below.
    user_video_counts_dic : {viewer_id: {media_id: count}}; only the inner
        keys are used, as the viewer's watch history.
    similar_items : {media_id: set of similar media ids}.

    Returns
    -------
    len(similar & history) / len(similar) as a float, or 0 when the user or
    the clip is unknown, the user's history is empty, or the clip has no
    similar items (so the result is always a number, never None, and an empty
    similar set does not cause a division by zero).
    """
    if user_id not in user_video_counts_dic or media_id not in similar_items:
        return 0

    neighbours = similar_items[media_id]
    user_history = set(user_video_counts_dic[user_id].keys())
    if not user_history or not neighbours:
        return 0

    return len(neighbours.intersection(user_history)) / float(len(neighbours))

In [63]:
# Function that actually creates the features to be used in the model to predict watched_pct
def createFeature(row, user_video_counts_dic, video_counts_dic, similar_items,
                  video_percents_std_dict, user_percents_std_dict, user_counts_dic):
    """Build the feature vector used to predict watched_pct for one play.

    Parameters
    ----------
    row : dict-like record with the keys read below ("ad_skips", "browser",
        "country_code", "device_type", "media_id", "viewer_id",
        "operating_system", "play_seq", "publisher", "publisher_category",
        "start_time").
    user_video_counts_dic : {viewer_id: {media_id: count}}.
    video_counts_dic : {media_id: count}.
    similar_items : {media_id: set of similar media ids}.
    video_percents_std_dict : {media_id: std of watched_pct}.
    user_percents_std_dict : {viewer_id: std of watched_pct}.
    user_counts_dic : {viewer_id: count}.

    Returns
    -------
    list of 18 values, in the order they are listed in the body.

    Categorical values missing from the notebook-level lookup tables are
    encoded as -1. Counts and standard deviations missing from the supplied
    dicts (e.g. a viewer or video that did not occur in the data the dicts
    were built from) default to 0, consistently for all of them.

    Raises
    ------
    KeyError if the row's media_id / viewer_id is not in `media_id_ids` /
    `viewer_id_ids`, or if a country maps to a continent code that is not in
    `cotinent_mapping`.
    ValueError if "start_time" does not match '%Y-%m-%d %H:%M:%S'.
    """
    viewer = row["viewer_id"]
    media = row["media_id"]
    country = row["country_code"]

    # Time from epoch in seconds... not the best feature, as start_time is
    # parsed without any timezone information.
    started = datetime.strptime(row["start_time"], '%Y-%m-%d %H:%M:%S')
    epoch_seconds = (started - datetime(1970, 1, 1)).total_seconds()

    # Continent label, or -1 for a country without a continent entry.
    if country in country_code_to_continent:
        continent = cotinent_mapping[country_code_to_continent[country]]
    else:
        continent = -1

    return [
        row["ad_skips"],
        browser_ids.get(row["browser"], -1),
        country_code_ids.get(country, -1),
        device_type_ids.get(row["device_type"], -1),
        media_id_ids[media],
        viewer_id_ids[viewer],
        operating_system_ids.get(row["operating_system"], -1),
        row["play_seq"],
        publisher_ids.get(row["publisher"], -1),
        pub_cat_ids.get(row["publisher_category"], -1),
        epoch_seconds,
        # number of times the viewer watched this video
        user_video_counts_dic.get(viewer, {}).get(media, 0),
        # number of times the video has been watched
        video_counts_dic.get(media, 0),
        continent,
        # fraction of similar items that the user has watched
        intersectingWatchHistoryWithSimilarItems(viewer, media, user_video_counts_dic,
                                                 similar_items),
        # std of the user's watched_pct
        user_percents_std_dict.get(viewer, 0),
        # std of the video's watched_pct
        video_percents_std_dict.get(media, 0),
        # number of plays by the user
        user_counts_dic.get(viewer, 0),
    ]
        

In [64]:
# Experiment settings, hoisted so they can be tuned in one place.
N_RUNS = 10            # number of random train/test splits to average over
TRAIN_FRACTION = 0.8   # probability that a row is assigned to the training set
N_TREES = 50           # n_estimators of each random forest


def _buildDataset(frame, user_video_counts_dic, video_counts_dic, similar_clips,
                  video_percents_std_dict, user_percents_std_dict, user_counts_dic):
    """Turn every row of `frame` into a (feature vector, watched_pct) pair.

    Returns two parallel lists: the feature vectors and the targets.
    """
    X = []
    y = []
    for row in frame.to_dict(orient='records'):
        X.append(createFeature(row, user_video_counts_dic, video_counts_dic, similar_clips,
                               video_percents_std_dict, user_percents_std_dict, user_counts_dic))
        y.append(row["watched_pct"])
    return X, y


rSquared = []
adjustedRSquared = []
for i in range(N_RUNS):
    # Random row-wise split; no seed is set in this cell, so the split (and
    # therefore the scores) differ between executions.
    msk = np.random.rand(len(table)) < TRAIN_FRACTION
    table_train = table[msk]
    table_test = table[~msk]

    # All lookup statistics are derived from the training data only.
    user_video_counts_dic = getUserVideoCounts(table_train)
    video_counts_dic = getVideoCounts(table_train)
    similar_clips = getSimilarClips(table_train)
    video_percents_std_dict = getVideoPercetageStats(table_train)
    user_percents_std_dict = getUserPercetageStats(table_train)
    user_counts_dic = getUserCounts(table_train)

    X_train, y_train = _buildDataset(table_train, user_video_counts_dic, video_counts_dic,
                                     similar_clips, video_percents_std_dict,
                                     user_percents_std_dict, user_counts_dic)
    X_test, y_test = _buildDataset(table_test, user_video_counts_dic, video_counts_dic,
                                   similar_clips, video_percents_std_dict,
                                   user_percents_std_dict, user_counts_dic)

    # I tried regular regression, SVR, AdaBoostRegressor and RandomForestRegressor;
    # RandomForestRegressor turned out to be the best.
    model = RandomForestRegressor(n_estimators = N_TREES)
    model.fit(X_train, y_train)
    rSquare = model.score(X_test, y_test)
    rSquared.append(rSquare)

    # Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n test rows
    # and p features per row.
    n_rows = len(y_test)
    n_features = len(X_test[0])
    adjusted = 1 - (1-rSquare)*(n_rows-1)/float(n_rows-n_features-1)
    adjustedRSquared.append(adjusted)

    # Progress indicator; the parenthesised form runs on Python 2 and 3.
    print(i)

0
1
2
3
4
5
6
7
8
9


In [65]:
# Mean R^2 over all runs; parenthesised print runs on both Python 2 and 3.
print("R^2 for the model is: ")
print(sum(rSquared)/float(len(rSquared)))

R^2 for the model is: 
0.312387405151


In [66]:
# Mean adjusted R^2 over all runs; parenthesised print runs on both Python 2 and 3.
print("adjusted R^2 for the model is: ")
print(sum(adjustedRSquared)/float(len(adjustedRSquared)))

adjusted R^2 for the model is: 
0.31222388487
