In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import defaultdict

from tensorflow.keras.preprocessing.text import Tokenizer
import pickle
import sklearn
from sklearn import linear_model
from sklearn import svm
from sklearn import model_selection

In [2]:
twitch_little_data = pd.read_csv('100k_a.csv', header = None, names = ['user_id', 'stream_id', 'streamer_username', 'time_start', 'time_stop'])
twitch_little_data.head()

Unnamed: 0,user_id,stream_id,streamer_username,time_start,time_stop
0,1,33842865744,mithrain,154,156
1,1,33846768288,alptv,166,169
2,1,33886469056,mithrain,587,588
3,1,33887624992,wtcn,589,591
4,1,33890145056,jrokezftw,591,594


In [3]:
twitch_little_data['time_start'] *= 10 
twitch_little_data['time_stop'] *= 10 

twitch_little_data['time_watched'] = (twitch_little_data['time_stop'] - twitch_little_data['time_start'])

twitch_little_data['relative_time_start'] = twitch_little_data['time_start'] % (24*60)
twitch_little_data['relative_time_stop'] = twitch_little_data['time_stop'] % (24*60)

twitch_little_data['relative_hour_start'] = twitch_little_data['relative_time_start']  // 60
twitch_little_data['relative_hour_stop'] = twitch_little_data['relative_time_stop']  // 60

twitch_little_data['day_start'] = twitch_little_data['time_start'] // (24*60)
twitch_little_data['day_stop'] = twitch_little_data['time_stop'] // (24*60)

twitch_little_data['week_start'] = twitch_little_data['day_start'] % 7 + 1
twitch_little_data['week_stop'] = twitch_little_data['day_stop'] % 7 + 1

In [4]:
dataTrain = twitch_little_data[twitch_little_data['time_stop'] < 50000].sort_values('time_start')
dataTest = twitch_little_data[twitch_little_data['time_stop'] >= 50000].sort_values('time_start')
dataTrain

Unnamed: 0,user_id,stream_id,streamer_username,time_start,time_stop,time_watched,relative_time_start,relative_time_stop,relative_hour_start,relative_hour_stop,day_start,day_stop,week_start,week_stop
247782,8359,33824437872,break,0,10,10,0,10,0,0,0,0,1,1
1227245,40442,33825299296,yassuo,0,30,30,0,30,0,0,0,0,1,1
2059791,67966,33825211072,handongsuk,0,20,20,0,20,0,0,0,0,1,1
2960476,97055,33827518864,lirik,0,40,40,0,40,0,0,0,0,1,1
644264,21343,33828189312,paynewitch,0,20,20,0,20,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1325873,43802,34305045888,wkgml,49980,49990,10,1020,1030,17,17,34,34,7,7
1667312,54988,34305574208,s1mple,49980,49990,10,1020,1030,17,17,34,34,7,7
1269490,41860,34306970720,kendinemuzisyen,49980,49990,10,1020,1030,17,17,34,34,7,7
2937692,96260,34306924640,electrokidi,49980,49990,10,1020,1030,17,17,34,34,7,7


In [5]:
time_sort = dataTrain.to_numpy()
user_interactions = defaultdict(list)
for interaction in time_sort:
    user = interaction[0]
    streamer = interaction[2]
    time_watched = interaction[5]//10
    for _ in range(time_watched):
        user_interactions[user].append(streamer)
user_interactions

defaultdict(list,
            {8359: ['break',
              'slayerage',
              'slayerage',
              'slayerage',
              'slayerage',
              'slayerage',
              'slayerage',
              'slayerage',
              'slayerage',
              'slayerage',
              'slayerage',
              'slayerage',
              'slayerage',
              'slayerage',
              'slayerage',
              'slayerage',
              'slayerage',
              'slayerage',
              'slayerage',
              'slayerage',
              'slayerage',
              'slayerage',
              'slayerage',
              'teawrex',
              'teawrex',
              'teawrex',
              'teawrex',
              'teawrex',
              'grimmmz',
              'grimmmz',
              'grimmmz',
              'grimmmz',
              'sodapoppin',
              'sodapoppin',
              'sodapoppin',
              'sodapoppin',
              'sodapop

In [6]:
tokens = list(dataTrain['streamer_username'].unique())
tokenizer = Tokenizer()
tokenizer.fit_on_texts([tokens])
pickle.dump(tokenizer, open('tokenizer1.pkl', 'wb'))
sequence_data = tokenizer.texts_to_sequences([tokens])[0]
sequence_data[:10]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [7]:
n = 10 #any larger cuts out a large portion of data points an d artificially raises accuracy by focusing more on frequent viewers who are more likely to binge a stream
corpus = []
for user, interactions in user_interactions.items():
    corpus.append([item for sublist in tokenizer.texts_to_sequences(list(interactions)) for item in sublist])
Xtrain = []
ytrain = []
for user in corpus:
    for i in range(n,len(user)):
        Xtrain.append(user[i-n:i])
        ytrain.append(user[i])

time_sort = dataTest.to_numpy()
user_interactions = defaultdict(list)
for interaction in time_sort:
    user = interaction[0]
    streamer = interaction[2]
    time_watched = interaction[5]//10
    for _ in range(time_watched):
        user_interactions[user].append(streamer)
corpus = []
for user, interactions in user_interactions.items():
    corpus.append([item for sublist in tokenizer.texts_to_sequences(list(interactions)) for item in sublist])
Xtest = []
ytest = []
for user in corpus:
    for i in range(n,len(user)):
        Xtest.append(user[i-n:i])
        ytest.append(user[i])
print(len(Xtrain), len(Xtest))

5885942 1170592


In [8]:
clf = linear_model.Ridge()
clf.fit(Xtrain,ytrain)
clf.score(Xtest,ytest)

0.6133784856340575

In [9]:
#Ridge Regression
parameters = {
    'alpha' : [0.1, 1, 10], 
    'fit_intercept' : [False, True]
}
grid = model_selection.GridSearchCV(clf, parameters)
grid.fit(Xtrain, ytrain)
grid.score(Xtest,ytest)

0.6133784856340575

In [None]:
clf = svm.SVC(decision_function_shape='ovo')
clf.fit(Xtrain[:len(Xtrain)//2], ytrain[:len(Xtrain)//2])

[89,
 173,
 80,
 43,
 214,
 133,
 131,
 120,
 113,
 247,
 351,
 146,
 250,
 44,
 109,
 166,
 64,
 163,
 148,
 142,
 180,
 58,
 83,
 88,
 79,
 150,
 386,
 41,
 118,
 102,
 91,
 191,
 147,
 193,
 36,
 136,
 72,
 43,
 231,
 51,
 103,
 53,
 28,
 174,
 88,
 124,
 127,
 168,
 34,
 65,
 105,
 124,
 76,
 99,
 166,
 90,
 66,
 85,
 76,
 29,
 72,
 110,
 58,
 111,
 121,
 105,
 166,
 94,
 59,
 191,
 97,
 44,
 121,
 240,
 236,
 89,
 422,
 40,
 171,
 122,
 144,
 44,
 101,
 100,
 81,
 55,
 54,
 133,
 107,
 20,
 169,
 77,
 48,
 64,
 40,
 65,
 60,
 63,
 274,
 28,
 41,
 78,
 41,
 44,
 31,
 88,
 102,
 80,
 25,
 135,
 61,
 195,
 3,
 56,
 115,
 49,
 60,
 26,
 15,
 71,
 59,
 30,
 31,
 24,
 26,
 68,
 163,
 24,
 96,
 93,
 75,
 59,
 80,
 94,
 44,
 170,
 165,
 32,
 103,
 24,
 74,
 82,
 59,
 69,
 107,
 19,
 133,
 108,
 75,
 55,
 95,
 200,
 19,
 125,
 25,
 94,
 74,
 157,
 98,
 12,
 106,
 69,
 74,
 78,
 103,
 29,
 168,
 25,
 26,
 121,
 34,
 119,
 41,
 19,
 35,
 64,
 150,
 70,
 150,
 20,
 151,
 228,
 19,
 31,
 1,
 3