In [1]:
import pandas as pd 
import sqlite3
import seaborn as sns
import datetime
import matplotlib.pyplot as plt 
import numpy as np 
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [2]:
# Pull every row of the Twitch_Directory table from the local SQLite file
# into a DataFrame, then preview the first rows.
sql = """
SELECT *
FROM Twitch_Directory
"""
# NOTE(review): the connection is never closed in the visible cells; close it
# once no later cell needs `conn`.
conn = sqlite3.connect('twitch.db')
twitch_data = pd.read_sql_query(sql, con=conn)
twitch_data.head(5)

Unnamed: 0,index,Date,Time,Directory Name,Viewer Count
0,0,2022-01-31,10:45:52,Just Chatting,390431
1,1,2022-01-31,10:45:52,Grand Theft Auto V,239160
2,2,2022-01-31,10:45:52,League of Legends,219827
3,3,2022-01-31,10:45:52,VALORANT,137060
4,4,2022-01-31,10:45:52,Fortnite,115131


In [3]:
# Show the dtype pandas inferred for each column of the loaded frame.
twitch_data.dtypes

index              int64
Date              object
Time              object
Directory Name    object
Viewer Count       int64
dtype: object

In [4]:
# List the distinct directory names (useful for spotting spelling or casing
# variants of the same title).
twitch_data['Directory Name'].unique()

array(['Just Chatting', 'Grand Theft Auto V', 'League of Legends',
       'VALORANT', 'Fortnite', 'Pokémon Legends: Arceus',
       'Counter-Strike: Global Offensive', 'FIFA 22', 'Sports',
       'Apex Legends', 'Dota 2', 'Escape from Tarkov',
       'Dying Light 2: Stay Human', 'Cities: Skylines', 'Rust',
       'Path of Exile', 'Sifu', 'LOST ARK', 'Slots',
       "Tom Clancy's Rainbow Six Siege", 'Minecraft',
       'Call of Duty: Warzone', 'Dead by Daylight', 'Lost Ark',
       'Horizon Forbidden West', 'Mario Kart 8 Deluxe', 'Elden Ring',
       'Destiny 2', 'Politics', 'Rocket League', 'World of Warcraft',
       'Brawl Stars', 'WWE 2K22', 'Call of Duty: Modern Warfare 3',
       "Tiny Tina's Wonderlands", 'F1 2021', 'Hearthstone',
       'Fall Guys: Ultimate Knockout', 'Special Events', 'The Sims 4',
       'The Long Drive', 'PUBG: BATTLEGROUNDS',
       'Final Fantasy XIV Online', 'Overwatch',
       'Holdfast: Nations At War', 'Vampire Survivors', 'Marauders',
       'Evil Dead

In [5]:
# Merge the upper-case 'LOST ARK' label into 'Lost Ark'.
# The result is assigned back to the column rather than using inplace=True on
# the selected Series: that chained in-place form emits a FutureWarning on
# pandas 2.x and leaves the frame unchanged when Copy-on-Write is enabled.
twitch_data['Directory Name'] = twitch_data['Directory Name'].replace({'LOST ARK': 'Lost Ark'})

In [6]:
# List the distinct directory names again to confirm that 'LOST ARK' no longer
# appears as its own value.
twitch_data['Directory Name'].unique()

array(['Just Chatting', 'Grand Theft Auto V', 'League of Legends',
       'VALORANT', 'Fortnite', 'Pokémon Legends: Arceus',
       'Counter-Strike: Global Offensive', 'FIFA 22', 'Sports',
       'Apex Legends', 'Dota 2', 'Escape from Tarkov',
       'Dying Light 2: Stay Human', 'Cities: Skylines', 'Rust',
       'Path of Exile', 'Sifu', 'Lost Ark', 'Slots',
       "Tom Clancy's Rainbow Six Siege", 'Minecraft',
       'Call of Duty: Warzone', 'Dead by Daylight',
       'Horizon Forbidden West', 'Mario Kart 8 Deluxe', 'Elden Ring',
       'Destiny 2', 'Politics', 'Rocket League', 'World of Warcraft',
       'Brawl Stars', 'WWE 2K22', 'Call of Duty: Modern Warfare 3',
       "Tiny Tina's Wonderlands", 'F1 2021', 'Hearthstone',
       'Fall Guys: Ultimate Knockout', 'Special Events', 'The Sims 4',
       'The Long Drive', 'PUBG: BATTLEGROUNDS',
       'Final Fantasy XIV Online', 'Overwatch',
       'Holdfast: Nations At War', 'Vampire Survivors', 'Marauders',
       'Evil Dead: The Game',

In [7]:
# Remove the 'index' column that came out of the SQL table.
# errors='ignore' keeps the cell re-runnable: a second execution no longer
# raises KeyError once the column is already gone.
twitch_data = twitch_data.drop(columns=['index'], errors='ignore')

In [8]:
# Convert the 'Date' column to datetime so the .dt accessor can be used on it.
parsed_dates = pd.to_datetime(twitch_data['Date'])
twitch_data['Date'] = parsed_dates

In [9]:
# Calendar month (1-12) as an integer feature.
# .dt.month reads the number directly instead of formatting it as a '%m'
# string and parsing it back; the final cast keeps the platform int dtype that
# the previous astype(int) produced.
twitch_data['month'] = twitch_data['Date'].dt.month.astype(int)


In [10]:
# Weekday as a string digit: strftime's %w yields '0' for Sunday through '6'
# for Saturday.
weekday_codes = twitch_data['Date'].dt.strftime('%w')
twitch_data['Day'] = weekday_codes

In [11]:
# Map the '%w' weekday digits ('0' = Sunday ... '6' = Saturday) to short names.
num_days = {
    '1': 'Mon',
    '2': 'Tue',
    '3': 'Wed',
    '4': 'Thur',
    '5': 'Fri',
    '6': 'Sat',
    '0': 'Sun',
}
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the chained in-place form warns on pandas 2.x and does not
# modify the frame under Copy-on-Write. replace() leaves values that are not
# keys untouched, so re-running this cell is harmless.
twitch_data['Day'] = twitch_data['Day'].replace(num_days)

In [12]:
# Display the frame to check the newly added 'month' and 'Day' columns.
twitch_data

Unnamed: 0,Date,Time,Directory Name,Viewer Count,month,Day
0,2022-01-31,10:45:52,Just Chatting,390431,1,Mon
1,2022-01-31,10:45:52,Grand Theft Auto V,239160,1,Mon
2,2022-01-31,10:45:52,League of Legends,219827,1,Mon
3,2022-01-31,10:45:52,VALORANT,137060,1,Mon
4,2022-01-31,10:45:52,Fortnite,115131,1,Mon
...,...,...,...,...,...,...
1105,2022-05-23,10:45:25,Fortnite,69961,5,Mon
1106,2022-05-23,10:45:25,V Rising,64515,5,Mon
1107,2022-05-23,10:45:25,Counter-Strike: Global Offensive,64011,5,Mon
1108,2022-05-23,10:45:25,Dota 2,50749,5,Mon


In [13]:
# Target: the viewer count. Predictors: every remaining column except the raw
# date/time stamps and the target itself.
y = twitch_data['Viewer Count']
X = twitch_data.drop(['Date', 'Time', 'Viewer Count'], axis='columns')

In [14]:
# Hold out a quarter of the rows for testing (0.25 is scikit-learn's default
# when no size is given); the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [15]:
# Preview the training predictors (still a DataFrame at this point).
X_train

Unnamed: 0,Directory Name,month,Day
1043,League of Legends,5,Tue
582,Counter-Strike: Global Offensive,3,Wed
630,Just Chatting,4,Mon
427,Lost Ark,3,Mon
60,Just Chatting,2,Sun
...,...,...,...
466,World of Warcraft,3,Fri
121,Just Chatting,2,Sat
1044,Apex Legends,5,Tue
1095,VALORANT,5,Sun


In [16]:
# One-hot encode the categorical predictors.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at transform time.
# The encoder is left at its default sparse output and densified with
# .toarray(): the `sparse=` keyword was removed in scikit-learn 1.4 and its
# replacement `sparse_output=` does not exist before 1.2, whereas this form
# runs on every version.
Ohe = OneHotEncoder(handle_unknown='ignore')

# NOTE(review): 'Day' is present in X but is not passed to the encoder, so the
# models never see it -- confirm whether that is intentional.
ohe_train = X_train[['Directory Name', 'month']]
ohe_test = X_test[['Directory Name', 'month']]

# Fit on the training rows only, then apply the same encoding to the test rows.
# NOTE(review): X_train / X_test are overwritten with arrays here, so this cell
# cannot be re-run without re-running the train/test split first.
X_train = Ohe.fit_transform(ohe_train).toarray()
X_test = Ohe.transform(ohe_test).toarray()

In [17]:
# Preview the training predictors after one-hot encoding (now a dense array).
X_train

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [18]:
# Baseline decision tree with default hyper-parameters.
# random_state is fixed because the tree permutes features at each split, so
# ties between equally good splits would otherwise be broken differently from
# run to run.
dt = DecisionTreeRegressor(random_state=42)

dt.fit(X_train, y_train)

# .score() on a regressor reports R^2; compare train and test to gauge overfitting.
print(f'Train: {dt.score(X_train, y_train)}')
print(f'Test: {dt.score(X_test, y_test)}')

Train: 0.7247906637597405
Test: 0.6349583075382321


In [19]:
# Baseline k-nearest-neighbours regressor with default settings; fit() returns
# the estimator itself, so construction and fitting are chained.
knn = KNeighborsRegressor().fit(X_train, y_train)

# Score both splits first, then report them.
knn_train_score = knn.score(X_train, y_train)
knn_test_score = knn.score(X_test, y_test)
print(f'Train: {knn_train_score}')
print(f'Test: {knn_test_score}')

Train: 0.607625739505501
Test: 0.4080236346651933


In [20]:
# Baseline random forest with default hyper-parameters.
# The forest bootstraps rows for every tree, so random_state is fixed to make
# the reported scores reproducible across runs.
rf = RandomForestRegressor(random_state=42)

rf.fit(X_train, y_train)

# .score() on a regressor reports R^2; compare train and test to gauge overfitting.
print(f'Train: {rf.score(X_train, y_train)}')
print(f'Test: {rf.score(X_test, y_test)}')

Train: 0.7242713335302822
Test: 0.6370125563149197


In [21]:
# Hyper-parameter grid for the decision-tree search.
# None replaces the former 'auto' entry: for a regressor tree 'auto' meant
# "consider all features", which is exactly what None does, and 'auto' is
# rejected by scikit-learn >= 1.3.
dt_params = {
    'max_features': [None, 'sqrt', 'log2'],
    'splitter': ['best', 'random'],
    'max_depth': [5, 10, 50],
}


In [22]:
# Exhaustive cross-validated search over dt_params.
# The estimator is seeded because the grid includes splitter='random' and
# feature subsampling ('sqrt' / 'log2'); without a fixed random_state the
# winning combination can change from one run to the next.
dt_grid = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid=dt_params)

dt_grid.fit(X_train, y_train)

# Best-scoring parameter combination found by the search.
dt_grid.best_params_

{'max_depth': 50, 'max_features': 'log2', 'splitter': 'best'}

In [25]:
# Decision tree rebuilt with the parameters selected by the grid search.
# The parameters are read from dt_grid.best_params_ instead of being copied by
# hand, so this cell cannot drift out of sync with the search result; the seed
# makes the feature subsampling (and therefore the scores) repeatable.
dt = DecisionTreeRegressor(**dt_grid.best_params_, random_state=42)

dt.fit(X_train, y_train)

# .score() on a regressor reports R^2; compare train and test to gauge overfitting.
print(f'Train: {dt.score(X_train, y_train)}')
print(f'Test: {dt.score(X_test, y_test)}')

Train: 0.7247906637597405
Test: 0.6291743923772155
