# EXPERIMENT

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

def loadFile(googleTrue, file = None ):
    if googleTrue == True:
        from google.colab import drive
        drive.mount('/content/drive')
        x = pd.read_csv("drive/MyDrive/train_data.csv")
    else:
        x = pd.read_csv(file)
    return x

In [3]:
train_data = loadFile(False, "data/train_data.csv")
test_data = loadFile(False, "data/test_data.csv")
supp_data = loadFile(False, "data/supplimental_data.csv")

In [4]:
def binaryQuestionTiming(df):
    df.loc[:, "QuestionTiming"] = df.loc[:, "QuestionTiming"].astype(str)
    df.loc[:, "QuestionTiming"] = df.loc[:, "QuestionTiming"].apply(lambda x: 1 if x == "System Initiated" else 0)
    return df
    
def binaryCurrentGameMode(df):
    df.loc[:, "CurrentGameMode"] = df.loc[:, "CurrentGameMode"].astype(str)
    df.loc[:, "CurrentGameMode"] = df.loc[:, "CurrentGameMode"].apply(lambda x: 1 if x == "Career" else 0)
    return df

def numericalTime(df):
    df.loc[:, "TimeUtcX"] = pd.to_datetime(df.loc[:, "TimeUtc"])
    df.loc[:, "TimeUtcY"] = pd.to_numeric(df.loc[:, "TimeUtcX"])
    df = df.drop(columns= ["TimeUtc", "TimeUtcX"])
    return df

def fillLPA(df):
    df.loc[:, "LevelProgressionAmount"] = df.loc[:, "LevelProgressionAmount"].apply(lambda x: 0 if pd.isna(x) else x)
    return df

In [5]:
FEATURES = ["UserID", "TimeUtc", "QuestionTiming", "CurrentGameMode", "CurrentSessionLength", "LevelProgressionAmount", "ResponseValue"]
X = train_data[FEATURES]
X = binaryQuestionTiming(X)
X = binaryCurrentGameMode(X)
X = numericalTime(X)
X = fillLPA(X)

## Learning Algorithms

In [7]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_validate
from sklearn import tree
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [8]:
grouped = X.groupby("UserID")
features = ["TimeUtcY", "QuestionTiming", "CurrentGameMode", "CurrentSessionLength", "LevelProgressionAmount"]
target = ["ResponseValue"]
userDf = {user_id: group for user_id, group in grouped}

In [9]:
trainData, testData = pd.DataFrame(), pd.DataFrame()

for key in userDf.values():
    data = pd.DataFrame(key)
    dataSize = int(len(data))
    trainSize = int(dataSize // 1.2)
    trainData = pd.concat([trainData, data.iloc[: trainSize]])
    testData = pd.concat([testData, data.iloc[trainSize:]])
                
X_train, y_train, X_test, y_test = trainData[features], trainData[target], testData[features], testData[target]
reg0 = tree.DecisionTreeRegressor(criterion = "squared_error", max_depth = 5) # squared_error because I could do hand calculations faster than absolute_error
reg0 = reg0.fit(X_train, y_train)
reg0Pred = reg0.predict(X_test)

In [10]:
reg0Mae = mean_absolute_error(y_test, reg0Pred)
reg0Mse = mean_squared_error(y_test, reg0Pred)

In [11]:
preds = []
tests = []
models = {}

for key in userDf.values():
    data = pd.DataFrame(key)
    UserID = data.iloc[0][0]
    data = data.reset_index()
    dataSize = int(len(data))
    if dataSize < 1:
        preds.extend([500])
        tests.extend(data["ResponseValue"])
    else:
        trainSize = dataSize // 1.3
        X, y = data[features], data[target]
        X_train, y_train , X_test, y_test = X.loc[:trainSize], y.loc[:trainSize], X.loc[trainSize:], y.loc[trainSize:]
        reg1 = tree.DecisionTreeRegressor(criterion = "squared_error", max_depth = 5) # squared_error because I could do hand calculations faster than absolute_error
        reg1 = reg1.fit(X_train, y_train)
        
        preds.extend(reg1.predict(X_test))
        tests.extend(list(y_test["ResponseValue"]))
        models.update({UserID: reg1})

In [12]:
reg1Mae = mean_absolute_error(tests, preds)
reg1Mse= mean_squared_error(tests, preds)

In [13]:
for UserID, model in models.items():
    featureImportance = model.feature_importances_
    models.update({UserID: featureImportance})

In [14]:
models = pd.DataFrame(models).T

In [15]:
models = models.rename(columns={0: "TimeUtcY", 1 : "QuestionTiming", 2: "GameMode", 3 : "SessionLength", 4 : "LevelProgressionAmount"})
models.sum()

TimeUtcY                  3132.935896
QuestionTiming             243.722923
GameMode                   242.683321
SessionLength             2002.649861
LevelProgressionAmount    1598.007999
dtype: float64

In [16]:
print(f" General Model    : MAE {round(reg0Mae, 2)}, MSE {round(reg0Mse, 2)}")
print(f" User Unique Model: MAE {round(reg1Mae, 2)}, MSE {round(reg1Mse, 2)}")

 General Model    : MAE 173.43, MSE 45992.14
 User Unique Model: MAE 83.81, MSE 23519.21


## Clustering

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
FEATURES = ["UserID", "TimeUtc", "CurrentSessionLength", "LevelProgressionAmount", "ResponseValue"]
X = train_data[FEATURES]
X = numericalTime(X)
X = fillLPA(X)
standard = StandardScaler().set_output(transform = "pandas")

In [19]:
sumX = pd.DataFrame(columns = ["UserID", "SessionLengthSum", "LevelProgressionSum"])
i = 0
for UserID, UserData in userDf.items():
    SessionLengthSum = UserData["CurrentSessionLength"].sum()
    LevelProgressionSum = UserData["LevelProgressionAmount"].sum()
    data = {"UserID": UserID, "SessionLengthSum" : SessionLengthSum, "LevelProgressionSum" : LevelProgressionSum}
    data = pd.DataFrame(data, index = [i])
    sumX = pd.concat([sumX, data])
    i += 1

In [20]:
sumX = sumX.set_index("UserID")
sumX

Unnamed: 0_level_0,SessionLengthSum,LevelProgressionSum
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1
p1,333,9.236761
p10,16894,43.968389
p100,1,0.000000
p10002,558,6.807661
p10003,398,5.890150
...,...,...
p9995,420,6.100568
p9996,162,1.536824
p9997,2,0.000000
p9998,2128,9.227791


In [21]:
cols = ["SessionLengthSum", "LevelProgressionSum"] 
standX = standard.fit_transform(sumX[cols])

In [22]:
db = DBSCAN(eps = 0.05, min_samples = 15).fit(standX)
labels = db.labels_

n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points : %d" % n_noise_)

Estimated number of clusters: 4
Estimated number of noise points : 1909


In [23]:
check = standX.reset_index()
db = DBSCAN(eps = 0.05, min_samples = 15).fit_predict(standX)
db0 = pd.Series(db, name = "Cluster")
two = check.join(db0, how = "left")
two = two.drop(columns = ["SessionLengthSum", "LevelProgressionSum"])

In [24]:
merged = X.merge(two)

In [25]:
merged[merged["Cluster"] == 2]

Unnamed: 0,UserID,CurrentSessionLength,LevelProgressionAmount,ResponseValue,TimeUtcY,Cluster
988,p10126,31,0.760935,861.0,1660854875000000000,2
989,p10126,0,0.000000,931.0,1660912742000000000,2
990,p10126,18,0.524482,982.0,1660913802000000000,2
991,p10126,37,0.044208,1000.0,1660914964000000000,2
992,p10126,43,0.130842,1000.0,1660915335000000000,2
...,...,...,...,...,...,...
129549,p6747,24,0.212790,972.0,1673485994000000000,2
129550,p6747,70,0.681911,981.0,1673488723000000000,2
129551,p6747,88,0.892457,969.0,1673489791000000000,2
129552,p6747,97,0.952305,976.0,1673490358000000000,2


In [26]:
preds = []
tests = []
features = ["CurrentSessionLength", "LevelProgressionAmount", "ResponseValue", "TimeUtcY"]
target = ["ResponseValue"]
n_cluster = merged["Cluster"].max()
i = -1
df = merged.drop(columns = "UserID")

while i <= n_cluster:
    data = df[df["Cluster"] == i]
    data = data.reset_index()
    dataSize = int(len(data))
    trainSize = dataSize // 1.3
    X, y = data[features], data[target]
    X_train, y_train , X_test, y_test = X.loc[:trainSize], y.loc[:trainSize], X.loc[trainSize:], y.loc[trainSize:]
    reg2 = tree.DecisionTreeRegressor(criterion = "squared_error", max_depth = 5) # squared_error because I could do hand calculations faster than absolute_error
    reg2 = reg2.fit(X_train, y_train)
    preds.extend(reg1.predict(X_test))
    tests.extend(list(y_test["ResponseValue"]))
    i += 1

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- ResponseValue
Feature names seen at fit time, yet now missing:
- CurrentGameMode
- QuestionTiming


In [None]:
mean_absolute_error(tests, preds)