# CLUSTER / LEARN

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

def loadFile(googleTrue, file = None ):
    if googleTrue == True:
        from google.colab import drive
        drive.mount('/content/drive')
        x = pd.read_csv("drive/MyDrive/train_data.csv")
    else:
        x = pd.read_csv(file)
    return x

In [3]:
train_data = loadFile(False, "data/train_data.csv")
test_data = loadFile(False, "data/test_data.csv")
supp_data = loadFile(False, "data/supplimental_data.csv")

In [80]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_validate
from sklearn import tree
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Functions
Time converted to numerical because most models do not support datetime. Missing values in levelprogression were filled with 0 because most NA's also had sessionlength 0. sumDf creates summary statistics for each user which are used to train the kmeans clustering
algorithm. In previous iterations, currentgamemode was converted to a binary with career being 1 and all other values 0, due to class imbalance that was the best. QuestionTiming was also converted to binary but these did not help much either. 

In [6]:
def numericalTime(df):
    df.loc[:, "TimeUtcX"] = pd.to_datetime(df.loc[:, "TimeUtc"])
    df.loc[:, "TimeUtcY"] = pd.to_numeric(df.loc[:, "TimeUtcX"])
    df = df.drop(columns= ["TimeUtc", "TimeUtcX"])
    return df

In [7]:
def fillLPA(df):
    df.loc[:, "LevelProgressionAmount"] = df.loc[:, "LevelProgressionAmount"].apply(lambda x: 0 if pd.isna(x) else x)
    return df

In [94]:
def scaleSessionLength(df):
    mm = MinMaxScaler().set_output(transform = "pandas")
    df["CurrentSessionLength"] = df.groupby("UserID")["CurrentSessionLength"].transform(lambda x: mm.fit_transform())
    return df

In [8]:
def sumDf(df):
    grouped = df.groupby("UserID")
    userDf = {user_id: group for user_id, group in grouped}
    grouped = df.groupby("UserID")
    userDf = {user_id: group for user_id, group in grouped}
    sumDf = pd.DataFrame(columns = ["UserID", "SessionLengthSum", "LevelProgressionSum"])
    i = 0
    for UserID, UserData in userDf.items():
        LastTaskSum = len(UserData.loc[:, "LastTaskCompleted"].unique())
        SessionLengthSum = UserData.loc[:, "CurrentSessionLength"].sum()
        LevelProgressionSum = UserData.loc[:, "LevelProgressionAmount"].sum()
        data = {"UserID": UserID, "SessionLengthSum" : SessionLengthSum, "LevelProgressionSum" : LevelProgressionSum, "LastTaskSum" : LastTaskSum}
        data = pd.DataFrame(data, index = [i])
        sumDf = pd.concat([sumDf, data])
        i += 1
    cols = ["SessionLengthSum", "LevelProgressionSum", "LastTaskSum"] 
    standardDf = standard.fit_transform(sumDf[cols])
    standardDf["UserID"] = sumDf["UserID"]
    return standardDf

In [9]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def trainCluster(df):
    clusterDf = df.drop(columns = "UserID")
    kmeans = KMeans(n_clusters=5, random_state=0, n_init="auto").fit(clusterDf)
    return kmeans

In [10]:
def predictCluster(df, kmeans):
    clusterDf = df.drop(columns = "UserID")
    clusterResults = kmeans.predict(clusterDf)
    df["Clusters"] = clusterResults
    return df.drop(columns = ["SessionLengthSum", "LevelProgressionSum"])

# Baseline model
This is a model trained without clustering. Criterion squared error instead of absolute because it is really slow if you use absolute.

In [12]:
FEATURES = ["UserID", "TimeUtc", "CurrentSessionLength", "LevelProgressionAmount", "ResponseValue", "LastTaskCompleted"]
standard = StandardScaler().set_output(transform = "pandas")
X1 = train_data[FEATURES]
X2 = numericalTime(X1)
X3 = fillLPA(X2)
clusteringDf = sumDf(X3)
kmeans = trainCluster(clusteringDf)
clusterResults = predictCluster(clusteringDf, kmeans)

X0 = X3.merge(clusterResults)

found 0 physical cores < 1
  File "C:\ProgramData\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


In [13]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

FEATURES = ["CurrentSessionLength", "LevelProgressionAmount", "TimeUtcY"]
TARGET = ["ResponseValue"]
tree0 = DecisionTreeRegressor(criterion = "squared_error", max_depth = 5)

dataSize = int(len(X3))
trainSize = dataSize // 1.3
X, y = X3[FEATURES], X3[TARGET]
X_train, y_train, X_test, y_test = X.loc[:trainSize], y.loc[:trainSize], X.loc[trainSize:], y.loc[trainSize:]

tree0 = tree0.fit(X_train, y_train)
y_pred = tree0.predict(X_test)
treeMae = mean_absolute_error(y_test, y_pred)
print(f"Tree MAE {treeMae}")

linReg = LinearRegression().fit(X_train, y_train)
linPred = linReg.predict(X_test)
linMae = mean_absolute_error(y_test, linPred)
print(f"Linear Regression MAE {linMae}")

Tree MAE 174.8305928344616
Linear Regression MAE 176.3563302024848


# Clustered Model
In this model, the data was clustered before fitting. The clusters were relatively compact as seen from the sil scores and except for cluster 2, all had decent sample sizes. However, clustering did not significantly improve the model.

In [15]:
sildf = clusteringDf.drop(columns = "UserID")
print(silhouette_score(sildf, kmeans.fit_predict(sildf)))
display(X0["Clusters"].value_counts())

0.749566018746791


Clusters
0    47598
4    43162
3    27290
1    14152
2     2218
Name: count, dtype: int64

In [16]:
FEATURES = ["CurrentSessionLength", "LevelProgressionAmount", "TimeUtcY"]
TARGET = ["ResponseValue"]
N_CLUSTER = X0["Clusters"].max()


tree = DecisionTreeRegressor(criterion = "squared_error", max_depth = 5)

def modelSelection(df, model, FEATURES, target):
    df = df.drop(columns = "UserID")
    models = {}
    preds = []
    tests = []
    predsList = []
    testList = []
    for i in df["Clusters"].unique():
        data = df[df["Clusters"] == i]
        data = data.reset_index()
        dataSize = int(len(data))
        trainSize = int(dataSize // 1.3)
        X, y = data[FEATURES], data[TARGET]
        X_train, y_train, X_test, y_test = X.loc[:trainSize], y.loc[:trainSize], X.loc[trainSize:], y.loc[trainSize:]

        model = model.fit(X_train, y_train)
        
        preds = model.predict(X_test)
        tests = y_test
        mae = mean_absolute_error(tests, preds)
        models[i] = model, mae
        predsList.extend(model.predict(X_test))
        testList.extend(y_test["ResponseValue"])
    generalMae = mean_absolute_error(testList, predsList)
    print(f"MAE of the entire model = {generalMae}")
    return models

In [17]:
tree = modelSelection(X0, tree, FEATURES, TARGET)
tree

MAE of the entire model = 175.33296208904804


{0: (DecisionTreeRegressor(max_depth=5), 173.36351299027623),
 3: (DecisionTreeRegressor(max_depth=5), 179.04243828318056),
 4: (DecisionTreeRegressor(max_depth=5), 172.85891667129206),
 1: (DecisionTreeRegressor(max_depth=5), 193.79059940632493),
 2: (DecisionTreeRegressor(max_depth=5), 102.35139999412988)}

# Grouped Model
Grouping the users is a better way for going forward

In [19]:
SELECT = ["UserID", "TimeUtc", "CurrentSessionLength", "LevelProgressionAmount", "ResponseValue"]
FEATURES =  ["TimeUtcY", "CurrentSessionLength", "LevelProgressionAmount"]
TARGET = ["ResponseValue"]
G = pd.read_csv("data/train_data.csv")
G1 = G[SELECT]
G2 = numericalTime(G1)
G0 = fillLPA(G2)

S = pd.rad_csv("data/supplimental_data.csv")
S1 = S[SELECT]
S2 = numericalTime(S2)
S0 = fillLPA(S0)

In [20]:
SELECT = ["UserID", "TimeUtc", "CurrentSessionLength", "LevelProgressionAmount", "ResponseValue"]
FEATURES =  ["TimeUtcY", "CurrentSessionLength", "LevelProgressionAmount"]
grouped = G0.groupby("UserID")
model = {}
predsList = []
testList = []
tree = DecisionTreeRegressor(criterion = "squared_error", max_depth = 5)


for UserID, data in grouped:
    data = data.reset_index()
    dataSize = int(len(data))
    trainSize = int(dataSize * 0.8)

    X, y = data[FEATURES], data[TARGET]
    X_train, y_train, X_test, y_test = X.loc[:trainSize], y.loc[:trainSize], X.loc[trainSize:], y.loc[trainSize:]
    tree = tree.fit(X_train, y_train)
    preds = tree.predict(X_test)
    tests = y_test
    
    mae = mean_absolute_error(tests, preds)
    model[UserID] = tree

    predsList.extend(tree.predict(X_test))
    testList.extend(y_test["ResponseValue"])
    
mean_absolute_error(testList, predsList)

79.83109975408807

# PREDICTION FUNCTION

In [22]:
SELECT = ["UserID", "TimeUtc", "CurrentSessionLength", "LevelProgressionAmount"]
FEATURES =  ["TimeUtcY", "CurrentSessionLength", "LevelProgressionAmount"]
y = pd.read_csv("data/test_data.csv")
y1 = y[SELECT]
y2 = numericalTime(y1)
y3 = fillLPA(y2)

In [96]:
scaleSessionLength(y3)

TypeError: TransformerMixin.fit_transform() missing 1 required positional argument: 'X'

In [61]:
yGrouped = y3.groupby("UserID")
ypreds = []


for UserID, data in yGrouped:
    data = data.reset_index()
    data = data[FEATURES]
    if UserID in model:
        userPredictions = model[UserID].predict(data)
    else:
        dataSize = len(data)
        userPredictions = dataSize * [750]
    ypreds.extend(userPredictions)

In [71]:
np.savetxt("predicted.csv", ypreds)