# CLUSTER / LEARN

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
#warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)
#warnings.simplefilter(action='ignore', category=FutureWarning)

def loadFile(googleTrue, file = None ):
    if googleTrue == True:
        from google.colab import drive
        drive.mount('/content/drive')
        x = pd.read_csv("drive/MyDrive/train_data.csv")
    else:
        x = pd.read_csv(file)
    return x

train_data = loadFile(False, "data/train_data.csv")
test_data = loadFile(False, "data/test_data.csv")
supp_data = loadFile(False, "data/supplimental_data.csv")

In [3]:
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import TimeSeriesSplit

# Functions
Time converted to numerical because most models do not support datetime. Missing values in levelprogression were filled with 0 because most NA's also had sessionlength 0. sumDf creates summary statistics for each user which are used to train the kmeans clustering
algorithm. In previous iterations, currentgamemode was converted to a binary with career being 1 and all other values 0, due to class imbalance that was the best. QuestionTiming was also converted to binary but these did not help much either. 

In [5]:
def fillLPA(df):
    df.loc[:, "LevelProgressionAmount"] = df.loc[:, "LevelProgressionAmount"].apply(lambda x: 0 if pd.isna(x) else x)
    return df

In [6]:
def numericalTime(df):
    TimeUtcDate = pd.to_datetime(df.loc[:, "TimeUtc"])
    TimeUtcNumeric = pd.to_numeric(TimeUtcDate)
    df.loc[:, "TimeUtc"] = TimeUtcNumeric
    return df

In [7]:
def binaryQuestion(df):
    df.loc[:, "QuestionType"] = df.loc[:, "QuestionType"].astype(str)
    df.loc[:, "QuestionType"] = df.loc[:, "QuestionType"].apply(lambda x: 1 if x.strip() == "Wellbeing" else 0)
    return df

In [8]:
def scaleVariables(df):
    scaler = StandardScaler()
    grouped = df.groupby("UserID")
    result = pd.DataFrame()
    for UserID, data in grouped:
        before = pd.DataFrame(data[FEATURES])
        after = scaler.fit_transform(X = before)
        afterDf = pd.DataFrame(after, index=data.index, columns = FEATURES)
        result = pd.concat([result, afterDf])
    result = result.sort_index()
    try:
        df1 = df[["TimeUtc", "UserID", "QuestionType", "ResponseValue"]]
    except:
        df1 = df[["TimeUtc", "UserID", "QuestionType"]]
    df2 = pd.concat([df1, result], axis = 1)
    return df2

In [9]:
def indexTime(df):
    df.index = pd.to_datetime(df["TimeUtc"], format = "%Y-%m-%d %H:%M:%S")
    df = df.drop(columns = ["TimeUtc"])
    return df

# Grouped Model
Grouping the users is a better way for going forward

In [11]:
G = pd.read_csv("data/train_data.csv")
S = pd.read_csv("data/supplimental_data.csv")

In [12]:
TESTSELECT = ["UserID", "TimeUtc", "CurrentSessionLength", "LevelProgressionAmount", "QuestionType"]
SELECT = ["UserID", "TimeUtc", "CurrentSessionLength", "LevelProgressionAmount", "QuestionType", "ResponseValue"]
FEATURES =  ["CurrentSessionLength", "LevelProgressionAmount"]
TARGET = ["ResponseValue"]

In [13]:
GS = pd.concat([G, S])
GS = GS.reset_index()
GS = GS.sort_values(by = ["UserID", "TimeUtc"])
GS = GS[SELECT]

In [14]:
GS = fillLPA(GS)
GS = binaryQuestion(GS)
GS = scaleVariables(GS)
GS = indexTime(GS)
GS

Unnamed: 0_level_0,UserID,QuestionType,ResponseValue,CurrentSessionLength,LevelProgressionAmount
TimeUtc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-08-18 22:55:27,p1,1,509.0,-1.089177,-1.430427
2022-08-18 23:38:31,p1,1,653.0,-1.248658,-1.430427
2022-08-18 23:39:24,p1,1,705.0,-1.168918,1.507010
2022-08-18 23:45:01,p1,1,817.0,-0.770217,-0.936153
2022-08-18 23:51:22,p1,1,810.0,-0.212036,-0.169196
...,...,...,...,...,...
2022-10-12 04:53:50,p9999,0,961.0,0.098333,1.305168
2022-10-12 05:01:29,p9999,1,948.0,0.228961,1.408868
2022-10-12 05:09:22,p9999,1,881.0,0.359588,1.501254
2022-10-12 05:18:11,p9999,0,924.0,0.506544,-1.390615


In [15]:
GS1 = GS[:100]
usercounts = GS1["UserID"].value_counts()
userkeep = usercounts[usercounts >= 5].index
GS1 = GS1[GS1["UserID"].isin(userkeep)]

In [16]:
df = GS1[GS1["UserID"] == "p1"]
df = df.drop(columns = "UserID")


In [33]:
df["QuestionType"] = df["QuestionType"].astype(float)

SOURCE: https://www.youtube.com/watch?v=c0k-YLQGKjY&ab_channel=GregHogg

In [41]:
import tensorflow as ts
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.losses import MeanAbsoluteError
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam

In [20]:
from sklearn.metrics import mean_squared_error as mse

def plot_predictions1(model, X, y, start = 0, end = 100):
    predictions = model.predict(X).flatten()
    df = pd.DataFrame(data = {"Predictions": predictions, "Actualy":y})
    return df, mse(y, predictions)

In [21]:
def lagXy(df, window_size = 6):
    df_as_np = df.to_numpy()
    X = []
    y = []
    for i in range(len(df_as_np) - window_size):
        row = [r for r in df_as_np[i:i+window_size]]
        X.append(row)
        label = df_as_np[i+window_size][2]
        y.append(label)
    return np.array(X), np.array(y)

In [22]:
X2, y2, = lagXy(df)

X2.shape, y2.shape

((38, 6, 4), (38,))

In [23]:
X2_train, y2_train = X2[:20], y2[:20]
X2_val, y2_val = X2[20:25], y2[20:25]
X2_test, y2_test = X2[25:], y2[25:]
X2_train.shape, y2_train.shape, X2_val.shape, y2_val.shape, X2_test.shape, y2_test.shape

((20, 6, 4), (20,), (5, 6, 4), (5,), (13, 6, 4), (13,))

In [27]:
model4 = Sequential()
model4.add(InputLayer((6, 4)))
model4.add(LSTM(64))
model4.add(Dense(8, 'relu'))
model4.add(Dense(1, 'linear'))

model4.summary()


In [43]:
cp4 = ModelCheckpoint('model4.keras', save_best_only=True)
model4.compile(loss=MeanAbsoluteError(), optimizer=Adam(learning_rate=0.0001), metrics=[MeanAbsoluteError()])

In [45]:
model4.fit(X2_train, y2_train, validation_data=(X2_val, y2_val), epochs=10, callbacks=[cp4])

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - loss: 1.1295 - mean_absolute_error: 1.1295 - val_loss: 1.0908 - val_mean_absolute_error: 1.0908
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - loss: 1.1267 - mean_absolute_error: 1.1267 - val_loss: 1.0863 - val_mean_absolute_error: 1.0863
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - loss: 1.1237 - mean_absolute_error: 1.1237 - val_loss: 1.0814 - val_mean_absolute_error: 1.0814
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - loss: 1.1204 - mean_absolute_error: 1.1204 - val_loss: 1.0763 - val_mean_absolute_error: 1.0763
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - loss: 1.1170 - mean_absolute_error: 1.1170 - val_loss: 1.0708 - val_mean_absolute_error: 1.0708
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - loss: 1.1134

<keras.src.callbacks.history.History at 0x1b6fea33fd0>