# CLUSTER / LEARN

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def loadFile(googleTrue, file=None):
    """Load a CSV file into a DataFrame, locally or from Google Drive.

    Parameters
    ----------
    googleTrue : bool
        When truthy, mount Google Drive through ``google.colab`` and read
        ``drive/MyDrive/train_data.csv``; ``file`` is ignored in that case.
    file : str, path-like or file-like, optional
        CSV source handed to ``pd.read_csv`` when ``googleTrue`` is falsy.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.

    Raises
    ------
    ValueError
        If ``googleTrue`` is falsy and no ``file`` was supplied.
    """
    if googleTrue:
        # Imported lazily so the function stays usable outside Colab.
        from google.colab import drive
        drive.mount('/content/drive')
        return pd.read_csv("drive/MyDrive/train_data.csv")

    if file is None:
        # Fail early with a clear message instead of deep inside read_csv.
        raise ValueError("loadFile requires a `file` path when googleTrue is False")
    return pd.read_csv(file)

# Read the three local CSV files (training, test and supplemental data),
# in that order, without going through Google Drive.
train_data, test_data, supp_data = (
    loadFile(False, csvPath)
    for csvPath in (
        "data/train_data.csv",
        "data/test_data.csv",
        "data/supplimental_data.csv",
    )
)

In [3]:
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import TimeSeriesSplit

# Functions
Time is converted to a numeric value because most models do not support datetime inputs. Missing values in LevelProgressionAmount were filled with 0 because most NAs also had a CurrentSessionLength of 0. sumDf creates summary statistics for each user, which are used to train the k-means clustering
algorithm. In previous iterations, CurrentGameMode was converted to a binary variable, with Career being 1 and all other values 0; given the class imbalance, that was the best encoding. QuestionTiming was also converted to a binary variable, but neither of these changes helped much.

In [5]:
def fillLPA(df):
    """Replace missing ``LevelProgressionAmount`` values with 0.

    The column is overwritten on ``df`` itself and the same frame is
    returned, so callers may either use the return value or rely on the
    in-place change.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``LevelProgressionAmount`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the missing values filled.
    """
    # Vectorised equivalent of testing every value with pd.isna.
    df["LevelProgressionAmount"] = df["LevelProgressionAmount"].fillna(0)
    return df

In [6]:
def numericalTime(df):
    """Convert the ``TimeUtc`` column from timestamps to integers.

    The values are parsed with ``pd.to_datetime`` and then turned into
    integers with ``pd.to_numeric``. The column is replaced on ``df`` itself
    and the same frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``TimeUtc`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        ``df`` with an integer ``TimeUtc`` column.
    """
    parsedTimes = pd.to_datetime(df["TimeUtc"])
    # NOTE(review): the integers are presumably nanoseconds since the epoch
    # (datetime64[ns]); confirm the resolution for the pandas version in use.
    # Plain column assignment swaps in the new integer column. Writing through
    # `df.loc[:, col] = ...` sets values into the existing column instead,
    # which leaves an object-dtype column on recent pandas versions.
    df["TimeUtc"] = pd.to_numeric(parsedTimes)
    return df

In [7]:
def binaryQuestion(df):
    """Encode ``QuestionType`` as 1 for "Wellbeing" and 0 for anything else.

    Values are converted to strings and stripped of surrounding whitespace
    before the comparison, so missing values end up as 0. The column is
    replaced on ``df`` itself and the same frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``QuestionType`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with an integer 0/1 ``QuestionType`` column.
    """
    isWellbeing = df["QuestionType"].astype(str).str.strip() == "Wellbeing"
    # Plain column assignment gives the column a real integer dtype instead of
    # storing the flags inside the original object column.
    df["QuestionType"] = isWellbeing.astype(int)
    return df

In [8]:
def scaleVariables(df):
    """Standardise the ``FEATURES`` columns separately for every user.

    Rows are grouped by ``UserID`` and a ``StandardScaler`` is refitted on
    each group, so every user's features are scaled with that user's own
    statistics. The scaled features are re-attached to the identifying
    columns by row label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``UserID``, ``QuestionType`` and the ``FEATURES`` columns;
        ``ResponseValue`` is carried over when present.

    Returns
    -------
    pandas.DataFrame
        ``UserID``, ``QuestionType``, optionally ``ResponseValue``, followed
        by the scaled ``FEATURES`` columns.

    Raises
    ------
    KeyError
        If ``UserID``, ``QuestionType`` or a feature column is missing.
    """
    scaler = StandardScaler()
    scaledParts = []
    for _, userRows in df.groupby("UserID"):
        # fit_transform refits the scaler, so nothing leaks between users.
        scaledValues = scaler.fit_transform(X=userRows[FEATURES])
        scaledParts.append(
            pd.DataFrame(scaledValues, index=userRows.index, columns=FEATURES)
        )

    if scaledParts:
        # One concat at the end instead of growing a frame inside the loop.
        result = pd.concat(scaledParts).sort_index()
    else:
        # No groups (e.g. empty input): keep the expected feature columns.
        result = pd.DataFrame(columns=FEATURES, index=df.index[:0], dtype=float)

    # The target only exists in labelled data; test data has no ResponseValue.
    keptColumns = ["UserID", "QuestionType"]
    if "ResponseValue" in df.columns:
        keptColumns.append("ResponseValue")

    return pd.concat([df[keptColumns], result], axis=1)

# Grouped Model
Grouping the users is a better way forward.

In [10]:
# Model inputs and the prediction target.
FEATURES = ["TimeUtc", "CurrentSessionLength", "LevelProgressionAmount"]
TARGET = ["ResponseValue"]

# Columns kept from the unlabelled test data: user id, features, question type.
TESTSELECT = ["UserID", *FEATURES, "QuestionType"]

# Columns kept from the labelled data: the test columns plus the target.
SELECT = [*TESTSELECT, *TARGET]

In [12]:
# Build the combined, preprocessed training frame GS0.
# NOTE(review): `G` is not defined in any visible cell; it is assumed to be
# the training data, the only other loaded frame that has ResponseValue.
G = train_data
# supp_data is already a loaded DataFrame; calling it like a function
# raised a TypeError.
S = supp_data
GS = pd.concat([G, S])
GS = GS.reset_index()
GS = GS.sort_values(by = ["UserID"])
# Work on a copy: the helpers below overwrite columns in place and would
# otherwise write into a column slice of GS.
GS1 = GS[SELECT].copy()
GS2 = fillLPA(GS1)
GS3 = numericalTime(GS2)
GS4 = binaryQuestion(GS3)
GS0 = scaleVariables(GS4)

KeyboardInterrupt: 

In [None]:
# Hyper-parameter grid for the random-forest grid search.
pGrid = {
    "max_depth": [3, 5],
    # The models are trained on three feature columns, so an integer above 3
    # is not a meaningful setting; None means "consider every feature".
    "max_features": [2, None],
    "min_samples_split": [10, 20],
}

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Per-user model selection: grid-search a random forest on the first 70 % of
# each user's rows and score the winner on the remaining 30 %.
rfcvList = {}     # UserID -> fitted GridSearchCV object
modelCV = {}      # UserID -> best estimator found by the search
grouped = GS0.groupby("UserID")
notIncluded = {}  # UserID -> row count of users skipped for lack of data
testScores = {}   # UserID -> mean absolute error on the held-out rows


for UserID, data in grouped:
    # The split below and TimeSeriesSplit both rely on row order, so put the
    # rows in chronological order first (per-user scaling keeps the ordering).
    data = data.sort_values("TimeUtc", kind="stable").reset_index()
    dataSize = len(data)
    trainSize = int(dataSize * 0.7)
    if trainSize < 3:
        # TimeSeriesSplit(n_splits=2) needs at least three training rows.
        notIncluded[UserID] = dataSize
        continue
    train, test = data.iloc[:trainSize], data.iloc[trainSize:]
    X, y = train[FEATURES], np.ravel(train[TARGET])
    X_test, y_test = test[FEATURES], np.ravel(test[TARGET])

    tscv = TimeSeriesSplit(n_splits = 2)
    rf = RandomForestRegressor(criterion = "squared_error")
    rfcv = GridSearchCV(estimator = rf,
                        cv = tscv,
                        param_grid = pGrid,
                        scoring='neg_mean_absolute_error',
                        n_jobs =-1)
    rfcv = rfcv.fit(X, y)
    bestModel = rfcv.best_estimator_
    testPredict = bestModel.predict(X_test)
    testMAE = mean_absolute_error(y_test, testPredict)
    testScores[UserID] = testMAE
    rfcvList[UserID] = rfcv
    modelCV[UserID] = bestModel

In [None]:
# Refit one model per user on all of that user's rows.
from sklearn.base import clone

grouped = GS0.groupby("UserID")
finalModel = {}  # UserID -> model trained on the user's full history
for UserID, data in grouped:
    data = data.reset_index()
    X, y = data[FEATURES], np.ravel(data[TARGET])
    if UserID in modelCV:
        # clone() gives an unfitted estimator with the tuned hyper-parameters,
        # so the cross-validated model kept in modelCV is not overwritten.
        rf = clone(modelCV[UserID])
    else:
        # Users skipped by the grid search get fixed hyper-parameters.
        # max_features=None uses every feature; the models only have three.
        rf = RandomForestRegressor(max_depth = 5, max_features = None, min_samples_split=10)
    rf.fit(X, y)
    finalModel[UserID] = rf
    

In [None]:
# Fallback model stored under "NoID", next to the per-user models.
# NOTE(review): `G0` is not defined in any visible cell. It is rebuilt here as
# the training data run through the same preprocessing steps as GS0; confirm
# that this matches the intended fallback training set.
G0 = scaleVariables(binaryQuestion(numericalTime(fillLPA(train_data[SELECT].copy()))))
X = G0[FEATURES]
y = np.ravel(G0[TARGET])
# max_features=None uses every feature; the models only have three.
rf1 = RandomForestRegressor(max_depth = 5, max_features = None, min_samples_split=10)
rf1.fit(X,y)
finalModel["NoID"] = rf1

# PREDICTION FUNCTION

In [None]:
# Re-read the test set and run it through the preprocessing helpers in order:
# fill missing progression, numeric time, binary question type, per-user scaling.
y = pd.read_csv("data/test_data.csv")
y0 = (
    y[TESTSELECT]
    .pipe(fillLPA)
    .pipe(numericalTime)
    .pipe(binaryQuestion)
    .pipe(scaleVariables)
)
y0

In [None]:
# Predict every test row with its user's model, falling back to the "NoID"
# model for users that have no model of their own.
yGrouped = y0.groupby("UserID")
# Predictions are written back by row label so that ypreds follows the row
# order of y0. Appending group by group would order the output by UserID
# instead and misalign it with the test rows.
# NOTE(review): assumes y0 has unique row labels. Rows without a UserID are
# skipped by groupby and stay NaN.
predictionByRow = pd.Series(np.nan, index=y0.index, dtype=float)

for UserID, data in yGrouped:
    model = finalModel[UserID] if UserID in finalModel else finalModel["NoID"]
    predictionByRow.loc[data.index] = model.predict(data[FEATURES])

ypreds = predictionByRow.tolist()

In [None]:
# Write the predictions to disk, one value per line.
np.savetxt(fname="predicted.csv", X=ypreds)