In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Load the per-participant info sheet and the raw training measurements.
infos = pd.read_excel("../data/t2_info.xlsx")
t2_train_c = pd.read_csv("../data/t2_train.csv")

# Align the key name with the info sheet before merging. The mapping is passed
# through the explicit `columns=` keyword: the previous positional call
# `rename(str, {...})` only worked through a legacy argument interpretation
# and raises TypeError on current pandas.
t2_train_c = t2_train_c.rename(columns={"user_ID": "id"})
t2_train_c = t2_train_c.merge(infos, on="id")

In [None]:
# Build one key per (participant, period) pair, formatted as "<id>_<period>".
t2_train_c["m_id"] = (
    t2_train_c["id"].astype(str) + "_" + t2_train_c["period"].astype(str)
)

In [None]:
# Names of the 14 sweat target columns: sweat_10..sweat_16 followed by sweat_r0..sweat_r6.
sw_list = ["sweat_{}{}".format(group, idx) for group in ("1", "r") for idx in range(7)]

In [None]:
# Display the column labels of the merged training frame (sanity check before feature building).
t2_train_c.columns

In [None]:
# Rolling-window features per measurement series (one series per m_id).
#
# For every window length w in 1..12 and for both signals (motion -> "m",
# temperature -> "t") this adds the columns
#   <prefix>_mean_w_<w>, <prefix>_std_w_<w>, <prefix>_q10_w_<w>, <prefix>_q90_w_<w>.
# Rows for which the window is not yet full (and the window-1 std) are NaN
# after rolling and are filled with 0.
#
# The module-level helpers pd.rolling_mean / pd.rolling_std / pd.rolling_quantile
# used previously were removed from pandas; Series.rolling(w) is the replacement.
# NOTE(review): current pandas interpolates rolling quantiles linearly by default,
# which may differ slightly from very old pandas releases — confirm if exact
# reproduction of earlier feature files matters.

# (column prefix, source column)
rolling_signals = [("m", "motion"), ("t", "temperature")]

# (column suffix, reducer applied to a Rolling object)
rolling_stats = [
    ("mean", lambda window: window.mean()),
    ("std", lambda window: window.std()),
    ("q10", lambda window: window.quantile(0.1)),
    ("q90", lambda window: window.quantile(0.9)),
]

for w in range(1, 13):
    print("Current window: {}".format(w))
    for prefix, source in rolling_signals:
        for stat, reduce_window in rolling_stats:
            # transform() keeps the result aligned with the original row index,
            # so it can be assigned straight back as a column.
            rolled = t2_train_c.groupby("m_id")[source].transform(
                lambda series: reduce_window(series.rolling(w))
            )
            t2_train_c["{}_{}_w_{}".format(prefix, stat, w)] = rolled.fillna(0)

In [None]:
# Columns kept for modelling: every rolling feature (window-major, motion before
# temperature, in mean/std/q10/q90 order), then the sweat targets, then the
# grouping key and the participant attributes.
window_stats = ("mean", "std", "q10", "q90")
col_list = [
    "{}_{}_w_{}".format(signal, stat, window)
    for window in range(1, 13)
    for signal in ("m", "t")
    for stat in window_stats
]
col_list = (
    col_list
    + sw_list
    + ["m_id", "study_x", "sex", "age", "self_size", "shirt_size", "deodorant_left", "deodorant_right"]
)

In [None]:
# Select the modelling columns and discard the two window-1 standard-deviation columns.
window1_std_cols = ["m_std_w_1", "t_std_w_1"]
train_rolling = t2_train_c[col_list].drop(window1_std_cols, axis=1)

In [None]:
# Harmonise the two size columns. Missing values are temporarily represented by
# the string "NA" so they can be compared like any other label.
self_size = np.array(train_rolling.self_size.fillna("NA"))
shirt_size = np.array(train_rolling.shirt_size.fillna("NA"))

# Normalise the German "S oder M" label in self_size.
# NOTE(review): shirt_size is not normalised, so an "S oder M" there is kept
# (and may be copied into self_size below) — confirm this is intended.
self_size[self_size == "S oder M"] = "S/M"

# Where only one of the two sizes is known, copy it into the other column.
self_missing = (self_size == "NA") & (shirt_size != "NA")
self_size[self_missing] = shirt_size[self_missing]

shirt_missing = (shirt_size == "NA") & (self_size != "NA")
shirt_size[shirt_missing] = self_size[shirt_missing]

In [None]:
# Store the harmonised size arrays back in the frame. Entries that are missing
# in both columns still hold the "NA" placeholder string at this point.
train_rolling["self_size"] = self_size
train_rolling["shirt_size"] = shirt_size

In [None]:
# Checkpoint the feature table as a CSV in the working directory.
# No `index=` argument is passed here; a later cell refers to an "Unnamed: 0"
# column once this file has been read back.
train_rolling.to_csv("train_rolling.csv")

In [None]:
# Reload the feature table from the CSV checkpoint written above.
# NOTE(review): the next cell fills self_size/shirt_size with "NA" again, which
# suggests the "NA" placeholders come back as missing values after this round
# trip — confirm.
train_rolling = pd.read_csv("train_rolling.csv")

In [None]:
# Represent missing sizes with the explicit "NA" label so they get their own dummy column.
for size_col in ("self_size", "shirt_size"):
    train_rolling[size_col] = train_rolling[size_col].fillna("NA")

In [None]:
# One-hot encode both size columns ("self_*" and "shirt_*" indicator columns),
# append them to the frame and remove the original categorical columns.
self_dummies = pd.get_dummies(train_rolling["self_size"], prefix="self")
shirt_dummies = pd.get_dummies(train_rolling["shirt_size"], prefix="shirt")

train_rolling = pd.concat([train_rolling, self_dummies, shirt_dummies], join="inner", axis=1)
train_rolling = train_rolling.drop(["self_size", "shirt_size"], axis=1)

In [None]:
# One-hot encode `sex`. No prefix is used, so the indicator columns are named
# after the raw category values.
sex_dummies = pd.get_dummies(train_rolling["sex"])
train_rolling = pd.concat([train_rolling, sex_dummies], join="inner", axis=1).drop("sex", axis=1)

In [None]:
# One-hot encode the remaining categorical columns; each entry is
# (source column, prefix of the generated indicator columns).
categorical_prefixes = [
    ("deodorant_left", "d_l"),
    ("deodorant_right", "d_r"),
    ("study_x", "study"),
]
dummy_frames = [
    pd.get_dummies(train_rolling[column], prefix=prefix)
    for column, prefix in categorical_prefixes
]

train_rolling = pd.concat([train_rolling] + dummy_frames, join="inner", axis=1)
train_rolling = train_rolling.drop([column for column, _ in categorical_prefixes], axis=1)

In [None]:
# NOTE(review): the result of this drop is not assigned, so `train_rolling` is
# unchanged and still contains "Unnamed: 0" when it is saved below. The loading
# section further down drops both "Unnamed: 0" and "Unnamed: 0.1", so fixing
# this line requires updating that cell as well.
train_rolling.drop("Unnamed: 0", axis=1)
# Persist the one-hot encoded table that the modelling section reads.
train_rolling.to_csv("../data/train_ohe.csv")

 # Start here

In [2]:
from xgboost import XGBRegressor

# Load the one-hot encoded feature table produced by the preprocessing section.
train_rolling = pd.read_csv("../data/train_ohe.csv")

# Split by m_id so that all rows of one series land on the same side:
# after a seeded shuffle the first 140 ids form the validation set, the rest
# the training set.
n_val_ids = 140
un = np.unique(train_rolling["m_id"])
np.random.seed(42)
np.random.shuffle(un)
id_val = un[:n_val_ids]
id_train = un[n_val_ids:]



In [3]:
# Remove the index columns left behind by the CSV round trips. errors="ignore"
# keeps this cell re-runnable and tolerant of a CSV that was saved without
# those columns (previously a second run raised KeyError).
train_rolling = train_rolling.drop(["Unnamed: 0", "Unnamed: 0.1"], axis=1, errors="ignore")

In [4]:
# Row subsets for the training and validation m_id groups.
in_train = train_rolling["m_id"].isin(id_train)
in_val = train_rolling["m_id"].isin(id_val)

df_train = train_rolling[in_train]
df_val = train_rolling[in_val]

In [5]:
# The 14 sweat target columns: sweat_10..sweat_16 followed by sweat_r0..sweat_r6.
sw_list = ["sweat_{}{}".format(group, idx) for group in ("1", "r") for idx in range(7)]

# Everything that is neither a target nor the grouping key is a model feature.
non_feature_cols = sw_list + ["m_id"]

y_train, y_val = df_train[sw_list], df_val[sw_list]
X_train, X_val = df_train.drop(non_feature_cols, axis=1), df_val.drop(non_feature_cols, axis=1)

In [6]:
from sklearn.model_selection import GridSearchCV
from time import time
from sklearn.metrics import mean_squared_error

In [None]:
import sys
from itertools import product
#sys.stdout = open('logs.txt', 'w')

# Manual grid search: one XGBRegressor per sweat target and per hyper-parameter
# combination, scored on the held-out validation groups.
#
# Results after the loop:
#   mses[i, j]      validation MSE of target i under configuration j
#   zero_mses[i, j] same, with negative predictions clipped to 0
#   params[i, j]    (max_depth, min_child_weight, colsample_bytree) of configuration j
#   y_pred          validation predictions of the best (lowest raw MSE)
#                   configuration for every target
#
# Previously the loop only varied max_depth while recording the undefined names
# `min_child_weight` and `colsample_bytree` (NameError on the first iteration),
# and the search-space dict was overwritten by the `params` result array.

y_pred = y_val.copy()

param_grid = {
    "max_depth": [3, 6, 9],
    "min_child_weight": [1, 3, 5],
    #"colsample_bytree": [0.5, 0.75, 1],
    #"gamma": [0, 0.3, 0.5]
}
# Not searched (see the commented-out grid entry); passed explicitly so the
# value recorded in `params` is the value actually used.
colsample_bytree = 1

configs = list(product(param_grid["max_depth"], param_grid["min_child_weight"]))

mses = np.zeros((len(sw_list), len(configs)))
zero_mses = np.zeros((len(sw_list), len(configs)))
params = np.zeros((len(sw_list), len(configs), 3))

for sw_ind, sw_type in enumerate(sw_list):
    print("Processing {}".format(sw_type))
    best_mse = np.inf
    for ind, (max_depth, min_child_weight) in enumerate(configs):
        print("Depth: {}".format(max_depth))
        print("Min child weight: {}".format(min_child_weight))
        xgb = XGBRegressor(max_depth=max_depth,
                           min_child_weight=min_child_weight,
                           colsample_bytree=colsample_bytree)
        t = time()
        xgb.fit(X_train, y_train[sw_type])
        pred = xgb.predict(X_val)
        print("Time spent: {}".format(time() - t))

        val_mse = mean_squared_error(y_val[sw_type], pred)
        # np.maximum(pred, 0) equals (pred + |pred|) / 2: negatives become 0.
        zero_mse = mean_squared_error(y_val[sw_type], np.maximum(pred, 0))
        print("Val MSE: {}".format(val_mse))
        print("No zero val MSE: {}".format(zero_mse))
        print("-"*25)

        mses[sw_ind, ind] = val_mse
        zero_mses[sw_ind, ind] = zero_mse
        params[sw_ind, ind] = (max_depth, min_child_weight, colsample_bytree)

        # Keep the predictions of the best configuration seen so far.
        if val_mse < best_mse:
            best_mse = val_mse
            y_pred[sw_type] = pred

Processing sweat_10
Depth: 3


In [None]:
from sklearn.metrics import mean_squared_error

# MSE of the validation targets against (raw predictions + zero-clipped predictions).
# NOTE(review): adding the raw and the clipped predictions doubles every positive
# prediction; this looks unintended (the clipped predictions alone are scored in
# a later cell) — confirm what this cell is meant to measure.
clipped_pred = (y_pred + np.abs(y_pred)) / 2
summed_pred = y_pred.values.flatten() + clipped_pred.values.flatten()
mean_squared_error(y_val.values.flatten(), summed_pred)

In [None]:
# Validation MSE of the raw predictions over all sweat targets.
mean_squared_error(y_true=y_val, y_pred=y_pred)

In [None]:
# NOTE(review): `gs` is not defined in any visible cell (GridSearchCV is imported
# but never instantiated in the visible part of the notebook), so this raises
# NameError on a fresh kernel — confirm whether a grid-search cell was removed.
gs.best_params_

In [None]:
# Validation MSE after replacing negative predictions with 0
# ((p + |p|) / 2 is p for p >= 0 and 0 otherwise).
y_pred_nonneg = (y_pred + np.abs(y_pred)) / 2
mean_squared_error(y_val, y_pred_nonneg)

In [None]:
# Overlay the distributions of all predicted and all true validation values.
fig, ax = plt.subplots(figsize=(8, 8))

for label, frame in (("Predicted", y_pred), ("Validation", y_val)):
    ax.hist(frame.values.flatten(), label=label, alpha=0.8)
ax.legend()