In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [7]:
from fastai.tabular.all import *
from sklearn.model_selection import train_test_split#, StratifiedKFold

In [8]:
import warnings; 
# Install a blanket 'ignore' filter: every warning category is suppressed from
# here on, which also hides deprecation notices emitted by the libraries used below.
warnings.simplefilter('ignore')

In [9]:
from sklearn.metrics import f1_score, mean_squared_error, classification_report, confusion_matrix, \
    ConfusionMatrixDisplay
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


def eval_clf(y_test, y_pred):
    """Print a classification report, draw a confusion matrix and return micro-F1.

    Parameters
    ----------
    y_test, y_pred : array-like, pandas Series or pandas DataFrame
        True and predicted labels. pandas inputs are cast to ``str`` first so
        that numeric values (e.g. 6.5, 7.0) are treated as discrete class labels.

    Returns
    -------
    float
        Micro-averaged F1 score rounded to two decimals.

    Side effects
    ------------
    Prints ``classification_report`` and plots the confusion matrix on a new
    14x6 inch matplotlib figure.
    """
    if isinstance(y_test, (pd.DataFrame, pd.Series)):
        y_test = y_test.astype(str)
    if isinstance(y_pred, (pd.DataFrame, pd.Series)):
        y_pred = y_pred.astype(str)

    print(classification_report(y_test, y_pred))

    # One *sorted* label array, shared by the matrix and its tick labels.
    # Passing an unordered set as display_labels could attach tick labels to
    # the wrong rows/columns, because confusion_matrix orders classes by
    # sorted label value.
    labels = np.unique(np.concatenate([np.ravel(y_test), np.ravel(y_pred)]))

    conf_matrix = confusion_matrix(y_test, y_pred, labels=labels)
    disp = ConfusionMatrixDisplay(conf_matrix, display_labels=labels)
    fig, ax = plt.subplots(figsize=(14, 6))
    ax.grid(False)
    disp.plot(ax=ax)

    return round(f1_score(y_test, y_pred, average='micro'), 2)


def half_round(inp):
    """Round a number, or every element of an array-like, to the closest half.

    Supported inputs are Python ``int``/``float``, numpy scalars, numpy arrays
    and pandas Series/DataFrames. Ties follow Python's / numpy's
    round-half-to-even rule applied to ``inp * 2`` (so 1.25 becomes 1.0).

    For any other type a message is printed and ``None`` is returned.

    >>> half_round(1.3)
    1.5
    >>> half_round(2.6)
    2.5
    >>> half_round(3.0)
    3.0
    >>> half_round(4.1)
    4.0
    """
    if isinstance(inp, (pd.DataFrame, pd.Series, int, float, np.integer, np.floating)):
        return round(inp * 2) / 2
    if isinstance(inp, np.ndarray):
        # The builtin round() is not reliably defined for arrays; np.round is element-wise.
        return np.round(inp * 2) / 2

    print('type is not half rounded', type(inp), inp)
    return None


def _rmse(y_true, y_pred):
    """Root-mean-squared error, averaged uniformly over output columns.

    Computed as the square root of the per-output MSE instead of
    ``mean_squared_error(..., squared=False)``, because the ``squared``
    argument was deprecated in scikit-learn 1.4 and removed in 1.6.
    """
    per_output_mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
    return np.mean(np.sqrt(per_output_mse))


def eval_regr(y_test, y_pred):
    """Compute RMSE on the raw values and on values rounded to the nearest half.

    Parameters
    ----------
    y_test, y_pred : pandas Series/DataFrame (or another type accepted by ``half_round``)
        True and predicted targets.

    Returns
    -------
    tuple
        ``(rounded_rmse, raw_rmse)``, each rounded to two decimals.

    Prints a hint when both numbers coincide, which usually means the inputs
    were already half-rounded.
    """
    raw_rmse = round(_rmse(y_test, y_pred), 2)
    rounded_rmse = round(_rmse(half_round(y_test), half_round(y_pred)), 2)

    if rounded_rmse == raw_rmse:
        print('probably, already rounded values were passed')

    return rounded_rmse, raw_rmse


def eval_all(y_test, y_pred, model_name: str):
    """Run the classification and regression evaluations and bundle the scores.

    The classification view works on half-rounded targets/predictions; the
    regression view receives the raw values. The result is a one-entry dict
    keyed by ``model_name`` holding 'f1', 'rounded_rmse' and 'rmse'.
    """
    clf_f1 = eval_clf(half_round(y_test), half_round(y_pred))
    rmse_rounded, rmse_raw = eval_regr(y_test, y_pred)
    scores = {'f1': clf_f1, 'rounded_rmse': rmse_rounded, 'rmse': rmse_raw}
    return {model_name: scores}



In [10]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), exports ``PYTHONHASHSEED`` and switches cuDNN to
    deterministic mode with benchmarking disabled.

    Parameters
    ----------
    seed : int
        Seed value shared by all generators.
    """
    # Imported here so the function does not depend on names leaking in
    # through `from fastai.tabular.all import *`.
    import random
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU (not just the current one) and
    # is a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
seed_everything(1234)

In [11]:
root = "../input/mood-prediciton"

# Read the four CSV splits; the first column of each file becomes the index.
X_train, X_test, y_train, y_test = (
    pd.read_csv(f"{root}/{stem}.csv", index_col=0)
    for stem in ("X_train", "X_test", "y_train", "y_test")
)


In [12]:
# Glue each feature frame to its target column-wise and give the target
# column (named '0' in the CSV) the name 'mood_next'.
df_train, df_test = (
    pd.concat([features, target], axis=1).rename(columns={'0': 'mood_next'})
    for features, target in ((X_train, y_train), (X_test, y_test))
)

In [52]:
# Columns handed to fastai as categorical variables.
cat_names = ['id', 'is_weekend']
# Columns handed to fastai as continuous variables.
cont_names = ['mood', 'circumplex.arousal', 'circumplex.valence', 'activity',
       'appCat.builtin', 'appCat.communication', 'appCat.entertainment',
       'appCat.finance', 'appCat.game', 'appCat.office', 'appCat.other',
       'appCat.social', 'appCat.travel', 'appCat.unknown', 'appCat.utilities',
       'appCat.weather', 'call', 'screen', 'sms', 'asleep', 'active',
       'missing_hour', 'mood_change', 'first_active', 'last_active',
       'active_hours_night', 'first_active_change', 'last_active_change',
       'prev_mood', 'sin(1,freq=W-SUN)', 'cos(1,freq=W-SUN)',
       'sin(2,freq=W-SUN)', 'cos(2,freq=W-SUN)', 'sin(3,freq=W-SUN)',
       'cos(3,freq=W-SUN)', 'day', 'dayofweek', 'dayofyear',
       'quarter']

# Regression target (the column renamed from '0' when df_train was built).
dep_var = 'mood_next'

# Only the second element of the EndSplitter result is kept; it is passed below
# as `valid_idx`. Presumably these are the last 30% of rows — confirm against
# the fastai EndSplitter documentation.
splits = EndSplitter(valid_last=True, valid_pct=0.3)(range_of(df_train))[1]
# NOTE(review): y_range=[4, 10] presumably bounds the network output to the
# observed mood range — confirm that targets never fall outside it.
tc = tabular_config(embed_p=0.05, y_range=torch.tensor([4,10]), ps=0.5)

procs = [Categorify, FillMissing, Normalize]
# NOTE(review): `log=True` is forwarded as an extra keyword argument; verify
# what it does in the installed fastai version.
dls = TabularDataLoaders.from_df(df_train, root, procs, cat_names, cont_names, dep_var, 
                                 valid_idx=splits, bs=256, log=True, y_block=RegressionBlock())
# Three hidden layers (1000, 500, 250), MSE loss, RMSE reported as the metric.
learn = tabular_learner(dls, layers=[1000, 500, 250], metrics=rmse, config=tc, loss_func=MSELossFlat())

In [53]:
#learn.summary()

In [54]:
#dls.show_batch()

In [55]:
%%capture
# Train for 25 epochs with fit_one_cycle; the %%capture cell magic above hides this cell's output.
learn.fit_one_cycle(25)

In [56]:
# Wrap the test frame in a DataLoader derived from the training DataLoaders,
# then keep only the first element returned by get_preds (the predictions).
dl = learn.dls.test_dl(df_test, bs=256)
pred = learn.get_preds(dl=dl)[0]
# No index is supplied, so y_pred gets pandas' default 0..n-1 index.
y_pred = pd.DataFrame(pred)

In [57]:
# Score the predictions; the returned dict (f1, rounded_rmse, rmse) is displayed as the cell output.
eval_all(y_test, y_pred, model_name='Fastai tabular')

In [58]:
# Give the predictions the same index as y_test so they can be selected with .loc below.
index_column = pd.DataFrame(y_test.index, columns=['index'])
test_pred = pd.concat([index_column, y_pred], axis=1).set_index('index')

# One figure per unique value of X_train['id']: train targets, test targets and test predictions.
for idx in X_train['id'].unique():
    fig, ax = plt.subplots(figsize=(16, 4))
    train_rows = X_train[X_train['id'] == idx].index
    test_rows = X_test[X_test['id'] == idx].index
    curves = (
        (y_train.loc[train_rows], 'train', 'b'),
        (y_test.loc[test_rows], 'test', 'g'),
        (test_pred.loc[test_rows], 'pred', 'r'),
    )
    for curve, curve_label, curve_colour in curves:
        ax.plot(curve, label=curve_label, c=curve_colour)
    ax.set_ylabel('Mood')
    ax.set_xlabel('Day')
    ax.legend()
    ax.set_title(idx)

In [59]:
def _plot_contiguous_runs(ax, frame, color):
    """Plot the first column of ``frame`` as one line per run of consecutive index values.

    A new line starts wherever the (integer) index does not increase by
    exactly 1, so gaps in the index are not bridged by a connecting stroke.
    Every run is plotted in full, including its first point and the final run.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    frame : pandas DataFrame with a numeric index; only its first column is drawn.
    color : matplotlib colour used for every segment.
    """
    positions = frame.index.to_numpy()
    values = frame.iloc[:, 0].to_numpy()
    if len(positions) == 0:
        return
    # Offsets at which a new run begins (index jump other than +1).
    run_starts = np.flatnonzero(np.diff(positions) != 1) + 1
    for run_x, run_y in zip(np.split(positions, run_starts), np.split(values, run_starts)):
        ax.plot(run_x, run_y, c=color)


fig, ax = plt.subplots(figsize=(70, 8))

# y_pred was built without an index (0..n-1); re-index it with y_test's index
# so predictions line up with the true values on the x axis.
y_pred_rescaled = pd.concat([pd.DataFrame(y_test.index, columns=['index']), y_pred], axis=1).set_index('index')

for frame, color, label in ((y_train, 'b', 'train'), (y_test, 'g', 'test'), (y_pred_rescaled, 'r', 'pred')):
    _plot_contiguous_runs(ax, frame, color)
    # Empty proxy line: supplies one legend entry per series without adding data points.
    ax.plot([], [], c=color, label=label)

ax.legend();

In [21]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(50,6))
# y_pred has a default 0..n-1 index while y_test keeps the index read from the
# CSV; plot both against y_test's index so the two curves share x positions.
ax.plot(y_test.index, np.ravel(y_pred), label='prediction')
ax.plot(y_test.index, np.ravel(y_test), label='true label')
ax.legend();