In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from typing import Union


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [78]:
#load the data
df = pd.read_csv('Datasets/feature_engineered_data_classification.csv')
df.head()

Unnamed: 0,mood,circumplex.arousal,circumplex.valence,activity,screen,call,sms,appCat.builtin,appCat.communication,appCat.entertainment,...,AS14.26,AS14.27,AS14.28,AS14.29,AS14.30,AS14.31,AS14.32,AS14.33,mood_target,date
0,0.666667,0.544964,0.790419,0.016944,2.005873e-18,7.412061000000001e-17,8.971913e-09,0.042151,1.301351e-14,5.3e-05,...,0,0,0,0,0,0,0,0,7.0,2014-03-08
1,0.688889,0.920466,0.790419,0.147492,2.005873e-18,0.9577064,8.971913e-09,0.042151,1.301351e-14,5.3e-05,...,0,0,0,0,0,0,0,0,7.0,2014-03-09
2,0.688889,0.083616,0.961064,0.016944,2.005873e-18,7.412061000000001e-17,8.971913e-09,0.042151,1.301351e-14,5.3e-05,...,0,0,0,0,0,0,0,0,7.0,2014-03-09
3,0.703704,0.888636,0.375675,0.147492,2.005873e-18,7.412061000000001e-17,8.971913e-09,0.042151,1.301351e-14,5.3e-05,...,0,0,0,0,0,0,0,0,7.0,2014-03-10
4,0.644444,0.083616,0.323746,0.016944,2.005873e-18,7.412061000000001e-17,0.9997626,0.042151,1.301351e-14,5.3e-05,...,0,0,0,0,0,0,0,0,7.0,2014-03-10


In [79]:
test_cols = []

for col in df.columns:
    if col[:2] == 'AS':
        continue
    elif col[:3] == 'app':
        continue
    elif col[:3] == 'lat':
        continue
    elif col[-1] == '2' or col[-1] == '3' or col[-1] == '4' or col[-1] == '5' or col[-1] == '6' or col[-1] == '7':
        continue

    else:
        test_cols.append(col)


#late date be first column
test_cols = test_cols[:-1]
print(df.shape)
test_cols

(1296, 121)


['mood',
 'circumplex.arousal',
 'circumplex.valence',
 'activity',
 'screen',
 'call',
 'sms',
 'mood_prev_1',
 'mood_absolute_change',
 'mood_absolute_change_prev_1',
 'mood_pct_change',
 'mood_pct_change_prev_1',
 'circumplex.valence_prev_1',
 'circumplex.valence_absolute_change',
 'circumplex.valence_absolute_change_prev_1',
 'circumplex.arousal_prev_1',
 'circumplex.arousal_absolute_change',
 'circumplex.arousal_absolute_change_prev_1',
 'activity_prev_1',
 'activity_absolute_change',
 'activity_absolute_change_prev_1',
 'year',
 'month',
 'day_of_month',
 'day_of_week',
 'is_holiday',
 'is_weekend',
 'days_until_weekend',
 'mood_target']

In [80]:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=10, max_train_size=200, test_size=None)
print(tscv)

#drop dates
df = df.drop(['date'], axis=1)

X = df.drop(['mood_target'], axis=1)
y = df['mood_target']

TimeSeriesSplit(gap=0, max_train_size=200, n_splits=10, test_size=None)


### Initialise

In [81]:
#maximum number of models to try (runtime)
max_models = 10

#set folds and equal trainingsize to testsize (balanced training)
n_splits = 5
t_size = len(y) // (n_splits + 1)

# Initialize time series cross-validation object
tscv = TimeSeriesSplit(n_splits=n_splits, max_train_size=t_size)

for train_index, test_index in tscv.split(X):
    print("TRAIN:", len(train_index), "TEST:", len(test_index))
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]


TRAIN: 216 TEST: 216
TRAIN: 216 TEST: 216
TRAIN: 216 TEST: 216
TRAIN: 216 TEST: 216
TRAIN: 216 TEST: 216


### Evaluation


In [87]:
def evaluation(true, pred):
    acc = accuracy_score(true, pred)
    f1= f1_score(true, pred, average='weighted')

    return {'accuracy': acc, 'f1': f1}

### Models

In [88]:
# from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

#set up models
models_collection = {
    'rfc' : RandomForestClassifier(class_weight='balanced_subsample')
    # 'xgb' : XGBClassifier(),
    # 'gnb' : GaussianNB(),
    # 'knn' : KNeighborsClassifier(),
    # 'svm' : svm.SVC(),
}

In [90]:
# Initialize empty list to store cross-validation results
cv_results = dict(list())

# Loop through each cross-validation fold and fit model on training data
for train_index, test_index in tscv.split(X):
    # Split data into training and test sets
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Loop over all models
    for model_name, model in models_collection.items():

        # Fit model on training data
        model.fit(X_train, y_train)

        # Predict target variable on test data
        y_pred = model.predict(X_test)

        # Compute f1-score on test data
        score = evaluation(y_test, y_pred)

        # Append cross-validation result to list
        if model_name in cv_results:
            cv_results[model_name].append(score)
        else:
            cv_results[model_name] = [score]


#loop over models and print results for each evaluation metric
for model_name in models_collection.keys():
    print(f'Model: {model_name}')
    print('  Results:', [round(res['f1'], 3) for res in cv_results[model_name]])
    print('  avg f1:', round(sum([res['f1'] for res in cv_results[model_name]]) / len(cv_results[model_name]), 3))
    print('  avg acc:', round(sum([res['accuracy'] for res in cv_results[model_name]]) / len(cv_results[model_name]), 3))

# Compute average f1 for all models over all cross-validation folds
# avg_f1 = {name : sum(cv_results_for_model[score]) / len(cv_results_for_model[score]) for name, cv_results_for_model in cv_results.items()}

# # Report results
# for model_name in models_collection.keys():
#     print(f'Model: {model_name}')
#     print('  Results:', [round(res, 3) for res in cv_results[model_name]])
#     print('  avg f1:', round(avg_f1[model_name], 3))


Model: rfc
  Results: [0.364, 0.41, 0.327, 0.314, 0.392]
  avg f1: 0.361
  avg acc: 0.494


In [85]:
#best model
best_res = 0.0
best_model = ''
for name, res in avg_f1.items():
    if res > best_res:
        best_res = res
        best_model = models_collection[name]
best_model, best_res

(RandomForestClassifier(class_weight='balanced_subsample'),
 0.49444444444444446)