In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from typing import Union

from pprint import pprint


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [2]:
#load the data
df = pd.read_csv('Datasets/feature_engineered_data_classification.csv')
# df = pd.read_csv('Datasets/df_good_features_classification.csv')
df.head()

Unnamed: 0,mood,circumplex.arousal,circumplex.valence,activity,screen,call,sms,appCat.builtin,appCat.communication,appCat.entertainment,...,AS14.26,AS14.27,AS14.28,AS14.29,AS14.30,AS14.31,AS14.32,AS14.33,mood_target,date
0,0.666667,0.544964,0.790419,0.016944,2.005873e-18,7.412061000000001e-17,8.971913e-09,0.042151,1.301351e-14,5.3e-05,...,0,0,0,0,0,0,0,0,7.0,2014-03-08
1,0.688889,0.920466,0.790419,0.147492,2.005873e-18,0.9577064,8.971913e-09,0.042151,1.301351e-14,5.3e-05,...,0,0,0,0,0,0,0,0,7.0,2014-03-09
2,0.688889,0.083616,0.961064,0.016944,2.005873e-18,7.412061000000001e-17,8.971913e-09,0.042151,1.301351e-14,5.3e-05,...,0,0,0,0,0,0,0,0,7.0,2014-03-09
3,0.703704,0.888636,0.375675,0.147492,2.005873e-18,7.412061000000001e-17,8.971913e-09,0.042151,1.301351e-14,5.3e-05,...,0,0,0,0,0,0,0,0,7.0,2014-03-10
4,0.644444,0.083616,0.323746,0.016944,2.005873e-18,7.412061000000001e-17,0.9997626,0.042151,1.301351e-14,5.3e-05,...,0,0,0,0,0,0,0,0,7.0,2014-03-10


In [3]:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=10, max_train_size=200, test_size=None)
print(tscv)

#drop dates
try:
    df = df.drop(['date'], axis=1)
except:
    pass

X = df.drop(['mood_target'], axis=1)
used_columns = X.columns

y = df['mood_target']

TimeSeriesSplit(gap=0, max_train_size=200, n_splits=10, test_size=None)


In [4]:
#make column names pretty
def prettify_column_names(used_columns):
    used_columns_list = []

    for i, name in enumerate(used_columns):
        name = name.replace('circumplex.', '')
        name = name.replace('appCat.', '')
        name = name.replace('appCat.', '')
        name = name.replace('prev_.', 'prev')
        name = name.replace('late_night', 'LN')
        name = name.replace('absolute_change', 'abs_diff')
        name = name.replace('pct_change', 'pct')

        used_columns_list.append(name)
    return used_columns_list

### Initialise

In [5]:
#maximum number of models to try (runtime)
max_models = 10

#set folds and equal trainingsize to testsize (balanced training)
# n_splits = 3
# t_size = len(y) // (n_splits + 1)
# tscv = TimeSeriesSplit(n_splits=n_splits, max_train_size=t_size)


n_splits = 5
t_size = len(y) // (n_splits + 1)
tscv_gridsearch = TimeSeriesSplit(n_splits=n_splits, max_train_size=t_size)

# Initialize time series cross-validation object

for train_index, test_index in tscv_gridsearch.split(X):
    print("TRAIN:", len(train_index), "TEST:", len(test_index))
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]


TRAIN: 216 TEST: 216
TRAIN: 216 TEST: 216
TRAIN: 216 TEST: 216
TRAIN: 216 TEST: 216
TRAIN: 216 TEST: 216


### Evaluation


In [6]:
def evaluation(true, pred):
    acc = accuracy_score(true, pred)
    f1= f1_score(true, pred, average='weighted')

    return {'accuracy': acc, 'f1': f1}

### Models

In [7]:
# from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

In [8]:
def cross_validate(X, y, models_collection, tscv, evaluation):

    # Initialize empty list to store cross-validation results
    cv_results = dict(list())
    saved_models = []

    # Loop through each cross-validation fold and fit model on training data
    for train_index, test_index in tscv.split(X):
        # Split data into training and test sets
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Loop over all models
        for model_name, model in models_collection.items():

            # Fit model on training data
            model.fit(X_train, y_train)

            # Predict target variable on test data
            y_pred = model.predict(X_test)

            # Compute f1-score on test data
            score = evaluation(y_test, y_pred)

            # Append cross-validation result to list
            if model_name in cv_results:
                cv_results[model_name].append(score)
            else:
                cv_results[model_name] = [score]

            #save model
            saved_models.append(model)

    return cv_results, saved_models

In [9]:
#hyperparameter tuning
from sklearn.model_selection import GridSearchCV

model = RandomForestClassifier()

param_grid_big = {'n_estimators': [100, 200, 500, 1000],
                'max_depth': [10, 20, 50, ],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'bootstrap': [True, False]}

param_grid_small = {'n_estimators': [100, 200, 500],
                'max_depth': [5, 10, 20],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'bootstrap': [True, False]}

In [10]:
#split dataset in 3 parts: fold1, fold2, fold3
#train on fold1+2, test on fold3

def hyperparameter_tuning(X, y, estimator, tscv, param_grid, evaluation):
    # Initialize empty list to store cross-validation results
    cv_results = dict(list())
    saved_models = []

    # Loop through each cross-validation fold and fit model on training data
    for fold_nr, (train_index, test_index) in enumerate(tscv.split(X)):
        # Split data into training and test sets
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]


        # Fit model on training data
        grid_search = GridSearchCV(estimator, param_grid, cv=tscv, scoring='f1_weighted', n_jobs=-1)
        grid_search.fit(X_train, y_train)

        # Predict target variable on test data
        y_pred = grid_search.predict(X_test)

        # Compute f1-score on test data
        score = evaluation(y_test, y_pred)
        pprint(score)

        # Append cross-validation result to list
        save_as = f'grid_search_{fold_nr}'

        cv_results[save_as] = score

        #save model
        saved_models.append(grid_search)

    return cv_results, saved_models

In [11]:
train_size = int(len(df) * 0.8)
test_size = len(df) - train_size

X_train = X.iloc[:train_size]
y_train = y.iloc[:train_size]

X_test = X.iloc[train_size:]
y_test = y.iloc[train_size:]

#print sizes
print('train size:', len(X_train))
print('test size:', len(X_test))


train size: 1036
test size: 260


In [12]:

cv_results, saved_models =  hyperparameter_tuning(X_train, y_train, RandomForestClassifier(), tscv_gridsearch, param_grid_big, evaluation)

{'accuracy': 0.38372093023255816, 'f1': 0.32525665670685416}
{'accuracy': 0.5232558139534884, 'f1': 0.41054866257543915}
{'accuracy': 0.5290697674418605, 'f1': 0.4000147890708767}
{'accuracy': 0.5, 'f1': 0.3444664357578756}
{'accuracy': 0.3953488372093023, 'f1': 0.32843582927611525}


In [13]:
import os

#anounce aloud when finished
os.system('say "your program has finished"')

0

#### Optimised params

In [18]:
#make table of best parameters
best_params = []
for i in range(len(saved_models)):
    best_params.append(saved_models[i].best_params_)
best_params = pd.DataFrame(best_params)

#add which fold it was
best_params['fold_nr'] = [f'{i}' for i in range(len(saved_models))]

#add f1 score
best_params['f1'] = [round(cv_results[f'grid_search_{i}']['f1'], 2) for i in range(len(saved_models))]

#reorder
best_params = best_params[['fold_nr', 'bootstrap', 'max_depth', 'min_samples_leaf', 'min_samples_split', 'n_estimators', 'f1']]
best_params

Unnamed: 0,fold_nr,bootstrap,max_depth,min_samples_leaf,min_samples_split,n_estimators,f1
0,0,False,10,1,10,100,0.33
1,1,True,20,1,5,200,0.41
2,2,False,20,4,5,100,0.4
3,3,True,50,2,2,100,0.34
4,4,False,20,2,2,100,0.33


In [26]:
# 0   fold_nr	bootstrap	max_depth	min_samples_leaf	min_samples_split	n_estimators	f1
# 0	0	False	10	1	10	100	0.33
# 1	1	True	20	1	5	200	0.41
# 2	2	False	20	4	5	100	0.40
# 3	3	True	50	2	2	100	0.34
# 4	4	False	20	2	2	100	0.33
best = {'bootstrap': False,
 'max_depth': 24,
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 123}

xx = [str(val) for val in best.values()]

print('\t'.join(xx))

False	24	2	5	123


In [20]:
#find best parameters for whole dataset by wheigting the f1 scores
optimised_params = dict()
for param in best_params.columns:
    weighted_values = []
    if param != 'fold_nr' and param != 'f1':
        for i, value in enumerate(best_params[param]):
            weighted_value = best_params['f1'][i] * value
            weighted_values.append(weighted_value)
    
        optimised_params[param] = int(round(sum(weighted_values) / sum(best_params['f1']), 0))

#map binary values to True/False
optimised_params['bootstrap'] = optimised_params['bootstrap'] == 1

optimised_params

{'bootstrap': False,
 'max_depth': 24,
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 123}

### Find results on test set

In [22]:
#use optimised params for final model
final_model = RandomForestClassifier(**optimised_params)
final_model.fit(X_train, y_train)

#predict on test set
#predict on test set
y_pred = final_model.predict(X_test)

pprint(evaluation(y_test, y_pred))

#write classification report
print(classification_report(y_test, y_pred))

{'accuracy': 0.5153846153846153, 'f1': 0.3870730068119778}
              precision    recall  f1-score   support

         5.0       0.00      0.00      0.00         7
         6.0       0.00      0.00      0.00        51
         7.0       0.53      0.94      0.68       138
         8.0       0.29      0.07      0.11        59
         9.0       0.00      0.00      0.00         5

    accuracy                           0.52       260
   macro avg       0.16      0.20      0.16       260
weighted avg       0.35      0.52      0.39       260



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Best Features Finder

In [114]:
pretty_columns = prettify_column_names(used_columns)

#initialize
rf = saved_models[0]

# Get feature importances
importances = rf.feature_importances_

# Sort feature importances in descending order
indices_of_sorted_cols = np.argsort(importances)[::-1]

# Print feature ranking
print("Feature ranking:")

feats_and_importances = {}

for i in range(X.shape[1]):
    feats_and_importances[pretty_columns[indices_of_sorted_cols[i]]] = []

    tabs = '\t' * (4 - round(len(pretty_columns[indices_of_sorted_cols[i]]) / 8))

    print(f"{i+1}.\t {pretty_columns[indices_of_sorted_cols[i]]}{tabs} ({importances[indices_of_sorted_cols[i]]})")


AttributeError: 'GridSearchCV' object has no attribute 'feature_importances_'

In [118]:
good_features = [used_columns[indices_of_sorted_cols[i]] for i in range(X.shape[1]) if importances[indices_of_sorted_cols[i]] > 0.005]

#save the good features to a file
with open('good_features.txt', 'w') as f:
    for item in good_features:
        f.write(item + '\n')

new_df = df[good_features]
new_df['mood_target'] = df['mood_target']
#save as csv
new_df.to_csv('Datasets/df_good_features.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['mood_target'] = df['mood_target']


In [46]:

for rf in saved_models:

    # Get feature importances
    importances = rf.feature_importances_

    # Sort feature importances in descending order
    indices_of_sorted_cols = np.argsort(importances)[::-1]

    # Print feature ranking
    print("Feature ranking:")

    for i in range(X.shape[1]):
        
        feats_and_importances[pretty_columns[indices_of_sorted_cols[i]]].append(importances[indices_of_sorted_cols[i]])

        tabs = '\t' * (4 - round(len(pretty_columns[indices_of_sorted_cols[i]]) / 8))

        print(f"{i+1}.\t {pretty_columns[indices_of_sorted_cols[i]]}{tabs} ({importances[indices_of_sorted_cols[i]]})")


Feature ranking:
1.	 activity_prev_3		 (0.027489160149106438)
2.	 screen			 (0.02665165524653254)
3.	 builtin			 (0.023001574308468095)
4.	 arousal			 (0.022096436523124402)
5.	 arousal_abs_diff_prev_2	 (0.02061001098656542)
6.	 activity_abs_diff		 (0.020085875899280643)
7.	 arousal_abs_diff_prev_3	 (0.01955207816550122)
8.	 valence_prev_3		 (0.018963757446203246)
9.	 mood_pct_prev_6		 (0.018286995662251705)
10.	 arousal_abs_diff_prev_1	 (0.018048206943516998)
11.	 activity_prev_1		 (0.01792192728470616)
12.	 valence_abs_diff		 (0.017257332688529987)
13.	 arousal_abs_diff_prev_4	 (0.017222855378867266)
14.	 activity_abs_diff_prev_1	 (0.017214639965797945)
15.	 communication		 (0.017184898841414095)
16.	 activity_abs_diff_prev_2	 (0.0168826325265078)
17.	 activity_abs_diff_prev_4	 (0.016594989603824845)
18.	 activity_prev_2		 (0.01655242090316779)
19.	 activity_prev_4		 (0.016210176491805336)
20.	 mood_prev_7			 (0.015902019154442868)
21.	 valence_abs_diff_prev_2	 (0.015837318699511303)

In [47]:
#sum up importances
for key in feats_and_importances.keys():
    feats_and_importances[key] = sum(feats_and_importances[key]) / len(feats_and_importances[key])

#sort
sorted_feats_and_importances = {k: v for k, v in sorted(feats_and_importances.items(), key=lambda item: item[1], reverse=True)}

In [50]:
from pprint import pprint
pprint(sorted_feats_and_importances, sort_dicts=False)

{'activity_prev_3': 0.02748916014910644,
 'screen': 0.02665165524653254,
 'builtin': 0.023001574308468095,
 'arousal': 0.022096436523124402,
 'arousal_abs_diff_prev_2': 0.02061001098656542,
 'activity_abs_diff': 0.020085875899280643,
 'arousal_abs_diff_prev_3': 0.01955207816550122,
 'valence_prev_3': 0.018963757446203246,
 'mood_pct_prev_6': 0.018286995662251705,
 'arousal_abs_diff_prev_1': 0.018048206943516998,
 'activity_prev_1': 0.01792192728470616,
 'valence_abs_diff': 0.017257332688529987,
 'arousal_abs_diff_prev_4': 0.017222855378867266,
 'activity_abs_diff_prev_1': 0.017214639965797945,
 'communication': 0.017184898841414095,
 'activity_abs_diff_prev_2': 0.0168826325265078,
 'activity_abs_diff_prev_4': 0.016594989603824845,
 'activity_prev_2': 0.01655242090316779,
 'activity_prev_4': 0.016210176491805336,
 'mood_prev_7': 0.015902019154442868,
 'valence_abs_diff_prev_2': 0.015837318699511303,
 'other': 0.015556033852024715,
 'mood_abs_diff_prev_6': 0.01527428751097433,
 'valence'

In [14]:
#best model
best_res = 0.0
best_model = ''
for name, res in avg_f1.items():
    if res > best_res:
        best_res = res
        best_model = models_collection[name]
best_model, best_res

NameError: name 'avg_f1' is not defined