In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import dask
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.metrics import classification_report


In [None]:
# Read the daily data: one DataFrame per file in the data directory.
# Files are visited in sorted filename order so that positions in
# `processed_dfs` are stable from run to run.
data_dir = 'data/aggregated_individual_data/'
processed_dfs = []
for filename in sorted(os.listdir(data_dir)):
    path = os.path.join(data_dir, filename)
    # os.listdir also returns hidden files and sub-directories
    # (e.g. .DS_Store, .ipynb_checkpoints); skip them instead of crashing.
    if filename.startswith('.') or not os.path.isfile(path):
        continue
    df = pd.read_csv(path)
    # 'Unnamed: 0' is the name read_csv gives to a header-less first column
    # (typically a saved index). Tolerate files that do not have it.
    df = df.drop(columns='Unnamed: 0', errors='ignore')
    processed_dfs.append(df)

In [None]:
# Independent copy of the second loaded frame; its column layout is used as
# the reference when the modelling table is built.
example = processed_dfs[1].copy(deep=True)
example.head(n=5)

In [None]:
# Column layout of the modelling table. The first 21 names presumably mirror
# the columns of the per-file frames (verify against the CSVs); the last five
# are filled in by this cell:
#   mood_2days       mood two rows before the target row
#   mood_yesterday   mood one row before the target row
#   1day_mood_change mood_yesterday - mood_2days
#   mood_target      mood on the target row
#   target_bw        mood_target - mood_yesterday
# NOTE: `vars` shadows the Python builtin of the same name; the name is kept
# because other cells may refer to it.
vars = ['time', 'weekday', 'mood', 'circumplex.arousal', 'circumplex.valence',
       'activity', 'screen', 'call', 'sms', 'appCat.builtin',
       'appCat.communication', 'appCat.entertainment', 'appCat.finance',
       'appCat.game', 'appCat.office', 'appCat.other', 'appCat.social',
       'appCat.travel', 'appCat.unknown', 'appCat.utilities', 'appCat.weather', 'mood_2days', 'mood_yesterday', '1day_mood_change', 'mood_target', 'target_bw' ]

n_days = 5  # number of consecutive rows averaged into one sample

# Rows are collected as plain dicts and turned into a DataFrame once at the
# end. Growing a DataFrame with pd.concat inside the loop is quadratic, and
# concatenating onto an empty frame is deprecated in recent pandas.
records = []

for df_org in processed_dfs:
    df = df_org.copy()

    # Min-max scale every column from position 2 onwards, separately per file.
    # A constant column yields 0/0 = NaN; those and any missing values become 0.
    # NOTE(review): min/max are taken over the whole series, so each scaled
    # value (including the target) depends on later rows - confirm this is
    # acceptable for the evaluation.
    for col in example.columns[2:len(vars)]:
        col_min = df[col].min()
        col_range = df[col].max() - col_min
        df[col] = ((df[col] - col_min) / col_range).fillna(0)

    for j in range(len(df) - n_days - 1):
        # The window covers rows j .. j+n_days-1; the target is two rows after
        # the end of the window, so "yesterday" is not part of the average.
        target_pos = j + n_days + 1
        row = df.iloc[j:j + n_days, 2:len(example.columns)].mean().to_dict()
        # Values are written by column name. The previous positional writes
        # (iloc[-1, -5] etc.) landed in the wrong columns whenever an input
        # frame carried a column that is not listed in `vars`.
        row['time'] = df['time'].iloc[target_pos]
        row['weekday'] = df['weekday'].iloc[target_pos]
        row['mood_2days'] = df['mood'].iloc[target_pos - 2]
        row['mood_yesterday'] = df['mood'].iloc[target_pos - 1]
        row['mood_target'] = df['mood'].iloc[target_pos]
        records.append(row)

# columns=vars fixes the column order (and discards anything not in `vars`),
# so later positional slices such as iloc[:, 1:-2] stay valid.
forest_data = pd.DataFrame(records, columns=vars)

# Delete columns that are 0 in more than half of the rows. Only zeros are
# counted: missing values in the scaled columns were already mapped to 0
# above, and the two derived columns (still all-NaN here) must survive so
# they keep their position in the table.
sparse_cols = [col for col in forest_data.columns
               if (forest_data[col] == 0).mean() > 0.5]
forest_data = forest_data.drop(columns=sparse_cols)

# Derived columns; both already exist, so assignment keeps their position.
forest_data['target_bw'] = forest_data['mood_target'] - forest_data['mood_yesterday']
forest_data['1day_mood_change'] = forest_data['mood_yesterday'] - forest_data['mood_2days']

In [None]:
# Distribution of the scaled, window-averaged 'sms' feature.
# The sparse-column filter in the previous cell removes any column that is 0
# in more than half of the rows, so 'sms' may no longer exist; guard the
# lookup instead of failing with a KeyError.
if 'sms' in forest_data.columns:
    forest_data['sms'].hist()
else:
    print("'sms' was dropped from forest_data; nothing to plot")

In [None]:
# labels = ['vl', 'l', 'm', 'h', 'vh']
# # build classes
# for i, col in enumerate(forest_data.columns[2:len(vars)]):
#     forest_data[col+'class'] = pd.cut(forest_data[col], 5, labels=labels)

In [None]:
# Quick look at the first rows of the assembled modelling table.
forest_data.head(n=5)

In [None]:
# delete constant columns
# forest_data = forest_data.loc[:, (forest_data != forest_data.iloc[0]).any()] 
# len(forest_data.columns)

In [None]:
# Hand-picked feature subset; only used by the disabled alternative below.
wood = ['weekday', 'mood',
       'activity', 'screen', 'call', 'sms', 'mood_2days', 'mood_yesterday', '1day_mood_change']

# Features: every column except the timestamp and the two target columns.
# Selecting by name instead of by position (previously iloc[:, 1:-2]) keeps
# the targets out of X even if the column order of forest_data changes.
# Alternative: X = forest_data.loc[:, wood].copy()
non_feature_cols = ['time', 'mood_target', 'target_bw']
X = forest_data.drop(columns=non_feature_cols)

# Bin labels for the disabled classification variant of the target:
#   y = pd.cut(forest_data['mood_target'], 5, labels=labels).to_numpy()
labels = ['vl', 'l', 'm', 'h', 'vh']

# Regression target: the (min-max scaled) mood on the target row.
y = forest_data['mood_target'].copy()

In [None]:
# Histogram of the regression target, to see how its values are spread.
target_values = y
plt.hist(target_values)

In [None]:
# Preview the slice of forest_data that excludes the first column and the
# last two columns.
feature_preview = forest_data.iloc[:, 1:-2]
feature_preview.head()

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and every metric computed from it, reproducible across kernel restarts.
# NOTE(review): the rows are overlapping sliding windows, so a shuffled split
# can put near-identical windows into both sets; consider splitting by time
# or by source file instead.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [None]:
# Hyperparameter tuning
# scipy's randint excludes the upper bound: 10..99 trees, depth 1..14.
param_dist = {'n_estimators': randint(10, 100),
              'max_depth': randint(1, 15)}

# Create a random forest regressor. The seed makes each fitted forest
# reproducible.
rf = RandomForestRegressor(n_jobs=-1, random_state=42)

# Use random search to find the best hyperparameters. The seed fixes which
# 100 parameter combinations are sampled, so the chosen "best" setting does
# not change from run to run. Scoring and cross-validation use the
# scikit-learn defaults.
rand_search = RandomizedSearchCV(rf,
                                 param_distributions=param_dist,
                                 n_iter=100,
                                 random_state=42)

# Fit the random search object to the training data only.
rand_search.fit(X_train, y_train)

In [None]:
# Keep the winning estimator from the search and report its settings.
best_params = rand_search.best_params_
best_rf = rand_search.best_estimator_
print('Best hyperparameters:', best_params)

In [None]:
# Final model with hand-picked hyperparameters (the tuned estimator can be
# swapped in through the commented line below). random_state is fixed so the
# reported errors and feature importances are reproducible.
rf = RandomForestRegressor(max_depth=5, n_estimators=50, n_jobs=-1,
                           random_state=42)
#rf = best_rf

In [None]:
# Train the forest on the training split.
rf = rf.fit(X=X_train, y=y_train)

In [None]:
# Predictions on both splits: training predictions give the fit error,
# test predictions give the hold-out error.
y_train_pred = rf.predict(X_train)
y_pred = rf.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the training and the hold-out split.
MSE_train = mean_squared_error(y_train, y_train_pred)
MSE_test = mean_squared_error(y_test, y_pred)

# Printed in the order: test error, then training error.
for mse_value in (MSE_test, MSE_train):
    print(mse_value)

In [None]:
# Table of the fitted forest's feature importances, labelled with the
# training column names and sorted from most to least important.
feature_importances = (
    pd.DataFrame({'importance': rf.feature_importances_}, index=X_train.columns)
    .sort_values(by='importance', ascending=False)
)

# Horizontal bar chart with one bar per feature.
fig = plt.figure()
plt.barh(feature_importances.index, feature_importances['importance'])

plt.show()
#fig.savefig('rf_importance.pdf')