# Model training

In [318]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC, SVR
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from tabulate import tabulate
from sklearn.metrics import mean_squared_error
import math
from sklearn import tree
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay


In [319]:
#debugging stuff
def evaluate_stress_predictions(predicted_values,true_values):
    # Initialize counters
    correct_predictions = 0
    overestimated = 0
    underestimated = 0

    # Compare true and predicted values
    for true, pred in zip(true_values, predicted_values):
        if true == pred:
            correct_predictions += 1
        elif true == 'low' and pred == 'medium':
            underestimated += 1
        elif true == 'medium' and pred == 'low':
            overestimated += 1
        elif true == 'medium' and pred == 'high':
            underestimated += 1
        elif true == 'high' and pred == 'medium':
            overestimated += 1
        elif true == 'high' and pred == 'low':
            overestimated += 1
        elif true == 'low' and pred == 'high':
            underestimated += 1

    # Return counts
    return correct_predictions, overestimated, underestimated
    

def evaluate_age_predictions(predicted_values, true_values, error):
    # Initialize counters
    correct_predictions = 0
    overestimated = 0
    underestimated = 0
    
    # Compare true and predicted values
    for true, pred in zip(true_values, predicted_values):
        true = round(true)
        pred = round(pred)
        if abs(pred - true) <= error:  # Check if the absolute difference is within the error
            correct_predictions += 1
        elif pred > true:
            overestimated += 1
        else:
            underestimated += 1
    
    return correct_predictions, overestimated, underestimated


def load_dataset():
    train_data = pd.read_csv('./dataset/train_dataset.csv')
    test_data = pd.read_csv('./dataset/test_dataset.csv')
    # Preprocess training and testing dataset
    train_data['bedtime'] = pd.to_datetime(train_data['bedtime']).dt.hour
    train_data['activity_intensity'] = train_data['activity_intensity'].map({'low': 0, 'moderate': 1, 'high': 2})

    test_data['bedtime'] = pd.to_datetime(test_data['bedtime']).dt.hour
    test_data['activity_intensity'] = test_data['activity_intensity'].map({'low': 0, 'moderate': 1, 'high': 2})
    return train_data, test_data


train_data, test_data = load_dataset()

def get_vars(c="stress"):
    #train_data, test_data = load_dataset()
    
    if(c == "stress"):
        train_features = train_data[['bedtime', 'activity_intensity', 'byear']]
        test_features = test_data[['bedtime', 'activity_intensity', 'byear']]
        
        train_target = train_data['stress_level']
        test_target = test_data['stress_level']
      
        
    else:
        train_features = train_data[['stress', 'activity_intensity', 'byear']]
        test_features = test_data[['stress', 'activity_intensity', 'byear']]
        
        train_target = train_data['mental_age']
        test_target = test_data['mental_age']
        
        
    return train_features, train_target, test_features, test_target

def print_stats(pred,test,banner,c="stress"):
    '''
    try:
        head=['Accuracy',accuracy_score(pred,test)]
    except:
        mse = round(mean_squared_error(test, pred),2)
        head= ['Mean Squared Error', str(mse) + " ("+str(math.sqrt(mse))+")"]
    '''
    if(c== "stress"):
        head=['Accuracy',accuracy_score(pred,test)]
    else:
        mse = round(mean_squared_error(test, pred),2)
        head= ['Mean Squared Error', str(mse) + " ("+str(math.sqrt(mse))+")"]
    
    if(c=="stress"):
        correct, over, under = evaluate_stress_predictions(pred, test)
    else:
        #correct, over, under = evaluate_age_predictions(pred, test,max(1,int(math.sqrt(mse))))
        correct, over, under = evaluate_age_predictions(pred, test,int(math.sqrt(mse)))
        
    data = [head,
        ["Correct Predictions:", str(correct)+"/"+str(len(test))],
        ["Overestimated:",  str(over)+"/"+str(len(test))],
        ["Underestimated:",  str(under)+"/"+str(len(test))]]
    print("\n\n"+tabulate(data, headers=[banner, ""]))

# Decision tree

### Stress

In [320]:
train_features, train_target, test_features, test_target = get_vars()

clf = DecisionTreeClassifier()
clf.fit(train_features, train_target) #Train the classifier
pred_target = clf.predict(test_features) #Make predictions

print_stats(pred_target,test_target ,"Decision tree stress") #Evaluates


# Hyper parameters
param_grid = {"max_depth":[None,1,2,3,4], 'min_samples_leaf': [1,2,3], 'min_samples_split': [2,3,4]}

prog_tree_sh = GridSearchCV(clf, param_grid=param_grid, cv=5).fit(train_features, train_target)
pred_target = prog_tree_sh.best_estimator_.predict(test_features) #Make predictions

print_stats(pred_target,test_target ,"Decision tree stress best_params") #Evaluates
print(prog_tree_sh.best_params_)
#tree.plot_tree(prog_tree_sh.best_estimator_)



Decision tree stress
----------------------  -------------------
Accuracy                0.42857142857142855
Correct Predictions:    21/49
Overestimated:          22/49
Underestimated:         6/49


Decision tree stress best_params
----------------------------------  ------------------
Accuracy                            0.3877551020408163
Correct Predictions:                19/49
Overestimated:                      25/49
Underestimated:                     5/49
{'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}


### Mental_age

In [321]:
train_features, train_target, test_features, test_target = get_vars(c="age")

clf = DecisionTreeRegressor(random_state=42)
clf.fit(train_features, train_target) #Train the classifier
pred_target = clf.predict(test_features) #Make predictions

print_stats(test_target,pred_target,"Decision tree mental age","age") #Evaluates

# Hyper parameters
param_grid = {"max_depth":[None,1,2,3,4,5,6,7,8,9], 'min_samples_leaf': [1,2,3,4,5], 'min_samples_split': [2,3,4,5]}

prog_tree_sh = GridSearchCV(clf, param_grid=param_grid, cv=5).fit(train_features, train_target)
pred_target = prog_tree_sh.best_estimator_.predict(test_features) #Make predictions

print_stats(pred_target,test_target ,"Decision tree mental age best_params","age") #Evaluates
print(prog_tree_sh.best_params_)



Decision tree mental age
--------------------------  -------------------------
Mean Squared Error          1.08 (1.0392304845413265)
Correct Predictions:        41/49
Overestimated:              4/49
Underestimated:             4/49


Decision tree mental age best_params
--------------------------------------  -------------------------
Mean Squared Error                      1.11 (1.0535653752852738)
Correct Predictions:                    41/49
Overestimated:                          4/49
Underestimated:                         4/49
{'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [322]:
#clf.get_params()#hyperparam

# Random forest

### Stress

In [323]:
train_features, train_target, test_features, test_target = get_vars()

rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(train_features, train_target) #Train the classifier
pred_target = rf_clf.predict(test_features) #Make predictions

print_stats(test_target, pred_target,"Random forest stress") #Evaluates

# Hyper parameters
param_grid = {"max_depth":[None,1,2,3,4], 'min_samples_leaf': [1,2,3], 'min_samples_split': [2,3,4]}

prog_tree_sh = GridSearchCV(rf_clf, param_grid=param_grid, cv=5).fit(train_features, train_target)
pred_target = prog_tree_sh.best_estimator_.predict(test_features) #Make predictions

    
print_stats(pred_target,test_target ,"Random forest stress best_params") #Evaluates
print(prog_tree_sh.best_params_)



Random forest stress
----------------------  -------------------
Accuracy                0.42857142857142855
Correct Predictions:    21/49
Overestimated:          6/49
Underestimated:         22/49


Random forest stress best_params
----------------------------------  ------------------
Accuracy                            0.4489795918367347
Correct Predictions:                22/49
Overestimated:                      25/49
Underestimated:                     2/49
{'max_depth': 2, 'min_samples_leaf': 3, 'min_samples_split': 2}


### Mental_age

In [324]:
train_features, train_target, test_features, test_target = get_vars(c="age")

rf_clf = RandomForestRegressor(random_state=42)
rf_clf.fit(train_features, train_target) #Train the classifier
pred_target = rf_clf.predict(test_features) #Make predictions

print_stats(test_target, pred_target,"Random forest age","age") #Evaluates

# Hyper parameters
param_grid = {"max_depth":[None,1,2,3,4], 'min_samples_leaf': [1,2,3], 'min_samples_split': [2,3,4]}

prog_tree_sh = GridSearchCV(rf_clf, param_grid=param_grid, cv=5).fit(train_features, train_target)
pred_target = prog_tree_sh.best_estimator_.predict(test_features) #Make predictions


print_stats(pred_target,test_target ,"Random forest mental age best_params", "age") #Evaluates
print(prog_tree_sh.best_params_)



Random forest age
--------------------  ------------------------
Mean Squared Error    1.22 (1.104536101718726)
Correct Predictions:  42/49
Overestimated:        4/49
Underestimated:       3/49


Random forest mental age best_params
--------------------------------------  ------------------------
Mean Squared Error                      1.8 (1.3416407864998738)
Correct Predictions:                    41/49
Overestimated:                          2/49
Underestimated:                         6/49
{'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2}


# Support vector

### Stress

In [325]:
train_features, train_target, test_features, test_target = get_vars()

svm_clf = SVC(kernel='linear', random_state=42)  #You can choose different kernels as well
svm_clf.fit(train_features, train_target) #Train the classifier
pred_target = svm_clf.predict(test_features) #Make predictions

print_stats(test_target, pred_target,"Support Vector stress") #Evaluates




Support Vector stress
-----------------------  ------------------
Accuracy                 0.4489795918367347
Correct Predictions:     22/49
Overestimated:           0/49
Underestimated:          27/49


### Mental_age

In [326]:
train_features, train_target, test_features, test_target = get_vars(c="age")

svm_clf = SVR(kernel='linear') 
svm_clf.fit(train_features, train_target) #Train the classifier
pred_target = svm_clf.predict(test_features) #Make predictions

print_stats(test_target, pred_target,"Support Vector mental age","age") #Evaluates



Support Vector mental age
---------------------------  ------------------------
Mean Squared Error           0.08 (0.282842712474619)
Correct Predictions:         44/49
Overestimated:               2/49
Underestimated:              3/49
