## Initial imports and data loading

In [1]:
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px

# ===== Data ===== #
# One CSV per recording. Keys are '<subject>_<session>': subjects are
# lag / van / bru; the 'cp' suffix maps to the 'bem_comportadas' files and
# 'li' to the 'livres' files.
# NOTE(review): presumably 'cp' = scripted poses and 'li' = free sitting - confirm.
CSV_PATHS = {
    'lag_cp': 'data/bem_comportadas_laguardia.csv',
    'lag_li': 'data/livres_laguardia.csv',
    'van_cp': 'data/bem_comportadas_vanessa.csv',
    'van_li': 'data/livres_vanessa.csv',
    'bru_cp': 'data/bem_comportadas_bruno.csv',
    'bru_li': 'data/livres_bruno.csv',
}

# Raw frames, keyed by recording.
DFS = {key: pd.read_csv(path) for key, path in CSV_PATHS.items()}

# Feature frames: every column except the 'pose' label.
DFS_X = {key: frame.drop(columns='pose') for key, frame in DFS.items()}

# Label series: the 'pose' column cast to pandas' string dtype, so the codes
# can be compared against string literals such as '0' or '12'.
DFS_Y = {key: frame['pose'].astype('string') for key, frame in DFS.items()}

## Merging data & recategorizing poses

In [8]:
def recategorize_y(y_vec):
    """Collapse raw pose codes into five coarse posture categories.

    Parameters
    ----------
    y_vec : pandas.Series
        Pose codes stored as strings ('0' ... '12').

    Returns
    -------
    pandas.Series
        A copy of ``y_vec`` (same index) in which every known code is replaced
        by its category name. Values that match no known code are left
        unchanged. ``y_vec`` itself is not modified.
    """
    code_groups = {
        'Not Sitting': ['0'],
        'Sitting Correctly': ['1', '2', '12'],
        'Leaning Forward': ['3', '6'],
        'Leaning Backward': ['7'],
        'Unbalanced': ['4', '5', '8', '9', '10', '11'],
    }

    relabeled = y_vec.copy()
    for category, codes in code_groups.items():
        # Masks are always computed on the untouched input, so an earlier
        # replacement can never influence a later one.
        relabeled[y_vec.isin(codes)] = category

    return relabeled

In [30]:
# Train on the 'cp' recordings of van and bru, test on their 'li' recordings.
train_keys = ['van_cp', 'bru_cp']
test_keys = ['van_li', 'bru_li']

X_train = pd.concat([DFS_X[key] for key in train_keys])
y_train = recategorize_y(pd.concat([DFS_Y[key] for key in train_keys]))

X_test = pd.concat([DFS_X[key] for key in test_keys])
y_test = recategorize_y(pd.concat([DFS_Y[key] for key in test_keys]))

In [31]:
# Class balance of the recategorized test labels; check this before reading
# the accuracy figures, since a rare class barely affects overall accuracy.
y_test.value_counts()

Sitting Correctly    2166
Leaning Forward      2082
Leaning Backward     1179
Unbalanced           1070
Not Sitting             6
Name: pose, dtype: Int64

## Creating the model and checking the results on the test set

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 100-tree random forest with a fixed seed so the fit is reproducible.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Report accuracy on the data the model was fitted on and on the other split.
for set_label, set_X, set_y in (
    ('Accuracy on train set: ', X_train, y_train),
    ('Accuracy on test set: ', X_test, y_test),
):
    print(set_label, accuracy_score(set_y, clf.predict(set_X)))

Accuracy on train set:  0.9996884541092903
Accuracy on test set:  0.9048131631554667


In [33]:
import plotly.express as px

# Confusion matrix of actual vs. predicted pose categories (pandas crosstab).
ct = pd.crosstab(y_test, clf.predict(X_test), rownames=['Actual'], colnames=['Predicted'])

# crosstab only emits the labels that actually occur on each axis, so a class
# that is never predicted (or never present in y_test) would make the matrix
# non-square and shift cells off the diagonal. Put both axes on the same
# sorted label set and fill the missing cells with 0.
cm_labels = sorted(set(ct.index) | set(ct.columns))
ct = ct.reindex(index=pd.Index(cm_labels, name='Actual'),
                columns=pd.Index(cm_labels, name='Predicted'),
                fill_value=0)

# plotly heatmap: rows are actual classes, columns are predicted classes.
fig = px.imshow(ct,
                labels=dict(x="Predicted", y="Actual", color="Counts"),
                color_continuous_scale='Blues')
fig.update_layout(title='Confusion Matrix')
fig.show()

## Creating the model with all data and testing

In [34]:
# Final model: train on every van/bru recording ('cp' and 'li').
all_train_keys = ['van_cp', 'bru_cp', 'van_li', 'bru_li']
X_train = pd.concat([DFS_X[key] for key in all_train_keys])
y_train = recategorize_y(pd.concat([DFS_Y[key] for key in all_train_keys]))

# NOTE: these two recordings are also part of X_train / y_train above, so any
# score computed on X_test / y_test from here on is in-sample, not held-out.
seen_test_keys = ['van_li', 'bru_li']
X_test = pd.concat([DFS_X[key] for key in seen_test_keys])
y_test = recategorize_y(pd.concat([DFS_Y[key] for key in seen_test_keys]))

# The laguardia 'li' recording is not in the training keys above, so it is
# the only evaluation set here that the model has not been fitted on.
X_test_lag = DFS_X['lag_li']
y_test_lag = recategorize_y(DFS_Y['lag_li'])

In [36]:
# Refit the same 100-tree, fixed-seed random forest on the enlarged training set.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# NOTE(review): when X_train is built from all van/bru recordings (as in the
# cell above), X_test is a subset of the training data, so the "test" figure
# is an in-sample score. The laguardia figure is the held-out estimate.
for set_label, set_X, set_y in (
    ('Accuracy on train set: ', X_train, y_train),
    ('Accuracy on test set: ', X_test, y_test),
    ('Accuracy on laguardia set: ', X_test_lag, y_test_lag),
):
    print(set_label, accuracy_score(set_y, clf.predict(set_X)))

Accuracy on train set:  0.9997339482085846
Accuracy on test set:  1.0
Accuracy on laguardia set:  0.729271523178808


In [39]:
# Confusion matrix of actual vs. predicted pose categories on the laguardia
# recording (pandas crosstab).
ct = pd.crosstab(y_test_lag, clf.predict(X_test_lag), rownames=['Actual'], colnames=['Predicted'])

# crosstab only emits the labels that actually occur on each axis, so a class
# that is never predicted (or never present in y_test_lag) would make the
# matrix non-square and shift cells off the diagonal. Put both axes on the
# same sorted label set and fill the missing cells with 0.
cm_labels = sorted(set(ct.index) | set(ct.columns))
ct = ct.reindex(index=pd.Index(cm_labels, name='Actual'),
                columns=pd.Index(cm_labels, name='Predicted'),
                fill_value=0)

# plotly heatmap: rows are actual classes, columns are predicted classes.
fig = px.imshow(ct,
                labels=dict(x="Predicted", y="Actual", color="Counts"),
                color_continuous_scale='Blues')
fig.update_layout(title='Confusion Matrix on Laguardia')
fig.show()

## Exporting the model

In [40]:
import pickle

# Serialize the fitted classifier to 'model.pkl' in the working directory.
# Pickle files can execute arbitrary code when loaded, so consumers should
# only unpickle this file from a trusted source.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)