## Initial imports and data loading

In [15]:
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px

# ===== Data ===== #
# One CSV per (subject, session) pair. Keys are '<subject>_<session>', where the
# suffix mirrors the file name: 'cp' <-> bem_comportadas_*, 'li' <-> livres_*.
DFS = {
    key: pd.read_csv(csv_path)
    for key, csv_path in (
        ('lag_cp', 'data/bem_comportadas_laguardia.csv'),
        ('lag_li', 'data/livres_laguardia.csv'),
        ('van_cp', 'data/bem_comportadas_vanessa.csv'),
        ('van_li', 'data/livres_vanessa.csv'),
        ('bru_cp', 'data/bem_comportadas_bruno.csv'),
        ('bru_li', 'data/livres_bruno.csv'),
        ('alm_cp', 'data/bem_comportadas_almir.csv'),
    )
}

# Feature frames: every column except the 'pose' label.
DFS_X = {key: frame.drop(columns='pose') for key, frame in DFS.items()}

# Label series: the 'pose' column cast to pandas' string dtype, so the codes
# can be compared against string literals such as '0' or '12'.
DFS_Y = {key: frame['pose'].astype('string') for key, frame in DFS.items()}

## Merging data & recategorizing poses

In [16]:
def recategorize_y(y_vec):
    """Collapse raw pose codes into coarse posture categories.

    Parameters
    ----------
    y_vec : pandas.Series
        Pose codes stored as strings (e.g. '0', '7', '12').

    Returns
    -------
    pandas.Series
        A copy of ``y_vec`` in which every recognised code is replaced by its
        category label. Values that match no code are left as they are, and
        the input series itself is not modified.
    """
    # (category label, raw codes that belong to it)
    groups = (
        ('Not Sitting', ['0']),
        ('Sitting Correctly', ['1', '2', '12']),
        ('Leaning Forward', ['3', '6']),
        ('Leaning Backward', ['7']),
        ('Unbalanced', ['4', '5', '8', '9', '10', '11']),
    )

    relabeled = y_vec.copy()
    for label, codes in groups:
        # Masks are always computed on the untouched input, never on the
        # partially relabeled copy.
        relabeled[y_vec.isin(codes)] = label

    return relabeled

In [17]:
# Train on the 'cp' sessions of Vanessa and Bruno, test on their 'li' sessions.
#
# ignore_index=True: every CSV is read with its own default 0..n-1 index, so a
# plain concat would repeat row labels across subjects. A fresh, unique index
# keeps later index-aligned operations (boolean masks, crosstab) unambiguous.
X_train = pd.concat([DFS_X['van_cp'], DFS_X['bru_cp']], ignore_index=True)
y_train = recategorize_y(
    pd.concat([DFS_Y['van_cp'], DFS_Y['bru_cp']], ignore_index=True)
)

X_test = pd.concat([DFS_X['van_li'], DFS_X['bru_li']], ignore_index=True)
y_test = recategorize_y(
    pd.concat([DFS_Y['van_li'], DFS_Y['bru_li']], ignore_index=True)
)

In [18]:
# Class balance of the recategorized test labels.
# NOTE(review): the saved output under this cell lists 'Incorrectly Sitting',
# a label recategorize_y never assigns; it looks like it was produced by an
# earlier labelling scheme. Re-run the notebook to refresh it.
y_test.value_counts()

Incorrectly Sitting    4331
Sitting Correctly      2172
Name: pose, dtype: Int64

## Creating the model and checking the results on the test set

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fixed random_state keeps the forest reproducible; fit() returns the estimator.
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Accuracy on the data the model was fitted on.
train_accuracy = accuracy_score(y_train, clf.predict(X_train))
print('Accuracy on train set: ', train_accuracy)

# Accuracy on the separate test split.
test_accuracy = accuracy_score(y_test, clf.predict(X_test))
print('Accuracy on test set: ', test_accuracy)

Accuracy on train set:  0.9998130724655742
Accuracy on test set:  0.8108565277564201


In [20]:
# Confusion matrix on the test split: rows = actual label, columns = predicted.
# (plotly.express is already imported as `px` in the notebook's import cell,
# so it is not imported again here.)
ct = pd.crosstab(
    y_test,
    clf.predict(X_test),
    rownames=['Actual'],
    colnames=['Predicted'],
)

# Render the counts as a heatmap.
fig = px.imshow(
    ct,
    labels=dict(x="Predicted", y="Actual", color="Counts"),
    color_continuous_scale='Blues',
)
fig.update_layout(title='Confusion Matrix')
fig.show()

## Creating the model with all data and testing

In [21]:
# Training data for the final model: both the 'cp' and the 'li' sessions of
# Vanessa and Bruno. `.values` turns the feature frame into a plain NumPy array.
X_train = pd.concat(
    [DFS_X['van_cp'], DFS_X['bru_cp'], DFS_X['van_li'], DFS_X['bru_li']]
).values
y_train = recategorize_y(
    pd.concat([DFS_Y['van_cp'], DFS_Y['bru_cp'], DFS_Y['van_li'], DFS_Y['bru_li']])
)

# NOTE(review): these 'li' sessions are also part of X_train above, so any
# score computed on this split is not a held-out estimate.
X_test = pd.concat([DFS_X['van_li'], DFS_X['bru_li']]).values
y_test = recategorize_y(pd.concat([DFS_Y['van_li'], DFS_Y['bru_li']]))

# Laguardia's 'li' session is not included in the training concat above.
X_test_lag = DFS_X['lag_li'].values
y_test_lag = recategorize_y(DFS_Y['lag_li'])

In [22]:
# Refit a forest with the same hyper-parameters on the current X_train / y_train.
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

train_accuracy = accuracy_score(y_train, clf.predict(X_train))
print('Accuracy on train set: ', train_accuracy)

# NOTE(review): in the cell that builds these splits, X_test is assembled from
# sessions that are also in X_train, so this figure does not measure
# generalisation.
test_accuracy = accuracy_score(y_test, clf.predict(X_test))
print('Accuracy on test set: ', test_accuracy)

# Laguardia's data is the only split here that was not used for fitting.
lag_accuracy = accuracy_score(y_test_lag, clf.predict(X_test_lag))
print('Accuracy on laguardia set: ', lag_accuracy)

Accuracy on train set:  0.9998226321390564
Accuracy on test set:  1.0
Accuracy on laguardia set:  0.743841059602649


In [23]:
# Cross-tabulate actual vs. predicted categories for Laguardia's session.
ct = pd.crosstab(
    y_test_lag,
    clf.predict(X_test_lag),
    rownames=['Actual'],
    colnames=['Predicted'],
)

# Heatmap of the counts; update_layout returns the figure, so it can be chained.
fig = px.imshow(
    ct,
    labels={"x": "Predicted", "y": "Actual", "color": "Counts"},
    color_continuous_scale='Blues',
).update_layout(title='Confusion Matrix on Laguardia')
fig.show()

## Exporting the model

In [24]:
import pickle

# Serialise the fitted classifier next to the notebook.
# Only unpickle this file from a trusted location: loading a pickle executes code.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

## Testing the model with Almir data

In [25]:
# Almir's session is already loaded in the data cell as DFS['alm_cp'] (same
# CSV path), so reuse it and its derived feature/label views instead of
# reading and splitting the file a second time.
df_almir = DFS['alm_cp']

X_almir = DFS_X['alm_cp'].values           # features as a NumPy array
y_almir = recategorize_y(DFS_Y['alm_cp'])  # string pose codes -> category labels

# Almir appears in none of the training concats shown in this notebook.
clf.predict(X_almir)

array(['Sitting Correctly', 'Sitting Correctly', 'Sitting Correctly', ...,
       'Incorrectly Sitting', 'Incorrectly Sitting', 'Sitting Correctly'],
      dtype=object)

In [26]:
# Actual vs. predicted categories for Almir's session.
ct = pd.crosstab(
    y_almir,
    clf.predict(X_almir),
    rownames=['Actual'],
    colnames=['Predicted'],
)

# Heatmap of the counts; update_layout returns the figure, so it can be chained.
fig = px.imshow(
    ct,
    labels={"x": "Predicted", "y": "Actual", "color": "Counts"},
    color_continuous_scale='Blues',
).update_layout(title='Confusion Matrix on Almir')
fig.show()

In [27]:
# Fraction of Almir's samples whose predicted category equals the actual one.
(y_almir == clf.predict(X_almir)).mean()

0.700445142707515

## Confusion Matrix by pose

In [28]:
names = ['Laguardia', 'Vanessa', 'Bruno', 'Almir']
nicknames = ['lag', 'van', 'bru', 'alm']

# One heatmap per subject: raw integer pose code (rows) against the category
# predicted by the current `clf` (columns), using each subject's 'cp' session.
# NOTE(review): the 'van_cp' and 'bru_cp' sessions are part of the training
# concat used for the all-data model, so their matrices are in-sample.
for name, nickname in zip(names, nicknames):
    # Integer codes here (not the recategorized labels) so that each original
    # pose gets its own row.
    y_test = DFS_Y[nickname+'_cp'].astype(int)
    X_test = DFS_X[nickname+'_cp'].values

    y_pred = clf.predict(X_test)

    # Row axis is named 'Actual' (was 'True') to agree with the heatmap's
    # y-axis label and with the other confusion-matrix cells.
    ct = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])

    fig = px.imshow(
        ct,
        labels=dict(x="Predicted", y="Actual", color="Counts"),
        color_continuous_scale='Blues',
    )
    fig.update_layout(title=f'Confusion Matrix on {name}')
    fig.show()

    