##### Imports & Constants

In [None]:
import pathlib
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
import sklearn.model_selection as ms
import sklearn.linear_model as lm
import sklearn.ensemble as ske
import sklearn.metrics as skm

# Locations of the prepared datasets, relative to this notebook.
FINAL_DATA_PATH = pathlib.Path('..', 'data', 'final', 'final_data.csv')
PCA_DATA_PATH = pathlib.Path('..', 'data', 'final', 'pca_data.csv')

final_data = pd.read_csv(FINAL_DATA_PATH)
pca_data = pd.read_csv(PCA_DATA_PATH)

# Full feature set: every column of final_data except the 'position' target.
y = final_data['position']
X = final_data.drop(columns='position')

# PCA feature set: all columns of pca_data, paired with the same target column.
pca_X = pca_data
pca_y = final_data['position']

# Hold out 20% of each feature set as a test split (same seed for both).
X_train, X_test, y_train, y_test = ms.train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)
pca_X_train, pca_X_test, pca_y_train, pca_y_test = ms.train_test_split(
    pca_X, pca_y, test_size=0.2, random_state=42, shuffle=True
)

# Carve a validation split (20% of the remaining training rows) out of the
# full-feature training data only; the PCA training split is left untouched.
X_train, X_val, y_train, y_val = ms.train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, shuffle=True
)

# Number of feature columns in each training set.
_, X_train_features_count = X_train.shape
_, pca_X_train_features_count = pca_X_train.shape

##### Initial Model

In [None]:
# Target label encoding, as recorded by the notebook author:
#   0 -> ST
#   1 -> LW
#   2 -> RW
#   3 -> CM
#   4 -> RB
#   5 -> LB
#   6 -> CB

# Baseline: a 100-tree random forest fitted on the full feature set.
rf = ske.RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Metrics: classification report for the held-out test split.
report = skm.classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
# Pair each importance score of the fitted forest with its column name,
# then order the features from most to least important.
feature_importance = pd.Series(data=rf.feature_importances_, index=X.columns)
feature_importance = feature_importance.sort_values(ascending=False)

# Bar chart with the importance score on x and the feature name on y.
sns.barplot(x=feature_importance, y=feature_importance.index)

##### Initial Model (PCA)

In [None]:
# Target label encoding, as recorded by the notebook author:
#   0 -> ST
#   1 -> LW
#   2 -> RW
#   3 -> CM
#   4 -> RB
#   5 -> LB
#   6 -> CB

# Same 100-tree random forest as the baseline, fitted on the PCA features.
rf = ske.RandomForestClassifier(random_state=42, n_estimators=100).fit(pca_X_train, pca_y_train)
y_pred = rf.predict(pca_X_test)

# Metrics: classification report for the held-out PCA test split.
report = skm.classification_report(y_true=pca_y_test, y_pred=y_pred)
print(report)


In [None]:
# Pair each importance score of the PCA-trained forest with its column name,
# then order the columns from most to least important.
feature_importance = pd.Series(data=rf.feature_importances_, index=pca_X.columns)
feature_importance = feature_importance.sort_values(ascending=False)

# Bar chart with the importance score on x and the PCA column name on y.
sns.barplot(x=feature_importance, y=feature_importance.index)

In [None]:
# Logistic-regression baseline on the full (non-PCA) feature set.
lr = lm.LogisticRegression(random_state=42, max_iter=1000, solver='lbfgs')

lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Metrics
# Score against y_test, the labels that were split together with X_test.
# The previous code used pca_y_test, which only matched because both splits
# share the same seed and target column; any change to either split would
# have silently misaligned predictions and ground truth.
print(skm.classification_report(y_test, y_pred))

## TensorFlow softmax ANN

In [None]:
""" 
This model was tested with different numbers of layers and neurons per layer.
The best result was achieved with the following configuration:
    - 5 layers
    - 256 neurons in the first layer
    - 128 neurons in the second layer
    - 64 neurons in the third layer
    - 32 neurons in the fourth layer
    - 7 neurons in the fifth layer (softmax output)

Note: the code below additionally places a Dense layer as wide as the input
in front of these five layers.

The model was trained for 100 epochs with a batch size of 32.
"""


import tensorflow as tf

# Reuse a previously saved model when one can be loaded; otherwise train a
# new one below.
model = None
try:
    model = tf.keras.models.load_model('model.h5')
except (OSError, ValueError) as load_error:
    # Missing, unreadable or incompatible file: fall back to training from
    # scratch, but report why instead of silently swallowing the error
    # (a bare `except` would also have hidden KeyboardInterrupt).
    print(f"Could not load 'model.h5' ({load_error}); training a new model.")

# Explicit None check: do not rely on the truthiness of a Keras model object.
if model is None:

    input_size = X_train_features_count

    # Fully connected network: an input-width layer, four progressively
    # narrower hidden layers, and a 7-way softmax output.
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(input_size, activation='relu', input_shape=(input_size,)),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(7, activation='softmax')
    ])

    # compile model
    # The target is a single column of integer class ids (0-6 according to
    # the label mapping noted earlier in this notebook), not one-hot vectors,
    # so the sparse variant of the cross-entropy loss is required;
    # 'categorical_crossentropy' expects one-hot targets shaped like the
    # 7-unit output.
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # train model (author's note: 160000 rows and 24 columns)
    model.fit(
        X_train,
        y_train,
        epochs=100,
        batch_size=32,
        validation_data=(X_val, y_val)
    )

# evaluate model on the held-out test split
# NOTE(review): a model loaded from an older 'model.h5' keeps the loss it was
# compiled with at save time - confirm it matches the integer labels.
model.evaluate(X_test, y_test)

In [None]:
# Run the network on the held-out test features; the softmax output layer
# yields one score per class for every row.
PREVIEW_ROWS = 10
y_pred = model.predict(X_test)

# Show the per-class scores of the first few test rows.
y_pred[:PREVIEW_ROWS]

In [None]:
# Describe the model: print its layer-by-layer summary.
model.summary()

# Persist the model so that the loading cell can reuse it on the next run.
model_path = 'model.h5'
model.save(model_path)