In [12]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
import tensorflow as tf
from tensorflow import keras
from joblib import dump, load
from preprocessing import get_features, time_to_seconds


%matplotlib inline

In [2]:
# Load the pre-computed arrays for the 18.01 recording.
# os.path.join replaces the hard-coded 'processed\...' literals: '\B' and '\G'
# are invalid escape sequences in a normal string literal, and a backslash
# separator only resolves on Windows.
# NOTE(review): the files carry a .txt suffix but are read with np.load in
# binary mode — presumably they were written with np.save; confirm.
with open(os.path.join('processed', 'BadTracksHackaton1801.txt'), 'rb') as f:
    bad1801 = np.load(f)
with open(os.path.join('processed', 'GoodTracksHackaton1801.txt'), 'rb') as f:
    good1801 = np.load(f)
# Bad tracks are stacked first, then good ones; labels follow the same order
# (1 for rows from the "bad" file, 0 for rows from the "good" file).
all1801 = np.vstack((bad1801, good1801))
labels1801 = np.hstack((np.ones(bad1801.shape[0]), np.zeros(good1801.shape[0])))

In [3]:
# Load the pre-computed arrays for the 23.01 recording.
# os.path.join replaces the hard-coded 'processed\...' literals: '\B' and '\G'
# are invalid escape sequences in a normal string literal, and a backslash
# separator only resolves on Windows.
# NOTE(review): the files carry a .txt suffix but are read with np.load in
# binary mode — presumably they were written with np.save; confirm.
with open(os.path.join('processed', 'BadTracksHackaton2301.txt'), 'rb') as f:
    bad2301 = np.load(f)
with open(os.path.join('processed', 'GoodTracksHackaton2301.txt'), 'rb') as f:
    good2301 = np.load(f)
# Bad tracks are stacked first, then good ones; labels follow the same order
# (1 for rows from the "bad" file, 0 for rows from the "good" file).
all2301 = np.vstack((bad2301, good2301))
labels2301 = np.hstack((np.ones(bad2301.shape[0]), np.zeros(good2301.shape[0])))

In [4]:
# Load the pre-computed arrays for the 21.01 recording.
# os.path.join replaces the hard-coded 'processed\...' literals: '\B' and '\G'
# are invalid escape sequences in a normal string literal, and a backslash
# separator only resolves on Windows.
# NOTE(review): the files carry a .txt suffix but are read with np.load in
# binary mode — presumably they were written with np.save; confirm.
with open(os.path.join('processed', 'BadTracksHackaton2101.txt'), 'rb') as f:
    bad2101 = np.load(f)
with open(os.path.join('processed', 'GoodTracksHackaton2101.txt'), 'rb') as f:
    good2101 = np.load(f)
# Bad tracks are stacked first, then good ones; labels follow the same order
# (1 for rows from the "bad" file, 0 for rows from the "good" file).
all2101 = np.vstack((bad2101, good2101))
labels2101 = np.hstack((np.ones(bad2101.shape[0]), np.zeros(good2101.shape[0])))

In [9]:
# Pool the three recordings (18.01, 21.01, 23.01) row-wise, together with
# their label vectors in the same order.
all_data = np.vstack([all1801, all2101, all2301])
all_labels = np.hstack([labels1801, labels2101, labels2301])

# Per-class pools over the same recordings.
all_bad = np.vstack([bad1801, bad2101, bad2301])
all_good = np.vstack([good1801, good2101, good2301])

# Constant label vectors for the per-class pools: ones for bad, zeros for good.
all_bad_labels = np.ones(len(all_bad))
all_good_labels = np.zeros(len(all_good))

In [7]:
# Read the raw test log: space-separated, no header row, columns named here.
# os.path.join replaces the Windows-only r'test\test.txt' literal so the cell
# also runs on platforms whose path separator is '/'.
data = pd.read_csv(
    os.path.join('test', 'test.txt'),
    sep=' ',
    header=None,
    names=['time', 'id', 'latitude', 'longitude', 'elevation', 'code', 'name'],
)
# Convert the time column with the project helper imported from preprocessing.
data['time'] = data['time'].apply(time_to_seconds)

# Materialise the groupby as a list of (id, sub-frame) pairs, the form that is
# handed to get_features below.
grouped = data.groupby('id')
grouped_df = list(grouped)

test_features = get_features(grouped_df)

In [11]:
# Sanity check: the type of the number of distinct ids in the test log.
type(len(data['id'].unique()))

int

In [39]:
# Hold out 30% of the 21.01 recording for evaluation, stratified on the labels
# so both parts keep the same bad/good ratio.
# random_state pins the shuffle: without it the split, and every score computed
# from it, changes on each run. Outputs recorded before this seed was added
# will not match exactly.
X_train, X_test, y_train, y_test = train_test_split(
    all2101,
    labels2101,
    test_size=0.3,
    shuffle=True,
    stratify=labels2101,
    random_state=42,
)

## Regular tree

In [7]:
# Baseline: a single decision tree with default hyper-parameters.
# random_state makes the fitted tree reproducible; scikit-learn permutes the
# candidate features at each split, so ties can otherwise resolve differently
# between runs.
tree = DecisionTreeClassifier(random_state=42)
tree = tree.fit(X_train, y_train)

In [8]:
# Hold-out accuracy of the single tree (a classifier's score() is mean accuracy).
tree.score(X_test, y_test)

0.8569512740277156

## Boosted tree

In [9]:
# Gradient-boosted trees: 200 estimators of depth 5 at learning rate 0.1.
# NOTE: despite the variable name this is scikit-learn's
# GradientBoostingClassifier, not the XGBoost library; the name is kept because
# later cells refer to it.
# random_state makes the fitted ensemble reproducible across runs.
xgboost = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
xgboost = xgboost.fit(X_train, y_train)

In [10]:
# Hold-out accuracy of the boosted model (a classifier's score() is mean accuracy).
xgboost.score(X_test, y_test)

0.8877961555654895

In [6]:
# Hyper-parameter search for the gradient-boosted model, cached on disk.
# Previously the search itself was commented out and the cell could only load
# 'models/xgboost1.joblib', so it failed on any checkout without that file.
# The saved search is still reused when present; otherwise it is recomputed
# from X_train / y_train and written to the same path.
model_path = 'models/xgboost1.joblib'
if os.path.exists(model_path):
    # NOTE: joblib.load unpickles arbitrary objects — only load trusted files.
    best_xgboost = load(model_path)
else:
    parameters = {'learning_rate': np.linspace(0.05, 1, 20),
                  'n_estimators': [190, 200],
                  'subsample': np.linspace(0.1, 1., 10),
                  'max_depth': [5]
                 }
    # Exhaustive search over the grid above (20 * 2 * 10 * 1 combinations).
    best_xgboost = GridSearchCV(GradientBoostingClassifier(), parameters, n_jobs=16, verbose=4)
    best_xgboost.fit(X_train, y_train)
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    dump(best_xgboost, model_path)

In [8]:
# Display the hyper-parameter combination stored on the loaded search object.
best_xgboost.best_params_

{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 190, 'subsample': 0.4}

In [9]:
# Predict a label for every row of the pooled data; the array is the cell output.
best_xgboost.predict(all_data)

array([1., 0., 1., ..., 0., 0., 0.])

In [199]:
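# Persist the fitted search object for later sessions.
# Left disabled — presumably so the saved model is not overwritten; confirm.
# dump(best_xgboost, 'models/xgboost1.joblib')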

['models/xgboost1.joblib']

In [15]:
# Overall accuracy on the pooled data, as a percentage with two decimals.
# Scaling before rounding avoids float artefacts that "round(x, 4) * 100" can
# reintroduce (e.g. 0.57 * 100 == 56.99999999999999).
# NOTE(review): all_data contains the 21.01 recording that X_train is drawn
# from, so if the model was fit on X_train this is not a pure hold-out score.
round(accuracy_score(all_labels, best_xgboost.predict(all_data)) * 100, 2)

93.27

In [16]:
# Accuracy restricted to the bad tracks, as a percentage with two decimals.
# Every label here is 1, so this equals the recall of class 1.
# Scaling before rounding avoids float artefacts that "round(x, 4) * 100" can
# reintroduce (e.g. 0.57 * 100 == 56.99999999999999).
round(accuracy_score(all_bad_labels, best_xgboost.predict(all_bad)) * 100, 2)

92.05

In [17]:
# Accuracy restricted to the good tracks, as a percentage with two decimals.
# Every label here is 0, so this is the share of good tracks predicted as 0.
# Scaling before rounding avoids float artefacts that "round(x, 4) * 100" can
# reintroduce (e.g. 0.57 * 100 == 56.99999999999999).
round(accuracy_score(all_good_labels, best_xgboost.predict(all_good)) * 100, 2)

94.43

In [18]:
# Precision for class 1 (bad tracks) on the pooled data, as a percentage with
# two decimals.
# Scaling before rounding avoids float artefacts that "round(x, 4) * 100" can
# reintroduce (e.g. 0.57 * 100 == 56.99999999999999).
round(precision_score(all_labels, best_xgboost.predict(all_data)) * 100, 2)

93.99

In [19]:
# Recall for class 1 (bad tracks) on the pooled data, as a percentage with two
# decimals.
# Scaling before rounding avoids float artefacts that "round(x, 4) * 100" can
# reintroduce (e.g. 0.57 * 100 == 56.99999999999999).
round(recall_score(all_labels, best_xgboost.predict(all_data)) * 100, 2)

92.05

## Neural network

In [171]:
# Small fully-connected classifier over 7 input features with 2 output classes.
# Fixes relative to the previous version of this cell:
#   * a missing comma after GaussianDropout(0.15) made the layer list a
#     SyntaxError, so the cell could not run as written;
#   * InputLayer received (7), which is the int 7 rather than a shape tuple;
#     it now receives (7,).
# NOTE(review): the recorded training log starts at a loss of ~387, which
# suggests the input features are not scaled — consider normalising them.
model = keras.Sequential([
    keras.layers.InputLayer((7,)),
    # L2 weight penalty on the first hidden layer only.
    keras.layers.Dense(10, activation='relu', kernel_regularizer=keras.regularizers.L2(0.1)),
    keras.layers.GaussianDropout(0.15),
    keras.layers.Dense(5, activation='relu'),
    # Raw logits for the two classes; the loss below uses from_logits=True.
    keras.layers.Dense(2),
])
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)


In [172]:
# Print the layer-by-layer summary of the compiled model.
# NOTE(review): the recorded output below lists only three Dense layers and no
# GaussianDropout layer, so it appears to come from an earlier version of the
# model cell — re-run to refresh it.
model.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_31 (Dense)             (None, 10)                80        
_________________________________________________________________
dense_32 (Dense)             (None, 5)                 55        
_________________________________________________________________
dense_33 (Dense)             (None, 2)                 12        
Total params: 147
Trainable params: 147
Non-trainable params: 0
_________________________________________________________________


In [173]:
# Train for 200 epochs in mini-batches of 256, evaluating on the held-out split
# after every epoch; the returned History object is the cell's output.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    batch_size=256,
    epochs=200,
    verbose=2,
)

Epoch 1/200
18/18 - 0s - loss: 386.9106 - accuracy: 0.6263 - val_loss: 324.3986 - val_accuracy: 0.6424
Epoch 2/200
18/18 - 0s - loss: 296.2308 - accuracy: 0.6708 - val_loss: 244.4731 - val_accuracy: 0.6696
Epoch 3/200
18/18 - 0s - loss: 222.3931 - accuracy: 0.6888 - val_loss: 178.6224 - val_accuracy: 0.7076
Epoch 4/200
18/18 - 0s - loss: 157.6893 - accuracy: 0.7318 - val_loss: 113.3883 - val_accuracy: 0.7631
Epoch 5/200
18/18 - 0s - loss: 65.2263 - accuracy: 0.7410 - val_loss: 26.2807 - val_accuracy: 0.5306
Epoch 6/200
18/18 - 0s - loss: 32.3495 - accuracy: 0.6880 - val_loss: 36.6195 - val_accuracy: 0.7068
Epoch 7/200
18/18 - 0s - loss: 36.7954 - accuracy: 0.6926 - val_loss: 32.1293 - val_accuracy: 0.6817
Epoch 8/200
18/18 - 0s - loss: 28.6141 - accuracy: 0.6774 - val_loss: 21.4981 - val_accuracy: 0.6737
Epoch 9/200
18/18 - 0s - loss: 17.7006 - accuracy: 0.6437 - val_loss: 9.8223 - val_accuracy: 0.6285
Epoch 10/200
18/18 - 0s - loss: 6.3003 - accuracy: 0.5941 - val_loss: 0.9930 - val_a

<tensorflow.python.keras.callbacks.History at 0x1b975215d00>