In [None]:
import random
import warnings

import numpy as np
import pandas as pd
import tqdm.notebook as tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import MatplotlibDeprecationWarning

# Keep matplotlib's deprecation notices out of the plotting cells' output.
warnings.simplefilter("ignore", category=MatplotlibDeprecationWarning)

# Notebook-wide configuration.
SEED = 0  # handed to `random.seed` before each sampling step
DF_PATH = (
    '../input/classification-of-robots-from-their-conversation/'
    'Classification of Robots from their conversation sequence.csv'
)
# Names of the ten time-step columns: num1 ... num10.
T_COLS = ["num" + str(step) for step in range(1, 11)]

%matplotlib inline

In [None]:
# Load the raw table and give every consecutive run of five rows a shared
# `kind` number (row positions 0-4 -> 0, 5-9 -> 1, ...).
df = pd.read_csv(DF_PATH)
df['kind'] = np.arange(len(df), dtype='int64') // 5
df.head(6)

In [None]:
# Summary statistics of every column, computed separately for each `source`.
rows_by_source = df.groupby('source')
rows_by_source.describe()

In [None]:
# Numerical gradient of each row along the time axis (num1 -> num10),
# summarised separately for each `source`.
time_gradients = np.gradient(df[T_COLS].to_numpy(), axis=1)
grad_df = pd.DataFrame(time_gradients, columns=['grad_' + col for col in T_COLS])
grad_df['source'] = df['source']
grad_df.groupby('source').describe()

In [None]:
# There is a lot of data - keep only a small random subset of `kind` groups.
random.seed(SEED)
SUB_SAMPLE_FRAC = 0.001

kinds = df.kind.unique()

# Draw WITHOUT replacement: `random.choices` samples with replacement, so it can
# return the same kind several times and silently shrink the subset below k.
# At least one kind is always kept so the frame cannot end up empty on small inputs.
k = min(len(kinds), max(1, int(len(kinds) * SUB_SAMPLE_FRAC)))
kinds = random.sample(kinds.tolist(), k=k)

df = df[df.kind.isin(kinds)]
df.head(6)

In [None]:
# Train/test split on whole `kind` groups, so rows of one kind never end up on
# both sides of the split.
TEST_SIZE = 0.2
random.seed(SEED)

kinds = df.kind.unique()
k = int(len(kinds) * TEST_SIZE)
# Sample without replacement: `random.choices` can pick the same kind twice,
# which makes the test set smaller than TEST_SIZE asks for.
test_kinds = random.sample(kinds.tolist(), k=k)
train_kinds = kinds[~np.isin(kinds, test_kinds)]

# Split and shuffle (sample(frac=1)); the shuffle is seeded so a fresh
# Restart & Run All reproduces the same row order.
df_train = df[df.kind.isin(train_kinds)].sample(frac=1, random_state=SEED).copy()
df_test = df[df.kind.isin(test_kinds)].sample(frac=1, random_state=SEED).copy()

# Every row lands in exactly one of the two sets.
assert len(df) == len(df_train) + len(df_test)
assert not set(df_train.kind).intersection(df_test.kind)

In [None]:
# Pick up to five DISTINCT training kinds to draw. `random.sample` draws without
# replacement; `random.choices` could repeat a kind and show fewer examples.
N_EXAMPLES = 5
train_kind_ids = df_train.kind.unique().tolist()
rnd_kinds = random.sample(train_kind_ids, k=min(N_EXAMPLES, len(train_kind_ids)))

# Long format: one row per (source, kind, time step).
temp_df = pd.melt(df_train, id_vars=['source', 'kind'], value_vars=T_COLS, var_name='time')

# One panel per robot; this assumes `source` takes the values 0..4 - TODO confirm.
fig, ax = plt.subplots(1, 5, figsize=(25, 5))
for i, axx in enumerate(ax):
    for kind, gp in temp_df[(temp_df.source == i) & (temp_df.kind.isin(rnd_kinds))].groupby('kind'):
        gp.plot(x='time', y='value', ax=axx, label=kind, title=f'Robot {i}, by example.', sharex=True)

plt.show()
plt.close()
del temp_df

Distinguishing between 0 vs (1 or 3) vs (2 or 4) seems easy, but distinguishing 1 from 3, or 2 from 4, is much harder.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every time-step column. The scaler is fitted on the training rows
# only and then re-used for the test rows, so no test information leaks into it.
zscaler = StandardScaler()
zscaler.fit(df_train[T_COLS])

df_train[T_COLS] = zscaler.transform(df_train[T_COLS])
df_test[T_COLS] = zscaler.transform(df_test[T_COLS])

In [None]:
# Draw the previously chosen example kinds (`rnd_kinds`) again, one panel per
# robot, now using the standardised training values.
temp_df = df_train.melt(id_vars=['source', 'kind'], value_vars=T_COLS, var_name='time')
chosen_rows = temp_df[temp_df.kind.isin(rnd_kinds)]

fig, ax = plt.subplots(1, 5, figsize=(25, 5))
for robot, panel in enumerate(ax):
    robot_rows = chosen_rows[chosen_rows.source == robot]
    for kind, series in robot_rows.groupby('kind'):
        series.plot(x='time', y='value', ax=panel, label=kind, title=f'Robot {robot}, by example.', sharex=True)

plt.show()
plt.close()
del temp_df

# Feature extraction w/ tsfresh

Extract features with tsfresh, train model.

In [None]:
# Prepare long-format frames for tsfresh.
#
# Every wide row is one time series of ten steps, so the series id has to be
# assigned BEFORE melting. `pd.melt` builds a fresh 0..N-1 index, so taking the id
# from the melted frame's index gives each single observation its own id: tsfresh
# then sees ten length-1 "series" per row and cannot compute temporal features.
def _to_tsfresh_long(wide_df):
    """Convert a wide frame into tsfresh's long format.

    Parameters
    ----------
    wide_df : DataFrame with the T_COLS time-step columns and a `source` column.

    Returns
    -------
    (long_df, labels) : `long_df` has the columns id / time / value, one row per
        observation; `labels` is the `source` column indexed by the same id.
    """
    wide = wide_df.reset_index(drop=True)  # new frame, the caller's frame is untouched
    wide['id'] = wide.index
    long_df = pd.melt(wide, id_vars=['id', 'source'], value_vars=T_COLS, var_name='time')
    # 'num7' -> 7, so that tsfresh can sort the observations chronologically.
    long_df['time'] = long_df['time'].str.replace('num', '', regex=False).astype(int)
    # We want to predict 'source', so it is returned separately and dropped from the features.
    labels = wide['source'].copy()
    return long_df.drop(columns=['source']), labels


ts_train, y_train = _to_tsfresh_long(df_train)
ts_test, y_test = _to_tsfresh_long(df_test)

print(y_train.values[:12])
ts_train.head(12)

In [None]:
import tsfresh
from tsfresh import extract_relevant_features
from tsfresh.feature_extraction import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters

# Comprehensive feature catalogue; MinimalFCParameters() is the alternative
# setting imported alongside it.
extraction_settings = ComprehensiveFCParameters()

# Feature extraction on the training series, with `y_train` as the target that
# `extract_relevant_features` uses.
relevant_kwargs = dict(
    column_id='id',
    column_sort='time',
    default_fc_parameters=extraction_settings,
)
x_train = extract_relevant_features(ts_train, y_train, **relevant_kwargs)

x_train.head(5)

In [None]:
# Pairwise correlations between the extracted features and the target `y`.
corr = x_train.assign(y=y_train).corr()
tick_labels = corr.columns

plt.figure(figsize=(10, 10))
sns.heatmap(corr, xticklabels=tick_labels, yticklabels=tick_labels)
plt.show()

# Features ranked by the absolute value of their correlation with the target.
target_corr = corr['y'].drop('y')
target_corr.abs().sort_values(ascending=False)

In [None]:
from tsfresh.utilities.dataframe_functions import impute

# Compute on the test series exactly the features that are present in x_train.
train_parameters = tsfresh.feature_extraction.settings.from_columns(x_train)
x_test = extract_features(
    ts_test,
    column_id='id',
    column_sort='time',
    kind_to_fc_parameters=train_parameters,
    # `extract_relevant_features` imputes NaN/inf in the training features;
    # do the same here so the classifiers do not receive NaN/inf at predict time.
    impute_function=impute,
)
# Same column order as in training: scikit-learn matches DataFrame feature names
# (and their order) between fit and predict.
x_test = x_test[x_train.columns]
x_test.head(2)

In [None]:
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.metrics import classification_report as cr, confusion_matrix as cm

from xgboost import XGBClassifier

# The wrapped estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was later
# removed, while the first positional slot works with both old and new releases.
rf = RandomForestClassifier(random_state=0)
rf_bagged = BaggingClassifier(RandomForestClassifier(), random_state=0)
xgbc = XGBClassifier(use_label_encoder=False)

# Fit each model on the tsfresh features and report its test-set performance.
for title, clf in (("RF", rf), ("RF Bagged", rf_bagged), ("XGBC", xgbc)):
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(title)
    print(cr(y_pred=y_pred, y_true=y_test))
    print(cm(y_pred=y_pred, y_true=y_test))

As expected, the model is good at predicting 0 vs (1 or 3) vs (2 or 4), but it often confuses 1 with 3 and 2 with 4.

Indeed, the Kolmogorov–Smirnov test shows that robots 1 and 3 are very likely drawn from the same distribution (high p-value for every column). Robots 2 and 4, however, are different.

https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test

In [None]:
from scipy.stats import ks_2samp

# Reload the complete, unscaled data set for the tests.
df = pd.read_csv(DF_PATH)

# Two-sample Kolmogorov-Smirnov test, column by column, for both pairs of
# robots that the classifiers tend to mix up.
for header, (robot_a, robot_b) in (("Cols 1 vs 3", (1, 3)), ("\nCols 2 vs 4", (2, 4))):
    print(header)
    rows_a = df[df.source == robot_a]
    rows_b = df[df.source == robot_b]
    for col in T_COLS:
        print(ks_2samp(rows_a[col].values, rows_b[col].values))

In [None]:
# Repeat the experiment without robot 3; robot 4 is relabelled as 3 so that the
# class labels stay contiguous.
#
# The filtered data is stored under its own names instead of overwriting
# x_train / y_train / x_test / y_test: with the in-place version a second run of
# this cell would also delete the rows that had just been relabelled from 4 to 3.
keep_train = y_train != 3
keep_test = y_test != 3

x_train_4c = x_train[keep_train]
x_test_4c = x_test[keep_test]
y_train_4c = y_train[keep_train].replace(4, 3)
y_test_4c = y_test[keep_test].replace(4, 3)

# The wrapped estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2.
rf = RandomForestClassifier(random_state=0)
rf_bagged = BaggingClassifier(RandomForestClassifier(), random_state=0)
xgbc = XGBClassifier(use_label_encoder=False)

for title, clf in (("RF", rf), ("RF Bagged", rf_bagged), ("XGBC", xgbc)):
    clf.fit(x_train_4c, y_train_4c)
    y_pred = clf.predict(x_test_4c)
    print(title)
    print(cr(y_pred=y_pred, y_true=y_test_4c))
    print(cm(y_pred=y_pred, y_true=y_test_4c))

# FCN

https://keras.io/examples/timeseries/timeseries_classification_from_scratch/

In [None]:
from tensorflow import keras

def make_model(input_shape, num_classes, num_layers=1, filters=64, kernel_size=3, dropout_rate=0.1):
    """Build a small 1-D convolutional classifier (see https://arxiv.org/abs/1611.06455).

    Parameters
    ----------
    input_shape : shape of one sample, passed to `keras.layers.Input`.
    num_classes : number of units of the final softmax layer.
    num_layers : number of Conv1D -> BatchNormalization -> ReLU blocks.
    filters : number of filters of every Conv1D layer.
    kernel_size : kernel width of every Conv1D layer.
    dropout_rate : rate of the Dropout layer placed after the last block.

    Returns
    -------
    An uncompiled `keras.models.Model` mapping a batch of samples to class
    probabilities.
    """
    input_layer = x = keras.layers.Input(input_shape)

    for _ in range(num_layers):
        # padding="same" keeps the length of the time axis unchanged.
        x = keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, padding="same")(x)
        x = keras.layers.BatchNormalization()(x)
        x = keras.layers.ReLU()(x)

    x = keras.layers.Dropout(dropout_rate)(x)
    # Average over the time axis, leaving one value per filter.
    x = keras.layers.GlobalAveragePooling1D()(x)

    output_layer = keras.layers.Dense(num_classes, activation="softmax")(x)

    return keras.models.Model(inputs=input_layer, outputs=output_layer)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

TEST_SIZE = 0.2

# Start again from the raw CSV. Robot 3 is left out (the KS tests in this notebook
# suggest it matches robot 1) and class 4 is renamed to class 3, so the remaining
# labels stay contiguous for the sparse categorical cross-entropy loss.
df = pd.read_csv(DF_PATH)
df = df[df.source != 3]
df.loc[df.source == 4, 'source'] = 3

x, y = df[T_COLS].to_numpy(), df['source'].to_numpy()
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=TEST_SIZE,
    random_state=0,
    shuffle=True,
    stratify=y,
)

# The scaler is fitted on the training part only and re-used on the test part,
# so that no test information leaks into it.
zscaler = StandardScaler()
zscaler.fit(x_train)

# Scale, then append a trailing axis of length 1: (samples, steps) -> (samples, steps, 1).
x_train = zscaler.transform(x_train)[..., np.newaxis]
x_test = zscaler.transform(x_test)[..., np.newaxis]

In [None]:
# Train
LR = 1e-3
EPOCHS = 500
BATCH_SIZE = 128

# All three callbacks watch the validation loss.
callbacks = [
    # Keep only the best model seen so far on disk.
    keras.callbacks.ModelCheckpoint("best_model.h5", save_best_only=True, monitor="val_loss", verbose=1),
    # Halve the learning rate after 20 epochs without improvement.
    keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=20, min_lr=1e-7, verbose=1),
    # Stop training after 50 epochs without improvement.
    keras.callbacks.EarlyStopping(monitor="val_loss", patience=50, verbose=1),
]

model = make_model(input_shape=x_train.shape[1:], num_classes=len(np.unique(y_train)))

model.compile(
    optimizer=keras.optimizers.Adam(LR),
    loss="sparse_categorical_crossentropy",
    metrics=["sparse_categorical_accuracy"],
)

# `summary()` prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

history = model.fit(
    x_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=callbacks,
    validation_split=0.2,  # hold out 20% of the training data for the val_* metrics
    verbose=1,
)

In [None]:
from sklearn.metrics import classification_report as cr, confusion_matrix as cm

# Evaluate the checkpoint written by ModelCheckpoint (best validation loss),
# not the weights of the last training epoch.
model = keras.models.load_model("best_model.h5")

test_loss, test_acc = model.evaluate(x_test, y_test)
print("Test accuracy", test_acc)
print("Test loss", test_loss)

# Turn the predicted class probabilities into the most likely class label.
class_probabilities = model.predict(x_test)
y_pred = class_probabilities.argmax(axis=1)
for report in (cr, cm):
    print(report(y_pred=y_pred, y_true=y_test))

In [None]:
# Training and validation curves, one panel per tracked metric.
metrics = ["sparse_categorical_accuracy", "loss"]
fig, axs = plt.subplots(1, len(metrics), figsize=(16, 16/len(metrics)))
for ax, metric in zip(axs, metrics):
    train_curve = history.history[metric]
    val_curve = history.history["val_" + metric]
    ax.plot(train_curve)
    ax.plot(val_curve)
    ax.set_title("model " + metric)
    ax.set(ylabel=metric, xlabel='epoch')
    ax.legend(["train", "val"], loc="best")

plt.show()

After deleting robot 3, this model solves this task quite easily.