In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [3]:
from sktime.classification.compose import (
    ColumnEnsembleClassifier,
    TimeSeriesForestClassifier,
)
from sktime.classification.dictionary_based import BOSSEnsemble
from sktime.transformations.panel.compose import ColumnConcatenator
from sktime.transformations.panel.tsfresh import TSFreshFeatureExtractor

from catboost import CatBoostClassifier

In [90]:
import os
files = []
for dirname, _, filenames in os.walk('../data/raw/second_version/data_train/data_train/'):
    for filename in filenames:
        files.append(os.path.join(dirname, filename))

In [91]:
ordered_list_of_files = [file.split('/')[-1] for file in files]

In [92]:
y = pd.read_csv('../data/raw/second_version/train.csv', 
                index_col='id').loc[ordered_list_of_files].values.ravel()

In [93]:
list_of_dfs = []
for file in files:
    list_of_cols = [pd.Series(col) for col in pd.read_csv(file).T.values]
    list_of_cols.append(pd.read_csv(file).T.values[3] / 
                        pd.read_csv(file).T.values[2])
    list_of_cols.append(pd.read_csv(file).T.values[3] / 
                        pd.read_csv(file).T.values[1])
    list_of_cols.append(pd.read_csv(file).T.values[3] / 
                        pd.read_csv(file).T.values[0])
    list_of_dfs.append(list_of_cols)

In [94]:
X = pd.DataFrame(list_of_dfs)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1575, 7) (1575,) (525, 7) (525,)


### Time series concatenation + TimeSeriesForestClassifier

In [77]:
# , class_weight=dict(zip([1,2,3,4],(pd.Series(y).value_counts()/len(y)).values))

In [79]:
steps = [
    ("concatenate", ColumnConcatenator()),
    ("classify", TimeSeriesForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)),
]
clf = Pipeline(steps)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9695238095238096

In [41]:
# y_pred = clf.predict(X_test)

# from sklearn.metrics import classification_report
# print('\nClassification Report\n')
# print(classification_report(y_test, y_pred, target_names=['Class 1', 'Class 2', 'Class 3', 'Class 4']))

### Time series concatenation + ColumnEnsembleClassifier

In [89]:
steps = [
    ("concatenate", ColumnConcatenator()),
    ("classify", ColumnEnsembleClassifier()),
]
clf = Pipeline(steps)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

TypeError: __init__() missing 1 required positional argument: 'estimators'

### Column ensembling

In [231]:
clf = ColumnEnsembleClassifier(
    estimators=[
        ("TSF0", TimeSeriesForestClassifier(n_estimators=100), [0]),
        ("BOSSEnsemble3", BOSSEnsemble(max_ensemble_size=3), [3]),
    ]
)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

KeyboardInterrupt: 

### RISE

In [203]:
steps = [
    ("concatenate", ColumnConcatenator()),
    ("classify", RandomIntervalSpectralForest(random_state=42, n_jobs=-1)),
]
clf = Pipeline(steps)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8704761904761905

### Feature extraction with TSFRESH and random forest

In [215]:
clf = make_pipeline(
    TSFreshFeatureExtractor(show_warnings=False), 
    RandomForestClassifier(n_jobs=-1, min_samples_split=10, random_state=42, class_weight=dict(zip([1,2,3,4],(pd.Series(y).value_counts()/len(y)).values)))
)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

Feature Extraction: 100%|██████████| 5/5 [08:18<00:00, 99.78s/it] 
Feature Extraction: 100%|██████████| 5/5 [02:43<00:00, 32.63s/it]


0.9142857142857143

### Feature extraction with TSFRESH and catboost

In [95]:
# transformer = TSFreshFeatureExtractor(n_jobs=-1)
# extracted_features = transformer.fit_transform(X_train)
# extracted_features.head()

In [96]:
clf = make_pipeline(
    TSFreshFeatureExtractor(n_jobs=-1, show_warnings=False), 
    CatBoostClassifier(random_seed=42, 
                       verbose=False,)
)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

Feature Extraction: 100%|██████████| 70/70 [03:33<00:00,  3.05s/it]
Feature Extraction: 100%|██████████| 70/70 [01:13<00:00,  1.05s/it]


0.9676190476190476

### Feature extraction with TSFRESH and xgboost

In [96]:
clf = make_pipeline(
    TSFreshFeatureExtractor(n_jobs=-1, show_warnings=False), 
    XG(random_seed=42, 
                       verbose=False,)
)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

Feature Extraction: 100%|██████████| 70/70 [03:33<00:00,  3.05s/it]
Feature Extraction: 100%|██████████| 70/70 [01:13<00:00,  1.05s/it]


0.9676190476190476

### Feature extraction with TSFRESH and catboost

In [95]:
# transformer = TSFreshFeatureExtractor(n_jobs=-1)
# extracted_features = transformer.fit_transform(X_train)
# extracted_features.head()

In [96]:
clf = make_pipeline(
    TSFreshFeatureExtractor(n_jobs=-1, show_warnings=False), 
    CatBoostClassifier(random_seed=42, 
                       verbose=False,)
)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

Feature Extraction: 100%|██████████| 70/70 [03:33<00:00,  3.05s/it]
Feature Extraction: 100%|██████████| 70/70 [01:13<00:00,  1.05s/it]


0.9676190476190476

## Prediction for test set

In [97]:
import os
files = []
for dirname, _, filenames in os.walk('../data/raw/second_version/data_test/data_test/'):
    for filename in filenames:
        files.append(os.path.join(dirname, filename))

In [98]:
files.sort()

In [99]:
list_of_dfs = []
for file in files:
    list_of_cols = [pd.Series(col) for col in pd.read_csv(file).T.values]
    list_of_cols.append(pd.read_csv(file).T.values[3] / 
                        pd.read_csv(file).T.values[2])
    list_of_cols.append(pd.read_csv(file).T.values[3] / 
                        pd.read_csv(file).T.values[1])
    list_of_cols.append(pd.read_csv(file).T.values[3] / 
                        pd.read_csv(file).T.values[0])
    list_of_dfs.append(list_of_cols)

In [100]:
X_val = pd.DataFrame(list_of_dfs)

In [101]:
y_val = clf.predict(X_val)

Feature Extraction: 100%|██████████| 70/70 [01:41<00:00,  1.46s/it]


In [102]:
submission_df = pd.DataFrame({'id':[file.split('/')[-1] for file in files],
 'category':y_val[:,0]
}).set_index('id')

In [103]:
submission_df.head()

Unnamed: 0_level_0,category
id,Unnamed: 1_level_1
2_trans_1.csv,3
2_trans_10.csv,1
2_trans_1003.csv,1
2_trans_1004.csv,3
2_trans_1005.csv,1


In [104]:
submission_df.to_csv('../../submission10.csv')