In [1]:
import pandas as pd
import numpy as np
import os

from sktime.transformations.panel.compose import ColumnConcatenator
from sktime.classification.compose import ColumnEnsembleClassifier
from sktime.classification.interval_based import TimeSeriesForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import (plot_roc_curve, classification_report, roc_auc_score, average_precision_score,
confusion_matrix, plot_confusion_matrix, plot_precision_recall_curve)

In [2]:
# pip install sktime --user

In [26]:
# Read the four pre-split tables and discard the 'Unnamed: 0' column from each
# (presumably the row index that was saved with the CSV -- verify against the
# export step).
# NOTE(review): `path` is an absolute, machine-specific location; the notebook
# only runs where this directory exists.
path = '/Users/shaow/Desktop/DS-GA1018/Project/'

X_train = pd.read_csv(os.path.join(path, "train_test_data/X_train.csv")).drop(columns='Unnamed: 0')
X_test = pd.read_csv(os.path.join(path, "train_test_data/X_test.csv")).drop(columns='Unnamed: 0')
y_train = pd.read_csv(os.path.join(path, "train_test_data/y_train.csv")).drop(columns='Unnamed: 0')
y_test = pd.read_csv(os.path.join(path, "train_test_data/y_test.csv")).drop(columns='Unnamed: 0')

In [27]:
# Window length: number of consecutive rows stacked into each sample by
# CreateRainfullDataSet.
L = 10
def CreateRainfullDataSet(X, y, L, location_columns=range(16, 65)):
    """Cut a feature table into fixed-length sliding windows, one location at a time.

    For every column position in ``location_columns`` the rows where that
    column equals 1 are collected. Starting with the L-th such row, each row
    position ``i`` produces one window made of the row positions
    ``i - L + 1 .. i``. Windows are taken by absolute row position, so the
    rows of a location are assumed to be stored contiguously and in time
    order; this is not checked here.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; the columns listed in ``location_columns`` are treated
        as 0/1 location indicators.
    y : pandas.DataFrame or pandas.Series
        Labels aligned row-by-row with ``X`` (one value per row).
    L : int
        Window length, at least 1.
    location_columns : iterable of int, optional
        Positional indices of the indicator columns. Defaults to
        ``range(16, 65)``, the range originally hard-coded in this function.

    Returns
    -------
    X_windows : numpy.ndarray, shape (n_windows, n_features, L)
    y_windows : numpy.ndarray, shape (n_windows, L)
        Labels of every row in each window; the last column is the label of
        the window's final row.

    Raises
    ------
    ValueError
        If ``L`` is smaller than 1.
    """
    if L < 1:
        raise ValueError("L must be a positive window length, got %r" % (L,))

    input_size = X.shape[1]
    # Offsets -L+1 .. 0 turn a window's last row position into all L positions.
    offsets = np.arange(-L + 1, 1)

    # Zero-row float seeds fix the result dtype (float promotion) and keep the
    # output well-formed when no location yields a window. Chunks are gathered
    # in lists and concatenated once, instead of re-copying the growing result
    # on every iteration.
    X_chunks = [np.empty([0, L, input_size])]
    y_chunks = [np.empty([0, L])]

    for j in location_columns:
        location_labels = np.where(X.iloc[:, j] == 1)[0]
        # The first L-1 rows of a location have too little history for a window.
        window_ends = location_labels[(L - 1):]
        # (n_windows, L) grid of row positions, flattened window after window.
        window_rows = (window_ends[:, None] + offsets).ravel()

        X_chunks.append(X.iloc[window_rows, :].values.reshape(-1, L, input_size))
        y_chunks.append(y.iloc[window_rows].values.reshape(-1, L))

    X_windows = np.concatenate(X_chunks, 0)
    y_windows = np.concatenate(y_chunks, 0)

    # Reorder (window, time, feature) -> (window, feature, time).
    return np.transpose(X_windows, (0, 2, 1)), y_windows

# Window the train and test splits with the same window length L.
X_train_TS, y_train_TS = CreateRainfullDataSet(X=X_train, y=y_train, L=L)
X_test_TS, y_test_TS = CreateRainfullDataSet(X=X_test, y=y_test, L=L)

In [47]:
# NOTE(review): the recorded output (144509, 1) comes from a later execution
# (In [47]), after ColumnConcatenator had already replaced X_train_TS. Run in
# top-to-bottom order, X_train_TS is still the 3-D window array at this point.
X_train_TS.shape

(144509, 1)

In [48]:
# NOTE(review): the recorded 144509 matches the number of windows, i.e. it was
# produced after y_train had been rebound to the per-window label array further
# down. Run in top-to-bottom order, y_train is still the loaded label table here.
len(y_train)

144509

In [28]:
# Collapse each label window to a single target: the label of the window's
# last time step, truncated to an integer and stored as a float.
# NOTE: this rebinds y_train / y_test from the loaded tables to 1-D arrays.
y_train = np.array([int(window[-1]) for window in y_train_TS], dtype=float)
y_test = np.array([int(window[-1]) for window in y_test_TS], dtype=float)

In [31]:
# Join the feature dimensions of every training window into one long series
# per sample (ColumnConcatenator), fitting the transformer on the training
# panel. Despite its name, clf1 is a transformer, not a classifier.
clf1 = ColumnConcatenator().fit(X_train_TS)
X_train_TS = clf1.transform(X_train_TS)

In [33]:
# Apply the concatenator that was fitted on the training panel; a transformer
# must not be re-fitted on the test split.
X_test_TS = clf1.transform(X_test_TS)

In [34]:
# Inspect the concatenated test panel: a single column whose cells each hold
# one whole series (see the recorded output).
X_test_TS

Unnamed: 0,0
0,0 10.6 1 12.5 2 13.1 3 ...
1,0 12.5 1 13.1 2 13.5 3 ...
2,0 13.1 1 13.5 2 27.0 3 ...
3,0 13.5 1 27.0 2 18.2 3 ...
4,0 27.0 1 18.2 2 13.6 3 ...
...,...
47896,0 9.6 1 4.4 2 4.9 3 8....
47897,0 4.4 1 4.9 2 8.2 3 3....
47898,0 4.9 1 8.2 2 3.9 3 6....
47899,0 8.2 1 3.9 2 6.1 3 7....


**n_estimators=10**

In [42]:
# Baseline forest: 10 trees, fixed seed so the run is repeatable.
clf = TimeSeriesForestClassifier(n_estimators=10, random_state=42)
clf.fit(X_train_TS, y_train)

TimeSeriesForestClassifier(n_estimators=10, random_state=42)

In [43]:
y_pred = clf.predict(X_test_TS)
# Score these predictions against y_test (one label per window), not
# y_test_TS, which holds the full length-L label window of every sample.

In [44]:
# Share of test windows whose last-step label is predicted correctly.
accuracy_score(y_test, y_pred)

0.7878541157804638

In [45]:
# Column 1 of predict_proba is the probability of the second class (1.0 in the
# report below); it is the score used for ROC AUC.
y_pred_prob = clf.predict_proba(X_test_TS)[:,1]
# Per-class precision/recall: the recorded report shows recall of only 0.15
# for class 1.0, so accuracy alone overstates performance.
print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, y_pred_prob))

              precision    recall  f1-score   support

         0.0       0.81      0.96      0.88     37763
         1.0       0.50      0.15      0.23     10138

    accuracy                           0.79     47901
   macro avg       0.65      0.55      0.55     47901
weighted avg       0.74      0.79      0.74     47901

0.6789902449760291


In [46]:
# Accuracy on the training windows. The recorded 1.0, against roughly 0.79 on
# the test set, shows the forest reproduces its training data perfectly.
y_pred_train = clf.predict(X_train_TS)
accuracy_score(y_train, y_pred_train)

1.0

In [37]:
# Same 10-tree forest with min_interval raised to 10 (presumably the shortest
# interval length the forest may sample -- confirm in the sktime docs). The
# recorded test accuracy is essentially unchanged from the default setting.
clf = TimeSeriesForestClassifier(n_estimators=10, min_interval = 10, random_state=42)
clf.fit(X_train_TS, y_train)
y_pred = clf.predict(X_test_TS)
accuracy_score(y_test, y_pred)

0.7878958685622429

In [38]:
# Class-1.0 probabilities from the min_interval=10 forest, then the per-class
# report and ROC AUC on the test windows.
y_pred_prob = clf.predict_proba(X_test_TS)[:,1]
print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, y_pred_prob))

              precision    recall  f1-score   support

         0.0       0.81      0.96      0.88     37763
         1.0       0.50      0.15      0.23     10138

    accuracy                           0.79     47901
   macro avg       0.65      0.55      0.55     47901
weighted avg       0.74      0.79      0.74     47901

0.6824511203329074


**n_estimators=20 and n_estimators=30**

In [55]:
# 20-tree forest: fit, predict labels and class-1.0 probabilities on the test
# windows, then print the per-class report and ROC AUC.
clf = TimeSeriesForestClassifier(n_estimators=20, random_state=42)
clf.fit(X_train_TS, y_train)
y_pred = clf.predict(X_test_TS)
y_pred_prob = clf.predict_proba(X_test_TS)[:,1]
print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, y_pred_prob))

              precision    recall  f1-score   support

         0.0       0.81      0.97      0.88     37763
         1.0       0.56      0.15      0.23     10138

    accuracy                           0.80     47901
   macro avg       0.69      0.56      0.56     47901
weighted avg       0.76      0.80      0.74     47901

0.7222496693368715


In [56]:
# Test accuracy of the 20-tree forest fitted in the previous cell.
accuracy_score(y_test, y_pred)

0.795432245673368

In [57]:
# 30-tree forest, evaluated the same way as the smaller forests: labels and
# class-1.0 probabilities on the test windows, per-class report, ROC AUC.
clf = TimeSeriesForestClassifier(n_estimators=30, random_state=42)
clf.fit(X_train_TS, y_train)
y_pred = clf.predict(X_test_TS)
y_pred_prob = clf.predict_proba(X_test_TS)[:,1]
print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, y_pred_prob))

              precision    recall  f1-score   support

         0.0       0.81      0.97      0.88     37763
         1.0       0.61      0.14      0.23     10138

    accuracy                           0.80     47901
   macro avg       0.71      0.56      0.56     47901
weighted avg       0.77      0.80      0.75     47901

0.7342156290486261


In [58]:
# Test accuracy of the 30-tree forest fitted in the previous cell.
accuracy_score(y_test, y_pred)

0.7988768501701425

**n_estimators=200**

In [35]:
# 200-tree forest; n_jobs=-1 requests all available cores for the larger
# ensemble. Evaluation is the same as for the smaller forests.
clf = TimeSeriesForestClassifier(n_estimators=200, n_jobs = -1, random_state=42)
clf.fit(X_train_TS, y_train)
y_pred = clf.predict(X_test_TS)
y_pred_prob = clf.predict_proba(X_test_TS)[:,1]
print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, y_pred_prob))

              precision    recall  f1-score   support

         0.0       0.81      0.98      0.89     37763
         1.0       0.67      0.13      0.21     10138

    accuracy                           0.80     47901
   macro avg       0.74      0.56      0.55     47901
weighted avg       0.78      0.80      0.74     47901

0.759898088475273


In [36]:
# Test accuracy of the 200-tree forest fitted in the previous cell.
accuracy_score(y_test, y_pred)

0.8019456796309055

**n_estimators=400**

In [39]:
# 400-tree forest, again using all available cores (n_jobs=-1). In the
# recorded runs ROC AUC keeps rising with ensemble size while recall for
# class 1.0 stays around 0.13-0.15.
clf = TimeSeriesForestClassifier(n_estimators=400, n_jobs = -1, random_state=42)
clf.fit(X_train_TS, y_train)
y_pred = clf.predict(X_test_TS)
y_pred_prob = clf.predict_proba(X_test_TS)[:,1]
print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, y_pred_prob))

              precision    recall  f1-score   support

         0.0       0.81      0.98      0.89     37763
         1.0       0.67      0.13      0.22     10138

    accuracy                           0.80     47901
   macro avg       0.74      0.56      0.56     47901
weighted avg       0.78      0.80      0.75     47901

0.7678522604721945


In [40]:
# Test accuracy of the 400-tree forest fitted in the previous cell.
accuracy_score(y_test, y_pred)

0.8029477463936034

In [41]:
# Accuracy on the training windows for the 400-tree forest. The recorded 1.0,
# against roughly 0.80 on the test set, shows the larger ensemble still fits
# its training data perfectly.
y_pred_train = clf.predict(X_train_TS)
accuracy_score(y_train, y_pred_train)

1.0