In [22]:
import pandas as pd
import synthia as syn
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
import synthia as syn
import pyvinecopulib as pv

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

# Columns of the dataset that are fed to the classifier as inputs.
FEATURE_COLUMNS = [
    "Type",
    "Air temperature [K]",
    "Process temperature [K]",
    "Rotational speed [rpm]",
    "Torque [Nm]",
    "Tool wear [min]",
]

# Label columns available in the dataset.
LABEL_A = "Target"
LABEL_B = "Failure Type"

# Cross-validation / split settings.
K_FOLDS = 5
TEST_SPLIT_SIZE = 0.25

# Number of synthetic positive samples to add, expressed as a fraction of the
# real positive samples in the training data (1 = +100% positive samples).
PERCENTAGE_ADDED_SYNTHETIC_DATA = 0.5

# Keyword arguments forwarded to the synthetic data generator's generate() call.
GENERATE_KWARGS = {
    "uniformization_ratio": 0,
    "stretch_factor": 1,
}

# Random seed (used for the shuffling of the stratified K-fold split).
SEED = 42

In [23]:
# Read the raw CSV, then replace the "Type" column with the integer codes
# produced by a LabelEncoder so it can be used as a numeric feature.
df = pd.read_csv("../data/raw/predictive_maintenance.csv")
df = df.assign(Type=LabelEncoder().fit_transform(df["Type"]))
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,2,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,1,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,1,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,1,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,1,298.2,308.7,1408,40.0,9,0,No Failure


In [24]:
# Class balance check: count how many rows fall into each "Target" value.
df.loc[:, "Target"].value_counts()

0    9661
1     339
Name: Target, dtype: int64

### Problem with label="Target"

In [25]:
# Feature matrix and label vector as plain NumPy arrays (LABEL_A is the label here).
X = df[FEATURE_COLUMNS].to_numpy()
y = df[LABEL_A].to_numpy()

In [26]:
def _fit_and_score(X_fit, y_fit, X_eval, y_eval):
    """Fit a random forest on (X_fit, y_fit) and score it on (X_eval, y_eval).

    The classifier is created with ``random_state=SEED`` so that re-running
    the notebook on the same training data gives the same metrics.

    Returns:
        The (precision, recall, f1-score, support) tuple returned by
        ``precision_recall_fscore_support`` with ``average="binary"``.
    """
    clf = RandomForestClassifier(random_state=SEED)
    clf.fit(X_fit, y_fit)
    return precision_recall_fscore_support(y_eval, clf.predict(X_eval), average="binary")


skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=SEED)

metrics_normal = []  # per-fold scores: trained on the original training fold
metrics_syn = []     # per-fold scores: trained on the fold plus synthetic positives
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the generator on the positive samples of this training fold only.
    # The label is appended as the last column; it is constant (all ones)
    # because of the mask.
    generator = syn.CopulaDataGenerator(verbose=False)
    mask = (y_train == 1).ravel()  # mask for positive samples
    positives = np.hstack((X_train[mask], y_train[mask].reshape(-1, 1)))
    generator.fit(positives, copula=syn.IndependenceCopula())

    # Generate additional positive samples. This augments the minority class
    # by PERCENTAGE_ADDED_SYNTHETIC_DATA; it does not fully balance the classes.
    # NOTE(review): the generator's sampling is not seeded here — check whether
    # synthia's generate() accepts a seed to make this step reproducible too.
    n_samples = int(mask.sum() * PERCENTAGE_ADDED_SYNTHETIC_DATA)
    generated_samples = generator.generate(n_samples=n_samples, **GENERATE_KWARGS)

    # Baseline: classifier trained on the original (imbalanced) fold.
    metrics_normal.append(_fit_and_score(X_train, y_train, X_test, y_test))

    # Augmented: original fold plus the generated positives
    # (last generated column is the label, the rest are features).
    X_train_syn = np.vstack((X_train, generated_samples[:, :-1]))
    y_train_syn = np.hstack((y_train, generated_samples[:, -1]))
    metrics_syn.append(_fit_and_score(X_train_syn, y_train_syn, X_test, y_test))

In [27]:
# Mean precision / recall / F1 over the folds for the classifier trained on the original data.
(
    pd.DataFrame(metrics_normal, columns=["precision", "recall", "f1-score", "support"])
    .loc[:, ["precision", "recall", "f1-score"]]
    .mean()
)

precision    0.888307
recall       0.595874
f1-score     0.711059
dtype: float64

In [28]:
# Mean precision / recall / F1 over the folds for the classifier trained with added synthetic positives.
(
    pd.DataFrame(metrics_syn, columns=["precision", "recall", "f1-score", "support"])
    .loc[:, ["precision", "recall", "f1-score"]]
    .mean()
)

precision    0.839156
recall       0.654829
f1-score     0.733380
dtype: float64