In [35]:
import pandas as pd
import synthia as syn
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
import synthia as syn
import pyvinecopulib as pv
import random
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

FEATURE_COLUMNS = [
    'Type',
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]', 
    'Torque [Nm]',
    'Tool wear [min]'
]
LABEL_A = "Target"
LABEL_B = "Failure Type"

K_FOLDS = 5
TEST_SPLIT_SIZE = 0.25

PERCENTAGE_ADDED_SYNTHETIC_DATA = 0.5 # 1 = +100% positive samples

GENERATE_KWARGS = dict(
    uniformization_ratio=0,
    stretch_factor=1
)

SEEDS = (10, 20, 30, 40, 50)

def set_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
def get_synthia_independence_copula(X_train, y_train):
    generator = syn.CopulaDataGenerator(verbose=False)
    mask = (y_train == 1).ravel() # mask for positive samples
    generator.fit(np.hstack((X_train[mask], y_train[mask].reshape(-1, 1))), copula=syn.IndependenceCopula())
    return generator,mask

def get_synthia_gaussian_copula(X_train, y_train):
    generator = syn.CopulaDataGenerator(verbose=False)
    mask = (y_train == 1).ravel() # mask for positive samples
    parameterizer = syn.QuantileParameterizer(n_quantiles=100)
        
    generator.fit(np.hstack((X_train[mask], y_train[mask].reshape(-1, 1))), copula=syn.GaussianCopula(), parameterize_by=parameterizer)
    return generator,mask

In [9]:
df = (
    pd.read_csv("../data/raw/predictive_maintenance.csv")
    .assign(
        Type=lambda x: LabelEncoder().fit_transform(x["Type"])
    )
)
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,2,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,1,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,1,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,1,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,1,298.2,308.7,1408,40.0,9,0,No Failure


In [10]:
df["Target"].value_counts()

0    9661
1     339
Name: Target, dtype: int64

### Problem with label="Target"

In [11]:
X = df.loc[:, FEATURE_COLUMNS].values
y = df.loc[:, LABEL_A].values

In [43]:
results = []
for seed in SEEDS:
    set_seed(seed)
    skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=seed)
    
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

    # fit generator on positive samples
        generator, mask = get_synthia_independence_copula(X_train, y_train)

    # generate positive samples
        n_samples = int(mask.sum() * PERCENTAGE_ADDED_SYNTHETIC_DATA)
        generated_samples = generator.generate(n_samples=n_samples, seed=seed, **GENERATE_KWARGS)

    # Fit classifier on imbalanced data
        clf = RandomForestClassifier(random_state=seed)
        clf = clf.fit(X_train, y_train)
    # Evaluate classifier on imbalanced data
        y_pred = clf.predict(X_test)
        clf_report = classification_report(y_test, y_pred)
        clf_conf_matrix = confusion_matrix(y_test, y_pred)
        one_metrics_normal = precision_recall_fscore_support(y_test, y_pred, average="binary")
        results.append(["normal"] + list(one_metrics_normal) + [seed])

    # Fit classifier on balanced data
        X_train_syn = np.vstack((X_train, generated_samples[:, :-1]))
        y_train_syn = np.hstack((y_train, generated_samples[:, -1]))
        clf_syn = RandomForestClassifier(random_state=seed)
        clf_syn = clf_syn.fit(X_train_syn, y_train_syn)
    # Evaluate classifier on balanced data
        y_pred_syn = clf_syn.predict(X_test)
        clf_report_syn = classification_report(y_test, y_pred_syn)
        clf_conf_matrix_syn = confusion_matrix(y_test, y_pred_syn)
        one_metrics_syn = precision_recall_fscore_support(y_test, y_pred_syn, average="binary")
        results.append(["synthetic"] + list(one_metrics_syn)+ [seed])

df_results_synthia_ind = pd.DataFrame(results, columns=["dataset_type","precision", "recall", "f1-score", "support", "seed"]).drop(columns=["support"])

In [44]:
df_results_synthia_ind.groupby(["dataset_type", "seed"]).mean().reset_index().groupby("dataset_type").mean()

Unnamed: 0_level_0,seed,precision,recall,f1-score
dataset_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
normal,30.0,0.887472,0.614864,0.723908
synthetic,30.0,0.845157,0.652537,0.734546


In [45]:
results = []
for seed in SEEDS:
    set_seed(seed)
    skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=seed)

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

    # fit generator on positive samples
        generator, mask = get_synthia_gaussian_copula(X_train, y_train)

    # generate positive samples
        n_samples = int(mask.sum() * PERCENTAGE_ADDED_SYNTHETIC_DATA)
        generated_samples = generator.generate(n_samples=n_samples, seed=seed, **GENERATE_KWARGS)

    # Fit classifier on imbalanced data
        clf = RandomForestClassifier(random_state=seed)
        clf = clf.fit(X_train, y_train)
    # Evaluate classifier on imbalanced data
        y_pred = clf.predict(X_test)
        clf_report = classification_report(y_test, y_pred)
        clf_conf_matrix = confusion_matrix(y_test, y_pred)
        one_metrics_normal = precision_recall_fscore_support(y_test, y_pred, average="binary")
        results.append(["normal"] + list(one_metrics_normal) + [seed])

    # Fit classifier on balanced data
        X_train_syn = np.vstack((X_train, generated_samples[:, :-1]))
        y_train_syn = np.hstack((y_train, generated_samples[:, -1]))
        clf_syn = RandomForestClassifier(random_state=seed)
        clf_syn = clf_syn.fit(X_train_syn, y_train_syn)
    # Evaluate classifier on balanced data
        y_pred_syn = clf_syn.predict(X_test)
        clf_report_syn = classification_report(y_test, y_pred_syn)
        clf_conf_matrix_syn = confusion_matrix(y_test, y_pred_syn)
        one_metrics_syn = precision_recall_fscore_support(y_test, y_pred_syn, average="binary")
        results.append(["synthetic"] + list(one_metrics_syn)+ [seed])

df_results_synthia_gaussian = pd.DataFrame(results, columns=["dataset_type","precision", "recall", "f1-score", "support", "seed"]).drop(columns=["support"])

In [46]:
df_results_synthia_gaussian.groupby(["dataset_type", "seed"]).mean().reset_index().groupby("dataset_type").mean()

Unnamed: 0_level_0,seed,precision,recall,f1-score
dataset_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
normal,30.0,0.887472,0.614864,0.723908
synthetic,30.0,0.828406,0.658455,0.732019
