In [60]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from synthcity.plugins import Plugins

# Input columns handed to the classifiers, in this order.
FEATURE_COLUMNS = [
    "Type",
    "Air temperature [K]",
    "Process temperature [K]",
    "Rotational speed [rpm]",
    "Torque [Nm]",
    "Tool wear [min]",
]

# Names of the two label columns present in the raw data.
LABEL_A = "Target"
LABEL_B = "Failure Type"

# Columns kept for modelling: every feature followed by the LABEL_A column.
FEATURES_AND_LABEL = [*FEATURE_COLUMNS, LABEL_A]

K_FOLDS = 5
TEST_SPLIT_SIZE = 0.25

PERCENTAGE_ADDED_SYNTHETIC_DATA = 0.5  # 1 = +100% positive samples

# Keyword arguments forwarded to the generator's generate() call.
GENERATE_KWARGS = {
    "uniformization_ratio": 0,
    "stretch_factor": 1,
}

SEEDS = (10,)

def set_seed(seed):
    """Seed the global Python and NumPy random number generators.

    Also stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.

    Parameters
    ----------
    seed : int
        Value passed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# def get_synthia_independence_copula(X_train, y_train):
#     generator = syn.CopulaDataGenerator(verbose=False)
#     mask = (y_train == 1).ravel() # mask for positive samples
#     generator.fit(np.hstack((X_train[mask], y_train[mask].reshape(-1, 1))), copula=syn.IndependenceCopula())
#     return generator,mask

# def get_synthia_gaussian_copula(X_train, y_train):
#     generator = syn.CopulaDataGenerator(verbose=False)
#     mask = (y_train == 1).ravel() # mask for positive samples
#     parameterizer = syn.QuantileParameterizer(n_quantiles=100)
        
#     generator.fit(np.hstack((X_train[mask], y_train[mask].reshape(-1, 1))), copula=syn.GaussianCopula(), parameterize_by=parameterizer)
#     return generator,mask

In [52]:
# Read the raw CSV, then replace the "Type" column with the integer codes
# produced by a freshly fitted LabelEncoder.
df = pd.read_csv("../data/raw/predictive_maintenance.csv")
df = df.assign(Type=LabelEncoder().fit_transform(df["Type"]))
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,2,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,1,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,1,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,1,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,1,298.2,308.7,1408,40.0,9,0,No Failure


In [53]:
# Keep only the columns listed in FEATURES_AND_LABEL.
df = df[FEATURES_AND_LABEL]
df.head()


Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
0,2,298.1,308.6,1551,42.8,0,0
1,1,298.2,308.7,1408,46.3,3,0
2,1,298.1,308.5,1498,49.4,5,0
3,1,298.2,308.6,1433,39.5,7,0
4,1,298.2,308.7,1408,40.0,9,0


### Problem with label="Target"

In [56]:
# Single hold-out experiment: add CTGAN-generated positive samples to the
# training split and score a random forest on the untouched test split.
syn_model = Plugins().get("ctgan")

X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, FEATURE_COLUMNS], df.loc[:, LABEL_A],
    test_size=TEST_SPLIT_SIZE, random_state=20
)
train_indexes = X_train.index

# Positive rows of the training split only. The mask is built from the
# training frame itself so that its index matches the frame being filtered;
# a mask taken from the full `df` has a different index and makes pandas
# reindex it with a warning.
df_train = df.loc[train_indexes, :]
df_gen_positives = df_train[df_train[LABEL_A] == 1]

syn_model.fit(df_gen_positives)
# NOTE(review): this requests as many synthetic rows as there are real
# positives; PERCENTAGE_ADDED_SYNTHETIC_DATA is not applied here - confirm
# that this is intended.
generated_data = syn_model.generate(count=len(df_gen_positives)).dataframe()

# Combine real and synthetic rows as pandas objects so the classifier is
# fitted with the same column names that X_test carries at predict time.
X_train_syn = pd.concat(
    [X_train, generated_data.loc[:, FEATURE_COLUMNS]], ignore_index=True
)
y_train_syn = pd.concat(
    [y_train, generated_data.loc[:, LABEL_A]], ignore_index=True
)
clf_syn = RandomForestClassifier(random_state=20)
clf_syn = clf_syn.fit(X_train_syn, y_train_syn)

# Evaluate the classifier trained on the augmented data
y_pred_syn = clf_syn.predict(X_test)
clf_report_syn = classification_report(y_test, y_pred_syn)
clf_conf_matrix_syn = confusion_matrix(y_test, y_pred_syn)
one_metrics_syn = precision_recall_fscore_support(y_test, y_pred_syn, average="binary")


 42%|████▏     | 849/2000 [05:15<07:07,  2.69it/s]


In [57]:
# (precision, recall, f1-score, support) of the classifier trained on the
# augmented data; computed with average="binary", the support entry is None.
one_metrics_syn

(0.7213114754098361, 0.5365853658536586, 0.6153846153846154, None)

In [61]:
# Cross-validated comparison of a random forest trained on the original
# (imbalanced) folds versus the same folds augmented with CTGAN-generated
# positive samples. One "normal" and one "synthetic" row is recorded per fold.
results = []
# NOTE(review): this rebinds the SEEDS value from the configuration cell, and
# the new value stays in effect for any cell executed afterwards.
SEEDS = [42]
for seed in SEEDS:
    set_seed(seed)
    skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=seed)

    for train_index, test_index in skf.split(df.loc[:, FEATURE_COLUMNS],
                                             df.loc[:, LABEL_A]):
        # StratifiedKFold.split yields positional indices, so rows are taken
        # with .iloc; label-based .loc would only be correct while df keeps a
        # default RangeIndex.
        df_gen = df.iloc[train_index]
        df_test = df.iloc[test_index]

        # Fit the generator on the positive training rows of this fold only
        syn_model = Plugins().get("ctgan")
        df_gen_positives = df_gen[df_gen[LABEL_A] == 1]

        syn_model.fit(df_gen_positives)
        generated_data = syn_model.generate(count=len(df_gen_positives)).dataframe()

        X_train, X_test = df_gen.loc[:, FEATURE_COLUMNS], df_test.loc[:, FEATURE_COLUMNS]
        y_train, y_test = df_gen.loc[:, LABEL_A], df_test.loc[:, LABEL_A]

        # Fit classifier on imbalanced data
        clf = RandomForestClassifier(random_state=seed)
        clf = clf.fit(X_train, y_train)
        # Evaluate classifier on imbalanced data
        y_pred = clf.predict(X_test)
        clf_report = classification_report(y_test, y_pred)
        clf_conf_matrix = confusion_matrix(y_test, y_pred)
        one_metrics_normal = precision_recall_fscore_support(y_test, y_pred, average="binary")
        results.append(["normal"] + list(one_metrics_normal) + [seed])

        # Fit classifier on augmented data. Real and synthetic rows are
        # combined as pandas objects so the classifier keeps the column names
        # that X_test carries at predict time.
        X_train_syn = pd.concat(
            [X_train, generated_data.loc[:, FEATURE_COLUMNS]], ignore_index=True
        )
        y_train_syn = pd.concat(
            [y_train, generated_data.loc[:, LABEL_A]], ignore_index=True
        )
        clf_syn = RandomForestClassifier(random_state=seed)
        clf_syn = clf_syn.fit(X_train_syn, y_train_syn)
        # Evaluate classifier on augmented data
        y_pred_syn = clf_syn.predict(X_test)
        clf_report_syn = classification_report(y_test, y_pred_syn)
        clf_conf_matrix_syn = confusion_matrix(y_test, y_pred_syn)
        one_metrics_syn = precision_recall_fscore_support(y_test, y_pred_syn, average="binary")
        results.append(["synthetic"] + list(one_metrics_syn) + [seed])

# NOTE(review): the variable name says "synthia_ind", but the rows collected
# here come from the "ctgan" plugin; the name is kept because the summary cell
# reads it.
df_results_synthia_ind = pd.DataFrame(
    results,
    columns=["dataset_type", "precision", "recall", "f1-score", "support", "seed"],
).drop(columns=["support"])

 27%|██▋       | 549/2000 [03:44<09:52,  2.45it/s]
 22%|██▏       | 449/2000 [02:37<09:03,  2.85it/s]
 30%|██▉       | 599/2000 [03:32<08:16,  2.82it/s]
 17%|█▋        | 349/2000 [02:00<09:30,  2.90it/s]
 22%|██▏       | 449/2000 [02:49<09:45,  2.65it/s]


In [62]:
# Average the folds within each (dataset_type, seed) pair first, then average
# those per-seed means for each dataset_type.
(
    df_results_synthia_ind
    .groupby(["dataset_type", "seed"])
    .mean()
    .reset_index()
    .groupby("dataset_type")
    .mean()
)

Unnamed: 0_level_0,seed,precision,recall,f1-score
dataset_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
normal,42.0,0.877407,0.578183,0.694386
synthetic,42.0,0.840951,0.657814,0.732997


In [45]:
# Cross-validated comparison of a random forest trained on the original folds
# versus folds augmented with samples from `get_synthia_gaussian_copula`.
# NOTE(review): `X`, `y` and `get_synthia_gaussian_copula` are not defined by
# any active code in the visible part of this notebook (the helper only
# appears commented out), so this cell is expected to fail on a fresh kernel
# until those definitions are restored.
results = []
for seed in SEEDS:
    set_seed(seed)
    skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=seed)

    for train_index, test_index in skf.split(X, y):
        # assumes X and y are array-likes that accept positional index arrays - TODO confirm
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # fit generator on positive samples
        generator, mask = get_synthia_gaussian_copula(X_train, y_train)

        # generate positive samples: a PERCENTAGE_ADDED_SYNTHETIC_DATA fraction
        # of the number of entries selected by the mask
        n_samples = int(mask.sum() * PERCENTAGE_ADDED_SYNTHETIC_DATA)
        generated_samples = generator.generate(n_samples=n_samples, seed=seed, **GENERATE_KWARGS)

        # Fit classifier on imbalanced data
        clf = RandomForestClassifier(random_state=seed)
        clf = clf.fit(X_train, y_train)
        # Evaluate classifier on imbalanced data
        y_pred = clf.predict(X_test)
        clf_report = classification_report(y_test, y_pred)
        clf_conf_matrix = confusion_matrix(y_test, y_pred)
        one_metrics_normal = precision_recall_fscore_support(y_test, y_pred, average="binary")
        results.append(["normal"] + list(one_metrics_normal) + [seed])

        # Fit classifier on balanced data
        # All but the last generated column are used as features and the last
        # column as the label (presumably matching how the generator was
        # fitted - verify against the helper).
        X_train_syn = np.vstack((X_train, generated_samples[:, :-1]))
        y_train_syn = np.hstack((y_train, generated_samples[:, -1]))
        clf_syn = RandomForestClassifier(random_state=seed)
        clf_syn = clf_syn.fit(X_train_syn, y_train_syn)
        # Evaluate classifier on balanced data
        y_pred_syn = clf_syn.predict(X_test)
        clf_report_syn = classification_report(y_test, y_pred_syn)
        clf_conf_matrix_syn = confusion_matrix(y_test, y_pred_syn)
        one_metrics_syn = precision_recall_fscore_support(y_test, y_pred_syn, average="binary")
        results.append(["synthetic"] + list(one_metrics_syn)+ [seed])

# One row per fold and dataset type; the "support" column is dropped.
df_results_synthia_gaussian = pd.DataFrame(results, columns=["dataset_type","precision", "recall", "f1-score", "support", "seed"]).drop(columns=["support"])

In [46]:
# Average the folds within each (dataset_type, seed) pair first, then average
# those per-seed means for each dataset_type.
(
    df_results_synthia_gaussian
    .groupby(["dataset_type", "seed"])
    .mean()
    .reset_index()
    .groupby("dataset_type")
    .mean()
)

Unnamed: 0_level_0,seed,precision,recall,f1-score
dataset_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
normal,30.0,0.887472,0.614864,0.723908
synthetic,30.0,0.828406,0.658455,0.732019
