In [1]:
!pip install synthcity 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install sdv

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from synthcity.plugins import Plugins
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer
from sklearn.model_selection import train_test_split

# Names of the two label columns present in the raw CSV.
LABEL_A = "Target"
LABEL_B = "Failure Type"

# Input columns fed to the classifier.
FEATURE_COLUMNS = [
    'Type',
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]'
]

# Feature columns followed by the LABEL_A column. Derived from the two
# definitions above so the lists cannot drift apart.
FEATURES_AND_LABEL = [*FEATURE_COLUMNS, LABEL_A]

# Number of stratified folds used by the cross-validation loops.
K_FOLDS = 5
# Fraction of rows held out as the test split.
TEST_SPLIT_SIZE = 0.25

PERCENTAGE_ADDED_SYNTHETIC_DATA = 0.5 # 1 = +100% positive samples

# NOTE(review): not referenced by the cells visible in this notebook;
# presumably left over from the commented-out synthia generators -- confirm.
GENERATE_KWARGS = dict(
    uniformization_ratio=0,
    stretch_factor=1
)

# Default seeds for repeated experiments.
SEEDS = (10,)

def set_seed(seed):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``.

    Also exports ``seed`` as the ``PYTHONHASHSEED`` environment variable.
    Python reads that variable at interpreter start-up, so the assignment
    only influences processes spawned afterwards, not the running kernel.

    Parameters
    ----------
    seed : int
        Value passed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)



# def get_synthia_independence_copula(X_train, y_train):
#     generator = syn.CopulaDataGenerator(verbose=False)
#     mask = (y_train == 1).ravel() # mask for positive samples
#     generator.fit(np.hstack((X_train[mask], y_train[mask].reshape(-1, 1))), copula=syn.IndependenceCopula())
#     return generator,mask

# def get_synthia_gaussian_copula(X_train, y_train):
#     generator = syn.CopulaDataGenerator(verbose=False)
#     mask = (y_train == 1).ravel() # mask for positive samples
#     parameterizer = syn.QuantileParameterizer(n_quantiles=100)
        
#     generator.fit(np.hstack((X_train[mask], y_train[mask].reshape(-1, 1))), copula=syn.GaussianCopula(), parameterize_by=parameterizer)
#     return generator,mask

In [4]:
# Load the raw CSV, then replace the categorical "Type" column with the
# integer codes produced by a freshly fitted LabelEncoder.
df = pd.read_csv("/content/predictive_maintenance.csv")
df = df.assign(Type=LabelEncoder().fit_transform(df["Type"]))
df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,2,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,1,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,1,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,1,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,1,298.2,308.7,1408,40.0,9,0,No Failure


# Copula with SDV

In [5]:
# Keep only the feature columns and the LABEL_A column for the experiments below.
df = df[FEATURES_AND_LABEL]
df.head()


Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
0,2,298.1,308.6,1551,42.8,0,0
1,1,298.2,308.7,1408,46.3,3,0
2,1,298.1,308.5,1498,49.4,5,0
3,1,298.2,308.6,1433,39.5,7,0
4,1,298.2,308.7,1408,40.0,9,0


## Gaussian Copula


In [6]:

# Choose the data to fit: hold out a test split first so the synthesizer only
# ever sees training rows.
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, FEATURE_COLUMNS], df.loc[:, LABEL_A],
    test_size=TEST_SPLIT_SIZE, random_state=20
)
train_indexes = X_train.index

# Positive rows (LABEL_A == 1) of the training split. The mask is built from
# the training subset itself so its index matches the frame being filtered.
df_gen_positives = df.loc[train_indexes, :]
df_gen_positives = df_gen_positives[df_gen_positives[LABEL_A] == 1]

# Fit a GaussianCopulaSynthesizer on the positive training rows and sample as
# many synthetic rows as there are real positives.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df_gen_positives)
Gaussian = GaussianCopulaSynthesizer(metadata)
Gaussian.fit(df_gen_positives)
generated_data = Gaussian.sample(num_rows=len(df_gen_positives))

# Train the RandomForest on real + synthetic rows. The training data stays a
# DataFrame so the fitted feature names match the DataFrame passed to predict().
X_train_syn = pd.concat(
    [X_train, generated_data.loc[:, FEATURE_COLUMNS]], ignore_index=True
)
y_train_syn = pd.concat(
    [y_train, generated_data.loc[:, LABEL_A]], ignore_index=True
)
clf_syn = RandomForestClassifier(random_state=20)
clf_syn = clf_syn.fit(X_train_syn, y_train_syn)

# Evaluate the classifier trained on the augmented data on the untouched test split.
y_pred_syn = clf_syn.predict(X_test)
clf_report_syn = classification_report(y_test, y_pred_syn)
clf_conf_matrix_syn = confusion_matrix(y_test, y_pred_syn)
one_metrics_syn = precision_recall_fscore_support(y_test, y_pred_syn, average="binary")


In [7]:
# Tuple returned by precision_recall_fscore_support(average="binary"):
# (precision, recall, f1-score, support); support is None in the output below.
print(one_metrics_syn)

(0.9090909090909091, 0.6097560975609756, 0.7299270072992701, None)


## CTGAN

In [8]:

# Choose the data to fit: hold out a test split first so the synthesizer only
# ever sees training rows.
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, FEATURE_COLUMNS], df.loc[:, LABEL_A],
    test_size=TEST_SPLIT_SIZE, random_state=20
)
train_indexes = X_train.index

# Positive rows (LABEL_A == 1) of the training split. The mask is built from
# the training subset itself so its index matches the frame being filtered.
df_gen_positives = df.loc[train_indexes, :]
df_gen_positives = df_gen_positives[df_gen_positives[LABEL_A] == 1]

# Fit a CTGANSynthesizer on the positive training rows and sample as many
# synthetic rows as there are real positives.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df_gen_positives)
CTGAN = CTGANSynthesizer(metadata)
CTGAN.fit(df_gen_positives)
generated_data = CTGAN.sample(num_rows=len(df_gen_positives))

# Train the RandomForest on real + synthetic rows. The training data stays a
# DataFrame so the fitted feature names match the DataFrame passed to predict().
X_train_syn = pd.concat(
    [X_train, generated_data.loc[:, FEATURE_COLUMNS]], ignore_index=True
)
y_train_syn = pd.concat(
    [y_train, generated_data.loc[:, LABEL_A]], ignore_index=True
)
clf_syn = RandomForestClassifier(random_state=20)
clf_syn = clf_syn.fit(X_train_syn, y_train_syn)

# Evaluate the classifier trained on the augmented data on the untouched test split.
y_pred_syn = clf_syn.predict(X_test)
clf_report_syn = classification_report(y_test, y_pred_syn)
clf_conf_matrix_syn = confusion_matrix(y_test, y_pred_syn)
one_metrics_syn = precision_recall_fscore_support(y_test, y_pred_syn, average="binary")
print(one_metrics_syn)

(0.8620689655172413, 0.6097560975609756, 0.7142857142857144, None)


# K-fold with Gaussian Copula

In [9]:
# Seeds iterated by the k-fold experiments below; this rebinds the
# single-element SEEDS tuple defined in the configuration cell.
SEEDS = [42, 30, 2, 1500]

In [23]:
# Repeated stratified k-fold comparison: for every seed and fold, train one
# RandomForest on the real training rows ("normal") and one on the real rows
# plus Gaussian-copula samples of the positive class ("synthetic"), then
# record precision / recall / F1 on the held-out fold.
results = []

for seed in SEEDS:
    set_seed(seed)
    skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=seed)

    for train_index, test_index in skf.split(df.loc[:, FEATURE_COLUMNS],
                                             df.loc[:, LABEL_A]):
        # split() yields positional indices, so select rows with iloc; this
        # stays correct even if df does not carry a default RangeIndex.
        df_gen = df.iloc[train_index]
        df_test = df.iloc[test_index]

        # Fit the synthesizer on the positive training rows only and sample
        # as many synthetic rows as there are real positives in this fold.
        df_gen_positives = df_gen[df_gen[LABEL_A] == 1]
        metadata = SingleTableMetadata()
        metadata.detect_from_dataframe(data=df_gen_positives)
        Gaussian = GaussianCopulaSynthesizer(metadata)
        Gaussian.fit(df_gen_positives)
        generated_data = Gaussian.sample(num_rows=len(df_gen_positives))

        X_train, X_test = df_gen.loc[:, FEATURE_COLUMNS], df_test.loc[:, FEATURE_COLUMNS]
        y_train, y_test = df_gen.loc[:, LABEL_A], df_test.loc[:, LABEL_A]

        # Fit classifier on the original (imbalanced) data
        clf = RandomForestClassifier(random_state=seed)
        clf = clf.fit(X_train, y_train)
        # Evaluate classifier trained on the original data
        y_pred = clf.predict(X_test)
        clf_report = classification_report(y_test, y_pred)
        clf_conf_matrix = confusion_matrix(y_test, y_pred)
        one_metrics_normal = precision_recall_fscore_support(y_test, y_pred, average="binary")
        results.append(["normal"] + list(one_metrics_normal) + [seed])

        # Fit classifier on real + synthetic rows. The training data stays a
        # DataFrame so fitted feature names match the frame given to predict().
        X_train_syn = pd.concat(
            [X_train, generated_data.loc[:, FEATURE_COLUMNS]], ignore_index=True
        )
        y_train_syn = pd.concat(
            [y_train, generated_data.loc[:, LABEL_A]], ignore_index=True
        )
        clf_syn = RandomForestClassifier(random_state=seed)
        clf_syn = clf_syn.fit(X_train_syn, y_train_syn)
        # Evaluate classifier trained on the augmented data
        y_pred_syn = clf_syn.predict(X_test)
        clf_report_syn = classification_report(y_test, y_pred_syn)
        clf_conf_matrix_syn = confusion_matrix(y_test, y_pred_syn)
        one_metrics_syn = precision_recall_fscore_support(y_test, y_pred_syn, average="binary")
        results.append(["synthetic"] + list(one_metrics_syn) + [seed])

# One row per (dataset_type, seed, fold); the "support" entry is dropped.
df_results_synthia_ind_g = pd.DataFrame(results, columns=["dataset_type","precision", "recall", "f1-score", "support", "seed"]).drop(columns=["support"])

In [24]:
# Mean precision / recall / F1 over the folds of each (dataset_type, seed) pair.
# A single groupby is enough: grouping the already aggregated table by the same
# keys again would only average one row per group.
df_results_synthia_ind_g.groupby(["dataset_type", "seed"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall,f1-score
dataset_type,seed,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
normal,2,0.895433,0.584197,0.705248
normal,30,0.880226,0.619535,0.725086
normal,42,0.877407,0.578183,0.694386
normal,1500,0.891463,0.610536,0.723764
synthetic,2,0.836207,0.631343,0.718602
synthetic,30,0.843517,0.625285,0.715374
synthetic,42,0.847813,0.628358,0.719685
synthetic,1500,0.805655,0.672432,0.731153


# Kfold with CTGANs

In [12]:
# Repeated stratified k-fold comparison: for every seed and fold, train one
# RandomForest on the real training rows ("normal") and one on the real rows
# plus CTGAN samples of the positive class ("synthetic"), then record
# precision / recall / F1 on the held-out fold.
results = []

for seed in SEEDS:
    set_seed(seed)
    skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=seed)

    for train_index, test_index in skf.split(df.loc[:, FEATURE_COLUMNS],
                                             df.loc[:, LABEL_A]):
        # split() yields positional indices, so select rows with iloc; this
        # stays correct even if df does not carry a default RangeIndex.
        df_gen = df.iloc[train_index]
        df_test = df.iloc[test_index]

        # Fit the synthesizer on the positive training rows only and sample
        # as many synthetic rows as there are real positives in this fold.
        df_gen_positives = df_gen[df_gen[LABEL_A] == 1]
        metadata = SingleTableMetadata()
        metadata.detect_from_dataframe(data=df_gen_positives)
        CTGAN = CTGANSynthesizer(metadata)
        CTGAN.fit(df_gen_positives)
        generated_data = CTGAN.sample(num_rows=len(df_gen_positives))

        X_train, X_test = df_gen.loc[:, FEATURE_COLUMNS], df_test.loc[:, FEATURE_COLUMNS]
        y_train, y_test = df_gen.loc[:, LABEL_A], df_test.loc[:, LABEL_A]

        # Fit classifier on the original (imbalanced) data
        clf = RandomForestClassifier(random_state=seed)
        clf = clf.fit(X_train, y_train)
        # Evaluate classifier trained on the original data
        y_pred = clf.predict(X_test)
        clf_report = classification_report(y_test, y_pred)
        clf_conf_matrix = confusion_matrix(y_test, y_pred)
        one_metrics_normal = precision_recall_fscore_support(y_test, y_pred, average="binary")
        results.append(["normal"] + list(one_metrics_normal) + [seed])

        # Fit classifier on real + synthetic rows. The training data stays a
        # DataFrame so fitted feature names match the frame given to predict().
        X_train_syn = pd.concat(
            [X_train, generated_data.loc[:, FEATURE_COLUMNS]], ignore_index=True
        )
        y_train_syn = pd.concat(
            [y_train, generated_data.loc[:, LABEL_A]], ignore_index=True
        )
        clf_syn = RandomForestClassifier(random_state=seed)
        clf_syn = clf_syn.fit(X_train_syn, y_train_syn)
        # Evaluate classifier trained on the augmented data
        y_pred_syn = clf_syn.predict(X_test)
        clf_report_syn = classification_report(y_test, y_pred_syn)
        clf_conf_matrix_syn = confusion_matrix(y_test, y_pred_syn)
        one_metrics_syn = precision_recall_fscore_support(y_test, y_pred_syn, average="binary")
        results.append(["synthetic"] + list(one_metrics_syn) + [seed])

# One row per (dataset_type, seed, fold); the "support" entry is dropped.
df_results_synthia_ind = pd.DataFrame(results, columns=["dataset_type","precision", "recall", "f1-score", "support", "seed"]).drop(columns=["support"])

In [15]:
# Mean precision / recall / F1 over the folds of each (dataset_type, seed) pair.
# A single groupby is enough: grouping the already aggregated table by the same
# keys again would only average one row per group.
df_results_synthia_ind.groupby(["dataset_type", "seed"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall,f1-score
dataset_type,seed,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
normal,2,0.895433,0.584197,0.705248
normal,30,0.880226,0.619535,0.725086
normal,42,0.877407,0.578183,0.694386
normal,1500,0.891463,0.610536,0.723764
synthetic,2,0.851023,0.61655,0.71446
synthetic,30,0.80554,0.657814,0.722257
synthetic,42,0.863808,0.634241,0.728064
synthetic,1500,0.864425,0.678402,0.758964
