In [None]:
import pandas as pd
import os
from sklearn import preprocessing 
import glob
import matplotlib
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import tensorflow as tf
from tensorflow import keras
import tempfile

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Numeric feature columns: bucketed by df_to_bucket and scaled by df_to_scal.
NUM_COLUMNS = [
    'AB', 'CS', 'EP', 'G', 'LC', 'P', 'VBAC',
    'Age', 'weight', 'week',
]
# Categorical feature columns: integer-encoded by df_to_syn.
CAT_COLUMNS = ['amniotic_fluid', 'Position', 'induced']
# Full column layout, features first and the label ('target') last.
COLUMNS = [
    'AB', 'CS', 'EP', 'G', 'LC', 'P', 'VBAC', 'Age',
    'amniotic_fluid', 'Position', 'weight', 'week', 'induced',
    'target',
]
# Training hyper-parameters shared by every model.fit / predict / evaluate call.
BATCH_SIZE = 2048
EPOCHS = 100

In [None]:
# Default size for every matplotlib figure created in this notebook.
mpl.rcParams.update({'figure.figsize': (12, 10)})
# Colour cycle shared by the plotting helpers (indexed as colors[n]).
colors = mpl.rcParams['axes.prop_cycle'].by_key()['color']

In [None]:
# Locate the CSV datasets in the "dataset" folder inside the current working directory.
path = os.getcwd()
print(path)
# os.path.join picks the correct separator on every OS, so no per-platform variant is needed.
dataset_path = os.path.join(path, "dataset")
print(dataset_path)
# glob returns files in arbitrary order; sort so the processing order (and file_list[0]) is reproducible.
file_list = sorted(glob.glob(os.path.join(dataset_path, "*.csv")))
print(file_list)

In [None]:
# Train and evaluate one baseline model per dataset CSV and collect the test metrics in summary.csv.
# NOTE(review): this cell calls convetr_to_num, make_model, plot_metrics and plot_cm, which are
# defined in later cells; on a fresh kernel those cells must be run first.
cols = []
seis = []
outp = pd.DataFrame()
# NOTE(review): cols, seis and outp are never read in the visible notebook; kept in case other cells use them.
summary_rows = []
for file in file_list:
    # Dataset name = file name without directory or extension, independent of the OS path separator.
    dataset_name = os.path.splitext(os.path.basename(file))[0]
    # Prefix (path without ".csv") under which the plots for this dataset are saved.
    output_prefix = os.path.splitext(file)[0]

    df = pd.read_csv(file).fillna(0)
    df = convetr_to_num(df)
    # 80/20 train/test split, then 80/20 train/validation split of the remainder.
    train_df, test_df = train_test_split(df, test_size=0.2)
    train_df, val_df = train_test_split(train_df, test_size=0.2)
    # Form np arrays of labels and features.
    train_labels = np.array(train_df.pop('target'))
    bool_train_labels = train_labels != 0
    val_labels = np.array(val_df.pop('target'))
    test_labels = np.array(test_df.pop('target'))

    train_features = np.array(train_df)
    val_features = np.array(val_df)
    test_features = np.array(test_df)

    # Stop once validation AUC has not improved for 10 epochs and roll back to the best weights.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_auc',
        verbose=1,
        patience=10,
        mode='max',
        restore_best_weights=True)
    # make_model reads the input width from the global train_features set just above.
    model = make_model()

    baseline_history = model.fit(train_features,
                                 train_labels,
                                 batch_size=BATCH_SIZE,
                                 epochs=EPOCHS,
                                 callbacks=[early_stopping],
                                 validation_data=(val_features, val_labels))

    plot_metrics(baseline_history, output_prefix)

    train_predictions_baseline = model.predict(train_features, batch_size=BATCH_SIZE)
    test_predictions_baseline = model.predict(test_features, batch_size=BATCH_SIZE)

    plot_cm(test_labels, test_predictions_baseline, output_prefix + "cm")

    # One row per dataset; the values are whatever model.evaluate returns, in its order.
    baseline_results = model.evaluate(test_features, test_labels, batch_size=BATCH_SIZE, verbose=0)
    logdtemp = pd.DataFrame(baseline_results, columns=[dataset_name])
    summary_rows.append(logdtemp.T)

# Concatenate once at the end instead of growing a DataFrame inside the loop.
logd = pd.concat(summary_rows) if summary_rows else pd.DataFrame()
logd.to_csv("summary.csv")

In [None]:
# Scratch check of the dataset-name extraction: last '/'-separated component minus its 4-character suffix.
file = file_list[0]
file.rsplit('/', 1)[-1][:-4]

In [None]:
# Form np arrays of labels and features.
# The training loop already pops 'target' from these frames, so popping unconditionally
# raises KeyError when the notebook is run top to bottom. Only pop where the column is
# still present; otherwise the label arrays produced by the loop are reused.
if 'target' in train_df.columns:
    train_labels = np.array(train_df.pop('target'))
if 'target' in val_df.columns:
    val_labels = np.array(val_df.pop('target'))
if 'target' in test_df.columns:
    test_labels = np.array(test_df.pop('target'))
bool_train_labels = train_labels != 0

train_features = np.array(train_df)
val_features = np.array(val_df)
test_features = np.array(test_df)

In [None]:
# Metric objects handed to model.compile by make_model / make_model_mlp.
# The 'auc' name is what the EarlyStopping callback monitors as 'val_auc'.
METRICS = [
    metric_cls(name=metric_name)
    for metric_cls, metric_name in (
        (keras.metrics.TruePositives, 'tp'),
        (keras.metrics.FalsePositives, 'fp'),
        (keras.metrics.TrueNegatives, 'tn'),
        (keras.metrics.FalseNegatives, 'fn'),
        (keras.metrics.BinaryAccuracy, 'accuracy'),
        (keras.metrics.Precision, 'precision'),
        (keras.metrics.Recall, 'recall'),
        (keras.metrics.AUC, 'auc'),
    )
]

In [None]:
def make_model_mlp(metrics = METRICS, output_bias=None):
    """Build and compile a binary classifier with two hidden ReLU layers (32 and 64 units).

    Args:
        metrics: Keras metrics passed to ``model.compile``.
        output_bias: optional initial value for the bias of the sigmoid output
            unit; wrapped in a ``Constant`` initializer when given.

    Returns:
        A compiled ``keras.Sequential`` model (Adam, learning rate 1e-3,
        binary cross-entropy loss).

    Note:
        The input width is read from the global ``train_features``, which must
        exist before this function is called.
    """
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    model = keras.Sequential([
        keras.layers.Dense(32, activation='relu', input_shape=(train_features.shape[-1],)),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid', bias_initializer=output_bias),
    ])

    model.compile(
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by newer Keras releases.
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics)
    return model

In [None]:
def make_model(metrics = METRICS, output_bias=None):
    """Build and compile the baseline binary classifier.

    Architecture: Dense(32, relu) -> Dropout(0.1) -> Dense(1, sigmoid).

    Args:
        metrics: Keras metrics passed to ``model.compile``.
        output_bias: optional initial value for the bias of the sigmoid output
            unit; wrapped in a ``Constant`` initializer when given.

    Returns:
        A compiled ``keras.Sequential`` model (Adam, learning rate 1e-3,
        binary cross-entropy loss).

    Note:
        The input width is read from the global ``train_features``, which must
        exist before this function is called.
    """
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)
    model = keras.Sequential([
        # The unit count was written as the undefined name `t32`, which raised
        # NameError; 32 matches the first layer of make_model_mlp.
        keras.layers.Dense(32, activation='relu', input_shape=(train_features.shape[-1],)),
        keras.layers.Dropout(0.1),
        keras.layers.Dense(1, activation='sigmoid', bias_initializer=output_bias),
    ])

    model.compile(
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by newer Keras releases.
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics)
    return model

In [None]:
# Compare the joint distribution of the 'CS' and 'P' features for positive vs. negative training rows.
pos_df = pd.DataFrame(train_features[ bool_train_labels], columns = train_df.columns)
neg_df = pd.DataFrame(train_features[~bool_train_labels], columns = train_df.columns)

# x / y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.jointplot(x=pos_df['CS'], y=pos_df['P'],
              kind='hex', xlim = (-5,5), ylim = (-5,5))
plt.suptitle("Positive distribution")

sns.jointplot(x=neg_df['CS'], y=neg_df['P'],
              kind='hex', xlim = (-5,5), ylim = (-5,5))
_ = plt.suptitle("Negative distribution")

In [None]:
# Class balance of `df`, i.e. the dataset most recently loaded by the training loop.
# The two-way unpacking assumes the target holds exactly the integer classes 0 and 1.
neg, pos = np.bincount(df['target'])
total = neg + pos
print(
    'Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
        total,
        pos,
        100 * pos / total,
    )
)

In [None]:
# Log of the positive/negative odds, later passed to make_model as output_bias.
initial_bias = np.log(np.array([pos / neg]))
initial_bias

In [None]:
# Build a model whose output bias starts at initial_bias and show its predictions for the first 10 training rows.
model = make_model(output_bias=initial_bias)
model.predict(x=train_features[:10])

In [None]:
def plot_loss(history, label, n):
    """Draw the training and validation loss of one run on the current axes.

    Args:
        history: object exposing ``epoch`` and ``history['loss' / 'val_loss']``
            (as returned by ``model.fit``).
        label: run name appended to the 'Train ' / 'Val ' legend entries.
        n: index into the global ``colors`` list; both curves use this colour.
    """
    line_color = colors[n]
    epochs = history.epoch
    # Log-scaled y axis so a wide range of loss values stays readable;
    # the validation curve is dashed, the training curve solid.
    curves = (
        ('loss', 'Train ', {}),
        ('val_loss', 'Val ', {'linestyle': "--"}),
    )
    for history_key, prefix, line_style in curves:
        plt.semilogy(epochs, history.history[history_key],
                     color=line_color, label=prefix + label, **line_style)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')

    plt.legend()

In [None]:
# Train a separate model whose output bias starts at zero. Previously this cell
# trained the global `model`, which was built with output_bias=initial_bias, so the
# "Zero Bias" curve was really the careful-bias model and the following
# careful-bias run merely continued training it. Using a dedicated model keeps the
# global `model` untouched for the careful-bias run.
zero_bias_model = make_model()
zero_bias_model.layers[-1].bias.assign([0.0])
zero_bias_history = zero_bias_model.fit(
    train_features,
    train_labels,
    batch_size=BATCH_SIZE,
    epochs=20,
    validation_data=(val_features, val_labels),
    verbose=0)

In [None]:
# Train the current global `model` for 20 epochs with logging disabled and keep the history for plot_loss.
careful_bias_history = model.fit(
    x=train_features,
    y=train_labels,
    epochs=20,
    batch_size=BATCH_SIZE,
    verbose=0,
    validation_data=(val_features, val_labels),
)

In [None]:
# Overlay the loss curves of both runs; the position in the tuple selects the colour.
for n, (history, label) in enumerate((
        (zero_bias_history, "Zero Bias"),
        (careful_bias_history, "Careful Bias"))):
    plot_loss(history, label, n)

In [None]:
# Save the current weights of `model` under a fresh temporary directory so later models can load them.
initial_weights = os.path.join(
    tempfile.mkdtemp(),
    'initial_weights',
)
model.save_weights(filepath=initial_weights)

In [None]:
# Baseline run: fresh model, weights restored from `initial_weights`, trained with early stopping.
model = make_model()
model.load_weights(initial_weights)
baseline_history = model.fit(
    x=train_features,
    y=train_labels,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(val_features, val_labels),
    callbacks=[early_stopping],
)

In [None]:
def plot_metrics(history, file=None):
    """Plot loss, AUC, precision and recall (train vs. validation) in a 2x2 grid.

    Args:
        history: object exposing ``epoch`` and ``history[<metric>]`` /
            ``history['val_<metric>']`` (as returned by ``model.fit``).
        file: output path without extension. When given, the figure is saved
            as ``file + '.png'`` and closed. When omitted, the figure is left
            open so the notebook can display it inline.
    """
    metrics = ['loss', 'auc', 'precision', 'recall']
    for n, metric in enumerate(metrics):
        name = metric.replace("_", " ").capitalize()
        plt.subplot(2, 2, n + 1)
        plt.plot(history.epoch, history.history[metric], color=colors[0], label='Train')
        plt.plot(history.epoch, history.history['val_' + metric],
                 color=colors[0], linestyle="--", label='Val')
        plt.xlabel('Epoch')
        plt.ylabel(name)
        if metric == 'loss':
            # Anchor the loss axis at zero, keep the automatic upper limit.
            plt.ylim([0, plt.ylim()[1]])
        elif metric == 'auc':
            plt.ylim([0.8, 1])
        else:
            plt.ylim([0, 1])

        plt.legend()
    # `file` used to be mandatory although the notebook also calls
    # plot_metrics(history) without it; saving is now opt-in.
    if file is not None:
        plt.savefig(file + '.png')
        plt.close()

In [None]:
# plot_metrics takes the output file name (without extension) as its second argument;
# the figure is written to "baseline_metrics.png" in the working directory.
plot_metrics(baseline_history, "baseline_metrics")

In [None]:
# Predicted probabilities of the current `model` on the training and test features.
train_predictions_baseline = model.predict(x=train_features, batch_size=BATCH_SIZE)
test_predictions_baseline = model.predict(x=test_features, batch_size=BATCH_SIZE)

In [None]:
def plot_cm(labels, predictions, file=None, p=0.5):
    """Draw the confusion matrix of thresholded predictions as a heat map.

    Args:
        labels: true class labels.
        predictions: predicted scores; a score counts as positive when it is
            strictly greater than ``p``.
        file: output path without extension. When given, the figure is saved
            as ``file + '.png'`` and closed. When omitted, the figure is left
            open so the notebook can display it inline.
        p: decision threshold, also shown in the plot title.
    """
    cm = confusion_matrix(labels, predictions > p)
    plt.figure(figsize=(5, 5))
    sns.heatmap(cm, annot=True, fmt="d")
    plt.title('Confusion matrix @{:.2f}'.format(p))
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    # `file` used to be mandatory although the notebook also calls
    # plot_cm(labels, predictions) without it; saving is now opt-in.
    if file is not None:
        plt.savefig(file + '.png')
        plt.close()

In [None]:
# Evaluate the baseline model on the test split and print every reported metric.
baseline_results = model.evaluate(test_features, test_labels,
                                  batch_size=BATCH_SIZE, verbose=0)
for name, value in zip(model.metrics_names, baseline_results):
    print(name, ': ', value)
print()

# plot_cm takes the output file name (without extension) as its third argument;
# the matrix is written to "baseline_cm.png" in the working directory.
plot_cm(test_labels, test_predictions_baseline, "baseline_cm")

In [None]:
def plot_roc(name, labels, predictions, file=None, **kwargs):
    """Plot a ROC curve (in percent) on the current axes.

    Args:
        name: legend label of the curve.
        labels: true binary labels.
        predictions: predicted scores.
        file: output path without extension. When given, the figure is saved
            as ``file + '.png'`` and closed. When omitted, the figure stays
            open so several curves can be overlaid and a legend added.
        **kwargs: forwarded to ``plt.plot`` (e.g. ``color``, ``linestyle``).
    """
    fp, tp, _ = sklearn.metrics.roc_curve(labels, predictions)

    plt.plot(100*fp, 100*tp, label=name, linewidth=2, **kwargs)
    plt.xlabel('False positives [%]')
    plt.ylabel('True positives [%]')
    # Only the region FPR <= 20 %, TPR >= 80 % is shown.
    plt.xlim([-0.5, 20])
    plt.ylim([80, 100.5])
    plt.grid(True)
    ax = plt.gca()
    ax.set_aspect('equal')
    # `file` used to be mandatory, which made the overlay calls in this
    # notebook (several curves followed by plt.legend) fail; saving is now opt-in.
    if file is not None:
        plt.savefig(file + '.png')
        plt.close()

In [None]:
# When given a file name, plot_roc saves and closes its figure, so each curve is written to
# its own PNG; a shared plt.legend call afterwards would target an empty figure and is omitted.
plot_roc("Train Baseline", train_labels, train_predictions_baseline, "roc_train_baseline", color=colors[0])
plot_roc("Test Baseline", test_labels, test_predictions_baseline, "roc_test_baseline", color=colors[0], linestyle='--')

In [None]:
# Weight each class inversely to its frequency. The extra factor total/2 keeps the
# loss at a similar magnitude: the summed weight over all examples stays the same.
class_weight = {
    0: (1 / neg)*(total)/2.0,
    1: (1 / pos)*(total)/2.0,
}
weight_for_0, weight_for_1 = class_weight[0], class_weight[1]

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

In [None]:
# Same setup as the baseline run, but the loss of each example is scaled by its class weight.
weighted_model = make_model()
weighted_model.load_weights(initial_weights)

weighted_history = weighted_model.fit(
    x=train_features,
    y=train_labels,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(val_features, val_labels),
    callbacks=[early_stopping],
    # The class weights go here
    class_weight=class_weight,
)

In [None]:
# Predicted probabilities of the class-weighted model on the training and test features.
train_predictions_weighted = weighted_model.predict(x=train_features, batch_size=BATCH_SIZE)
test_predictions_weighted = weighted_model.predict(x=test_features, batch_size=BATCH_SIZE)

In [None]:
# Evaluate the class-weighted model on the test split and print every reported metric.
weighted_results = weighted_model.evaluate(test_features, test_labels,
                                           batch_size=BATCH_SIZE, verbose=0)
for name, value in zip(weighted_model.metrics_names, weighted_results):
    print(name, ': ', value)
print()

# plot_cm takes the output file name (without extension) as its third argument;
# the matrix is written to "weighted_cm.png" in the working directory.
plot_cm(test_labels, test_predictions_weighted, "weighted_cm")

In [None]:
# When given a file name, plot_roc saves and closes its figure, so each curve is written to
# its own PNG; a shared plt.legend call afterwards would target an empty figure and is omitted.
plot_roc("Train Baseline", train_labels, train_predictions_baseline, "roc_train_baseline", color=colors[0])
plot_roc("Test Baseline", test_labels, test_predictions_baseline, "roc_test_baseline", color=colors[0], linestyle='--')

plot_roc("Train Weighted", train_labels, train_predictions_weighted, "roc_train_weighted", color=colors[1])
plot_roc("Test Weighted", test_labels, test_predictions_weighted, "roc_test_weighted", color=colors[1], linestyle='--')

In [None]:
def generate_dataset(raw_df: pd.DataFrame, dataset_guide: pd.DataFrame):
    """Write one pre-processed copy of ``raw_df`` per row of ``dataset_guide``.

    Each guide row is unpacked as (index, data_num, one_hot, bucket, scal,
    syn, resam), so the guide must have exactly these six columns in this
    order. The truthy flags select which transformations are applied, always
    in the order bucket -> scal -> syn -> resam -> one_hot, and each applied
    step appends its suffix to the dataset name ("data", "data_bucket_scal", ...).

    For every row the result is written twice under the global ``dataset_path``:
    ``<name>/data.csv`` and ``<name>.csv``.

    Args:
        raw_df: source data; copied for every variant, never modified.
        dataset_guide: one row of flags per variant to generate.
    """
    for ind, data_num, one_hot, bucket, scal, syn, resam in dataset_guide.itertuples():
        name = "data"
        df = raw_df.copy()
        if bucket:
            df = df_to_bucket(df)
            name += "_bucket"
        if scal:
            df = df_to_scal(df)
            name += "_scal"
        if syn:
            df = df_to_syn(df)
            name += "_syn"
        if resam:
            df = df_to_resam(df)
            name += "_resam"
        if one_hot:
            df = df_to_one_hot(df, bucket)
            name += "_one_hot"
        # os.path.join instead of hard-coded "\\" so the files land inside
        # dataset_path on every OS (the loader globs dataset_path + "/*.csv").
        df_path = os.path.join(dataset_path, name)
        # exist_ok=True lets the function be re-run without failing on existing folders.
        os.makedirs(df_path, exist_ok=True)
        df.to_csv(os.path.join(df_path, "data.csv"), index=False)
        df.to_csv(os.path.join(dataset_path, name + ".csv"), index=False)
            

In [None]:
def df_to_one_hot(df, bucket):
    """One-hot encode the categorical (and, if bucketed, the numeric) columns.

    'amniotic_fluid' and 'Position' are always encoded; when ``bucket`` is
    truthy the NUM_COLUMNS are encoded as well. Values are cast to ``str``
    before encoding. The 'target' column is moved to the last position.

    Note:
        The input frame is modified in place ('target' and the encoded
        columns are removed from it); use the returned frame.
    """
    target = df.pop('target')
    encode_cols = ['amniotic_fluid', 'Position']
    if bucket:
        encode_cols = encode_cols + NUM_COLUMNS
    dummies = pd.get_dummies(df[encode_cols].astype(str))
    df.drop(columns=encode_cols, inplace=True)
    encoded = pd.concat([df, dummies], axis=1)
    encoded['target'] = target
    return encoded

In [None]:
def df_to_bucket(df, n_bins=5):
    """Replace every NUM_COLUMNS column by a bin label in ``0 .. n_bins - 1``.

    Each column is cut with ``pd.cut`` into ``n_bins`` bins. The frame is
    modified in place and returned, with 'target' moved to the last column.
    """
    target = df.pop('target')
    bin_labels = list(range(n_bins))
    for column_name in NUM_COLUMNS:
        df[column_name] = pd.cut(df[column_name], bins=n_bins, labels=bin_labels)
    df['target'] = target
    return df

In [None]:
def df_to_scal(df):
    """Min-max scale the NUM_COLUMNS of ``df``.

    The frame is modified in place and returned, with 'target' moved to the
    last column.
    """
    df_y = df.pop('target')
    scaler = preprocessing.MinMaxScaler()
    scaled_values = scaler.fit_transform(df[NUM_COLUMNS])
    # Assign the array positionally. Wrapping it in a new DataFrame gives it a
    # fresh 0..n-1 index, and pandas then aligns on index labels, which yields
    # NaN for every row of a frame whose index is not the default range
    # (e.g. after filtering or resampling).
    df[NUM_COLUMNS] = scaled_values
    df['target'] = df_y
    return df

In [None]:
def df_to_syn(df):
    """Balance the classes by synthesising minority-class rows with SMOTE.

    The CAT_COLUMNS are first replaced by integer codes (numbered in order of
    first appearance) because SMOTE needs numeric input. The input frame is
    modified in place; the returned frame is new and has 'target' as its last
    column.
    """
    df_y = df.pop('target')
    for col in CAT_COLUMNS:
        uniam = df[col].unique().tolist()
        val = list(range(len(uniam)))
        df[col] = df[col].replace(to_replace=uniam, value=val)
    # NOTE(review): SMOTE treats these integer codes as continuous values, so
    # synthetic rows may carry fractional category codes; consider SMOTENC.
    smote = SMOTE(sampling_strategy='minority')
    # fit_resample is the current imbalanced-learn API; the old fit_sample
    # alias has been removed from recent releases.
    X_sm, y_sm = smote.fit_resample(df, df_y)
    df = pd.concat([X_sm, y_sm], axis=1)
    return df

In [None]:
def df_to_resam(df, plot=True, random_state=None):
    """Balance the two target classes by random over-sampling of the minority class.

    Rows of the smaller class are sampled with replacement until both classes
    have the size of the larger one. Earlier, the counts were taken from
    ``value_counts()``, which is ordered by frequency, so the result was only
    balanced when class 0 happened to be the majority.

    Args:
        df: frame with a binary 'target' column (classes 0 and 1); not modified.
        plot: when True (default), draw a bar chart of the resulting class counts.
        random_state: optional seed forwarded to ``DataFrame.sample`` for
            reproducible sampling.

    Returns:
        A new frame: class-0 rows followed by class-1 rows, equal in number.

    Raises:
        ValueError: if 'target' does not contain exactly the classes 0 and 1.
    """
    # Divide by class
    df_class_0 = df[df['target'] == 0]
    df_class_1 = df[df['target'] == 1]
    if df['target'].nunique() != 2 or df_class_0.empty or df_class_1.empty:
        raise ValueError("df_to_resam expects 'target' to contain exactly the classes 0 and 1")

    # Over-sample whichever class is smaller up to the size of the larger one.
    if len(df_class_0) >= len(df_class_1):
        df_class_1 = df_class_1.sample(len(df_class_0), replace=True, random_state=random_state)
    else:
        df_class_0 = df_class_0.sample(len(df_class_1), replace=True, random_state=random_state)
    df_test_over = pd.concat([df_class_0, df_class_1], axis=0)

    print('Random over-sampling:')
    print(df_test_over.target.value_counts())

    if plot:
        df_test_over.target.value_counts().plot(kind='bar', title='Count (target)')
    return df_test_over

In [None]:
def convetr_to_num(df):
    """Make every column of ``df`` numeric, in place, and return the frame.

    * Boolean columns become 0/1 integers.
    * Other non-numeric columns are replaced by integer codes numbered in
      order of first appearance (missing values get pandas' sentinel -1).
    * Numeric columns are left untouched.

    Earlier, every column was re-coded by order of appearance, including
    numeric features and the target. That scrambled numeric magnitudes and
    flipped the 0/1 labels whenever the first row happened to be a positive.
    """
    for col in df.columns:
        column = df[col]
        # Bool must be tested first: pandas also reports it as a numeric dtype.
        if pd.api.types.is_bool_dtype(column):
            df[col] = column.astype(int)
        elif not pd.api.types.is_numeric_dtype(column):
            codes, _ = pd.factorize(column)
            df[col] = codes
    return df
        

In [None]:
def find_correlations(df, feat_amount=10, label='target'):
    """Return the features most strongly correlated with ``label``.

    Args:
        df: frame holding the features and the label column.
        feat_amount: number of features to return.
        label: name of the label column.

    Returns:
        A Series of absolute correlation coefficients indexed by feature
        name, sorted in descending order, with at most ``feat_amount``
        entries. The label itself is never included.

    Raises:
        KeyError: if ``label`` is not a numeric/boolean column of ``df``.
    """
    # Restrict to numeric/bool columns: DataFrame.corr raises on string
    # columns in recent pandas versions.
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    # Drop the label explicitly instead of assuming it sorts first; the old
    # [1:feat_amount] slice returned one feature too few and could keep the
    # label when another column was perfectly correlated with it.
    result_df_corr = numeric_df.corr()[label].abs().drop(labels=label)
    print(result_df_corr)
    result_df_corr = result_df_corr.sort_values(ascending=False)
    return result_df_corr.head(feat_amount)

In [None]:
def find_K_Best(_x, y, K=10):
    """Score the non-object columns of ``_x`` against ``y`` with a chi-squared test.

    Args:
        _x: feature frame; columns of dtype ``object`` are ignored.
        y: target values.
        K: number of top-scoring features to keep.

    Returns:
        A DataFrame with the columns 'Featues' and 'Score' holding the ``K``
        highest-scoring features (also printed).
    """
    usable_columns = [name for name in _x.columns if _x[name].dtype != 'object']
    x = _x[usable_columns].copy()
    # Fit SelectKBest to obtain one chi-squared score per remaining column.
    selector = SelectKBest(score_func=chi2, k=K).fit(x, y)
    # Put names and scores side by side so they can be ranked together.
    score_table = pd.concat(
        [pd.DataFrame(x.columns), pd.DataFrame(selector.scores_)],
        axis=1,
    )
    score_table.columns = ['Featues', 'Score']
    top_features = score_table.nlargest(K, 'Score')
    print(top_features)
    return top_features