In [1]:
!pip install scikit-plot

Collecting scikit-plot
  Downloading https://files.pythonhosted.org/packages/7c/47/32520e259340c140a4ad27c1b97050dd3254fdc517b1d59974d47037510e/scikit_plot-0.3.7-py3-none-any.whl
Installing collected packages: scikit-plot
Successfully installed scikit-plot-0.3.7


In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from keras import layers
from keras import models
from keras import utils
from keras.layers import Dense
from keras.models import Sequential
from keras.layers import Flatten,Dense,Activation,Embedding,LeakyReLU,BatchNormalization,Dropout
from keras.activations import relu,sigmoid,elu,softmax,tanh,softplus
from keras.regularizers import l2
from keras.optimizers import SGD
from keras.optimizers import RMSprop
from keras import datasets
from keras.callbacks import LearningRateScheduler
from keras.callbacks import History
from keras import losses
from keras import optimizers
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
import seaborn as sns
from sklearn.model_selection import cross_val_score
import scikitplot as skplt

# Reading the data and dropping some unneeded columns

In [3]:
def read():
  df=pd.read_csv('/content/drive/MyDrive/ANN/PS_20174392719_1491204439457_log.csv')
  # list of columns whose data type is object i.e. string
  listOfColumnNames = list((df.dtypes[df.dtypes == np.object]).index)
  #filteredColumns = df.dtypes[df.dtypes == np.object]
  #listOfColumnNames = list(filteredColumns.index)
  #Removing the Name and Surname Columns (You can add more redundant parameters)
  if 'nameOrig' in listOfColumnNames:
    df=df.drop(['nameOrig'],axis=1)
  if 'nameDest' in listOfColumnNames:
    df=df.drop(['nameDest'],axis=1)
  if 'type' in listOfColumnNames:
    df=df.drop(['type'],axis=1)
  if 'nameOrig' in listOfColumnNames:
    listOfColumnNames.remove('nameOrig')
  if 'nameDest' in listOfColumnNames:
    listOfColumnNames.remove('nameDest')
  if 'type' in listOfColumnNames:
    listOfColumnNames.remove('type')
  #Replacing the object type values with classified values (Performance can be improved)
  for i in range(len(listOfColumnNames)):
    s=set(df[listOfColumnNames[i]])
    s=list(s)
    for j in range(len(s)):
      df=df.replace(to_replace=s[j],value=(j+1))
  #Done with data shaping
  return df


# Divide dataframe to X and Y

In [4]:
def divide(path,divider):
  #Division of X and Y
  x=path.drop(divider,axis=1)
  type(divider)
  y=path[divider]
  return x,y

# Splitting to train and test data with split ratio 4:1. Also, dividing Y into categories 0 and 1

In [5]:
# Splitting the dataset into the Training set and Test set
def splitter(x,y):
  from keras.utils import to_categorical
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
  # Feature Scaling
  sc = StandardScaler()
  x_train = sc.fit_transform(x_train)
  x_test = sc.transform(x_test)
  y_train = to_categorical(y_train)
  return x_train,y_train,x_test,y_test

# Making dataset uniform

In [6]:
def inputter(df):
  print("Enter the column you wish to use as  result")
  print("Your options are:",df.columns)
  divider=input()
  #We need to preprocess data a bit more if data is biased towards a particular outcome
  nonf=df[df[divider]==0]
  fra=df[df[divider]==1]
  #I have just taken 5. The value can change
  if nonf.shape>=(5*fra.shape): 
    nonf=nonf.sample(fra.shape[0])
    #print(nonf.shape)
    df=fra.append(nonf,ignore_index=True)
  if fra.shape>=(5*nonf.shape):
    fra=fra.sample(nonf.shape[0])
    #print(fra.shape)
    df=nonf.append(fra,ignore_index=True)
  return df,divider

# Plotting Accuracy and Loss Curve

In [7]:
def plotter(model,epoch):
  Epoch=list()
  for i in range(epoch):
    Epoch.append(i)
  #Summarize history for accuracy
  import plotly.graph_objects as go
  fig = go.Figure()
  fig.add_trace(go.Scatter(x=Epoch, y=model.history['accuracy'],
                           mode='lines',
                           name='Test',
                           line=dict(color='firebrick', width=4)))
  fig.add_trace(go.Scatter(x=Epoch, y=model.history['val_accuracy'],
                           mode='lines',
                           name='Train',
                           line=dict(color='blue', width=4)))

  fig.update_layout(
      title="Accuracy Curve",
      xaxis_title="Epoch",
      yaxis_title="Accuracy",
      font=dict(
          family="Courier New, monospace",
          size=18,
          color="#7f7f7f"
          )
      )
  fig.show()

  #Summarize history for loss
  fig = go.Figure()
  fig.add_trace(go.Scatter(x=Epoch, y=model.history['loss'],
                           mode='lines',
                           name='Test',
                           line=dict(color='firebrick', width=4)))
  fig.add_trace(go.Scatter(x=Epoch, y=model.history['val_loss'],
                           mode='lines',
                           name='Train',
                           line=dict(color='blue', width=4)))

  fig.update_layout(
      title="Loss Curve",
      xaxis_title="Epoch",
      yaxis_title="Loss",
      font=dict(
          family="Courier New, monospace",
          size=18,
          color="#7f7f7f"
          )
      )
  fig.show()


# Plotting confusion matrix, lift curve, etc.

In [8]:
def crossmat(m,x_test,y_test):
  y_pred = m.predict(x_test)
  # print(y_pred[:,1])
  from sklearn.metrics import roc_curve
  from sklearn.metrics import roc_auc_score
  yhat_prob = y_pred[:, 1]
  # print(yhat_prob)
  #ROC-AUC Plot
  ns_probs = [0 for _ in range(len(y_test))]
  ns_auc = roc_auc_score(y_test, ns_probs)
  ann_auc = roc_auc_score(y_test, yhat_prob)
  print('ANN: ROC AUC=%.3f' % (ann_auc))
  ann_fpr, ann_tpr, _ = roc_curve(y_test, yhat_prob)
  ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
  lift(ann_fpr,ann_tpr,ns_fpr,ns_tpr,ann_auc)
  #GINI
  GINI = (2 * ann_auc) - 1
  print('GINI=',GINI)
  #KS- Statistic plot
  plot_ks_statistic(y_test, y_pred)
  #Lift Curve
  plot_lift_curve(y_test,y_pred)
  #Confusion matrix
  from sklearn.metrics import confusion_matrix
  threshold=float(input("Enter threshold value"))
  ycon=y_pred
  ycon[ycon > threshold] = 1
  ycon[ycon <= threshold] = 0
  cnf = confusion_matrix(y_test,ycon[:,1], labels=[1,0])
  confusionmatrix(cnf) 

# ROC Curve

In [9]:
def lift(ann_fpr,ann_tpr,ns_fpr,ns_tpr,ann_auc):
  import plotly.graph_objects as go
  fig = go.Figure()
  fig.add_trace(go.Scatter(x=ann_fpr, y=ann_tpr,
                           mode='lines',
                           name='ANN',
                           line=dict(color='firebrick', width=4)))
  fig.add_trace(go.Scatter(x=ns_fpr, y=ns_tpr,
                           mode='lines',
                           name='No Skill',
                           line=dict(color='blue', width=4,dash='dash')))

  fig.add_annotation(x=max(ns_fpr),y=0,showarrow=False,text="AUC : "+str(ann_auc))

  fig.update_layout(
      title="AUC Curve",
      xaxis_title="False Positive Rate",
      yaxis_title="True Positive Rate",
      font=dict(
          family="Courier New, monospace",
          size=18,
          color="#7f7f7f"
          )
      )
  fig.show()

# Confusion Matrix with plotly

In [10]:
def confusionmatrix(cnf):
  #Udit is adding slider
  confusion_matrix = cnf.astype(int)
  x=['Positive','Negative']
  y=['Positive','Negative']
  z_text = [[str(y) for y in x] for x in cnf]
  import plotly.figure_factory as ff

  fig = ff.create_annotated_heatmap(cnf, x=x, y=y, annotation_text=z_text, colorscale=[
                                                                                       [0, "rgb(230, 248, 255)"],
                                                                                       [0.1, "rgb(230, 248, 255)"],

                                                                                       [0.1, "rgb(179, 233, 255)"],
                                                                                       [0.2, "rgb(179, 233, 255)"],

                                                                                       [0.2, "rgb(128, 219, 255)"],
                                                                                       [0.3, "rgb(128, 219, 255)"],

                                                                                       [0.3, "rgb(77, 204, 255)"],
                                                                                       [0.4, "rgb(77, 204, 255)"],

                                                                                       [0.4, "rgb(26, 190, 255)"],
                                                                                       [0.5, "rgb(26, 190, 255)"],

                                                                                       [0.5, "rgb(0, 164, 230)"],
                                                                                       [0.6, "rgb(0, 164, 230)"],

                                                                                       [0.6, "rgb(0, 128, 179)"],
                                                                                       [0.7, "rgb(0, 128, 179)"],

                                                                                       [0.7, "rgb(0, 91, 128)"],
                                                                                       [0.8, "rgb(0, 91, 128)"],

                   
                                                                                       [0.8, "rgb(0, 73, 102)"],
                                                                                       [0.9, "rgb(0, 73, 102)"],

                                                                                       [0.9, "rgb(0, 55, 77)"],
                                                                                       [1.0, "rgb(0, 55, 77)"]
                                                                                       ])


  fig['data'][0]['showscale'] = True


  fig.update_layout(title="Confusion Matrix",yaxis_title="True Value",xaxis_title="Predicted Value",font=dict(family="Courier New, monospace",size=18,color="#7f7f7f"))

  fig.show()
  TP = confusion_matrix[1, 1]
  TN = confusion_matrix[0, 0]
  FP = confusion_matrix[0, 1]
  FN = confusion_matrix[1, 0]
  a=cnf.shape
  #Calculating false positives
  corrPred=0
  falsePred=0
  for row in range(a[0]):
    for c in range(a[1]):
      if row == c:
        corrPred +=confusion_matrix[row,c]
      else:
        falsePred += confusion_matrix[row,c]
  print('Correct predictions: ', corrPred)
  print('False predictions', falsePred)
  # Calculate Sensitivity
  Sensitivity=TP/(TP+FN)
  #Calculate Specicivity
  Specicivity=TN/(TN+FP)
  print("Sensitivity=",Sensitivity)
  print("Specicivity=",Specicivity)  

# Plotting cumulative gain curve

In [11]:
from scikitplot.helpers import cumulative_gain_curve
import plotly.graph_objects as go
def plot_lift_curve(y_true, y_probas):

    y_true = np.array(y_true)
    y_probas = np.array(y_probas)

    classes = np.unique(y_true)
    if len(classes) != 2:
        raise ValueError('Cannot calculate Lift Curve for data with '
                         '{} category/ies'.format(len(classes)))

    # Compute Cumulative Gain Curves
    percentages, gains1 = cumulative_gain_curve(y_true, y_probas[:, 0],
                                                classes[0])
    percentages, gains2 = cumulative_gain_curve(y_true, y_probas[:, 1],
                                                classes[1])

    percentages = percentages[1:]
    gains1 = gains1[1:]
    gains2 = gains2[1:]

    gains1 = gains1 / percentages
    gains2 = gains2 / percentages

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=percentages, y=gains1,
                    mode='lines',
                    name='Class 0',
                    line=dict(color='blue', width=4)))
    fig.add_trace(go.Scatter(x=percentages, y=gains2,
                    mode='lines',
                    name='Class 1',
                    line=dict(color='orange', width=4)))
    fig.add_trace(go.Scatter(x=[0,1], y=[1,1],
                    mode='lines',
                    name='Baseline',
                    line=dict(color='black', width=4,dash = 'dash')))
    fig.update_layout(
    title="Lift Curve",
    yaxis_title="Lift",
    xaxis_title="Percentage of sample",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="#7f7f7f"))
    fig.show()

# Plotting KS statistic curve

In [12]:
from scikitplot.helpers import binary_ks_curve
import plotly.graph_objects as go
def plot_ks_statistic(y_true, y_probas, title='KS Statistic Plot'):
    y_true = np.array(y_true)
    y_probas = np.array(y_probas)

    classes = np.unique(y_true)
    if len(classes) != 2:
        raise ValueError('Cannot calculate KS statistic for data with '
                         '{} category/ies'.format(len(classes)))
    probas = y_probas


    thresholds, pct1, pct2, ks_statistic, \
        max_distance_at, classes = binary_ks_curve(y_true,
                                                   probas[:, 1].ravel())

    fig = go.Figure()
    idx = np.where(thresholds == max_distance_at)[0][0]
    fig.add_shape(
       
        dict(
            type="line",
            x0=max_distance_at,
            y0=pct1[idx],
            x1=max_distance_at,
            y1=pct2[idx],
            line=dict(
                color="black",
                width=3,
                dash="dot",
            )
))
    fig.add_trace(go.Scatter(x=thresholds, y=pct2,
                    mode='lines',
                    name='Class 1',
                    line=dict(color='blue', width=4)))
    fig.add_trace(go.Scatter(x=thresholds, y=pct1,
                    mode='lines',
                    name='Class 0',
                    line=dict(color='orange', width=4)))
    fig.add_annotation(
            x=1,
            y=0,
            showarrow=False,
            text='KS Statistic : {:.3f} at {:.3f}'.format(ks_statistic,
                                                            max_distance_at))
    
    fig.update_layout(
    title="KS Statistic",
    yaxis_title="Percentage below threshold",
    xaxis_title="threshold",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="#7f7f7f"
    )
)
    fig.show()

# User customized ANN Model

In [13]:
def binary_class(x_train,nodes,activation,n):
  #Creating customized ANN Model
  model=Sequential()
  for i in range(len(nodes)):
    if(i==0):
      if(activation=='sigmoid'):
        model.add(Dense(units = nodes[i], kernel_initializer = 'glorot_uniform',activation='sigmoid',input_dim = len(x_train[1])))
      if(activation=='relu'):
        model.add(Dense(units = nodes[i], kernel_initializer = 'he_uniform',activation='relu',input_dim = len(x_train[1])))
      if(activation=='tanh'):
        model.add(Dense(units = nodes[i], kernel_initializer = 'glorot_normal',activation='tanh',input_dim = len(x_train[1])))
      if(activation=='softmax'):
        model.add(Dense(units = nodes[i], kernel_initializer = 'glorot_normal',activation='softmax',input_dim = len(x_train[1])))
      if(activation== 'elu'):
        model.add(Dense(units = nodes[i], kernel_initializer = 'he_normal',activation='elu',input_dim = len(x_train[1])))
      if(activation=='softplus'):
        model.add(Dense(units = nodes[i], kernel_initializer = 'he_normal',activation='softplus',input_dim = len(x_train[1])))
    else:
      if(activation=='sigmoid'):
        model.add(Dense(units = nodes[i], kernel_initializer = 'glorot_uniform',activation='sigmoid'))
      if(activation=='relu'):
        model.add(Dense(units = nodes[i], kernel_initializer = 'he_uniform',activation='relu'))
      if(activation=='tanh'):
        model.add(Dense(units = nodes[i], kernel_initializer = 'glorot_normal',activation='tanh'))
      if(activation=='softmax'):
        model.add(Dense(units = nodes[i], kernel_initializer = 'glorot_uniform',activation='softmax'))
      if(activation=='elu'):
        model.add(Dense(units = nodes[i], kernel_initializer = 'he_normal',activation='elu'))
      if(activation=='softplus'):
        model.add(Dense(units = nodes[i], kernel_initializer = 'he_normal',activation='softplus'))
    model.add(Dropout(n))
  #Adding output layer
  model.add(Dense(units=2, kernel_initializer = 'uniform',activation='sigmoid'))
  return model

# Optimizer for customized model

In [14]:
def optibin(model,opt,x_train,y_train,spl,bs,epochs,x_test,y_test):
  #Choosing the proper optimizer to use
  if(opt=='sgd'):
    print("Enter Momentum:")
    mom=float(input())
    lr=float(input("Enter value of Learning rate:"))
    opti=keras.optimizers.SGD(learning_rate=lr, momentum=mom, nesterov=False)
  if(opt=='Adam'):
    lr=float(input("Enter value of Learning rate:"))
    opti=keras.optimizers.Adam(learning_rate=lr)
  if(opt=='Adamax'):
    lr=float(input("Enter value of Learning rate:"))
    beta_1=float(input("Enter value of beta 1 (Generally close to 1)"))
    beta_2=float(input("Enter value of beta 2 (Generally close to 1)"))
    opti=keras.optimizers.Adamax(learning_rate=lr, beta_1=beta_1, beta_2=beta_2)
  if(opt=='Nadam'):
    lr=float(input("Enter value of Learning rate:"))
    beta_1=float(input("Enter value of beta 1 (Generally close to 1)"))
    beta_2=float(input("Enter value of beta 2 (Generally close to 1)"))
    opti=keras.optimizers.Nadam(learning_rate=lr, beta_1=beta_1, beta_2=beta_2)
  if(opt=='RMSprop'):
    lr=float(input("Enter value of Learning rate:"))
    opti=keras.optimizers.RMSprop(learning_rate=lr)
  if(opt=='Adagrad'):
    lr=float(input("Enter value of Learning rate:"))
    opti=keras.optimizers.Adagrad(learning_rate=lr)
  model.compile(optimizer = opti, loss = 'binary_crossentropy', metrics = ['accuracy'])
  print(model.summary())
  model_history=model.fit(x_train, y_train,validation_split=spl, batch_size = bs,epochs = epochs)
  return model_history, model

# Main calling function

In [19]:
def main():
  seed = 7
  np.random.seed(seed)
  df=read()
  while True:
    print("Ready to work?(Y/N)")
    work=input()
    if(work=='N'):
      break
    df,divider=inputter(df)
    #Dividing dataset to x and y
    x,y=divide(df,divider)
    #Scaling the dataset and division of training and test dataset
    x_train,y_train,x_test,y_test=splitter(x,y)
    print("Number of layers you want") #Between 2 to 9
    nl=int(input())
    nodes=list()
    for i in range(nl):
      nodes.append(int(input("Enter number of neurons in layer: ")))
    epochs=int(input("Enter the number of epochs you want"))
    bs=int(input("Enter the batch size you want"))
    spl=int(input("Enter the percentage of data you want to use for validation"))
    spl=spl*0.01
    print("Choose the activation function for your layers")
    print("1. sigmoid")
    print("2. relu (Recommended. Most worked activation function in industry)")
    print("3. tanh")
    print("4. softmax")
    print("5. elu")
    print("6. softplus")
    activation=input()
    n=int(input("Enter percentage of dropout"))
    n=n*0.01
    model=binary_class(x_train,nodes,activation,n)
    print("Enter optimizer:")
    print("1. sgd")
    print("2. Adam (Hyperparameter tuning shows generally best)")
    print( "3. Adamax")
    print("4. Nadam")
    print("5. Adagrad")
    print("6. RMSprop")
    opt=input()
    m1,m=optibin(model,opt,x_train,y_train,spl,bs,epochs,x_test,y_test)
    print(m1.history.keys())
    plotter(m1,epochs) #Accuracy and Loss Curve
    crossmat(m,x_test,y_test)
  print("Thank You!")

# Checking imbalances for target column

In [16]:
def count_plot(df,fet):
    from plotly.offline import iplot
    import plotly.graph_objs as go
    trace = go.Bar(
                    x=df[fet].value_counts().keys().tolist(),
                    y=df[fet].value_counts().tolist(),
                    marker=dict(color='rgba(171, 50, 96, 0.6)')
                    )
    data = [trace]
    layout = go.Layout(
                   barmode='overlay',
                   title="COUNT PLOT : {} ".format(fet),
                   xaxis=dict(title=fet),
                   yaxis=dict( title='COUNT')
                   )
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

In [17]:
import pandas as pd
import plotly.express as px
df1=pd.read_csv('/content/drive/MyDrive/ANN/PS_20174392719_1491204439457_log.csv')
count_plot(df1,'isFraud')

In [21]:
main()

Ready to work?(Y/N)
Y
Enter the column you wish to use as  result
Your options are: Index(['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest',
       'newbalanceDest', 'isFraud', 'isFlaggedFraud'],
      dtype='object')
isFraud
Number of layers you want
3
Enter number of neurons in layer: 3
Enter number of neurons in layer: 10
Enter number of neurons in layer: 20
Enter the number of epochs you want15
Enter the batch size you want16
Enter the percentage of data you want to use for validation20
Choose the activation function for your layers
1. sigmoid
2. relu (Recommended. Most worked activation function in industry)
3. tanh
4. softmax
5. elu
6. softplus
relu
Enter percentage of dropout0
Enter optimizer:
1. sgd
2. Adam (Hyperparameter tuning shows generally best)
3. Adamax
4. Nadam
5. Adagrad
6. RMSprop
Adam
Enter value of Learning rate:0.001
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape 

ANN: ROC AUC=0.958


GINI= 0.9154644942354999


Enter threshold value0.8


Correct predictions:  2824
False predictions 462
Sensitivity= 0.9770531400966184
Specicivity= 0.7398773006134969
Ready to work?(Y/N)
N
Thank You!
