<a href="https://colab.research.google.com/github/SiddharthaPand4/DataExploration/blob/master/NovartisChallengeDataExploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow import keras

import os
import tempfile

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
import zipfile

# Unpack the archive into ./input/; the CSV files read by later cells are
# expected under ./input/Dataset/ (presumably produced by this extraction).
with zipfile.ZipFile("./input/dataset.zip", mode='r') as archive:
    archive.extractall(path="./input/")

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
data = pd.read_csv("./input/Dataset/Train.csv")
data.head()

In [None]:
# Class balance of the MULTIPLE_OFFENSE column: rows equal to 0 vs. rows equal to 1.
neg = data["MULTIPLE_OFFENSE"].eq(0).sum()
pos = data["MULTIPLE_OFFENSE"].eq(1).sum()

# Overall frame shape first, then the per-class counts.
print(data.shape)
print("Negative: {}".format(neg))
print("Positive: {}".format(pos))

In [None]:
# Percentage of rows labelled 0 among all rows counted as 0 or 1.
neg_percent = neg/(neg+pos)*100
print(neg_percent)

In [None]:
# Summary statistics of the training frame (pandas `describe` defaults).
data.describe()

In [None]:
# Working frame: the training data without the DATE and INCIDENT_ID columns.
# `drop` returns a new DataFrame, so `data` itself is left unchanged.
dataset = data.drop(columns=["DATE", "INCIDENT_ID"])
dataset.head()

In [None]:
# Fixed seed so the train/validation split (and everything derived from it)
# is the same on every Restart & Run All.
SPLIT_SEED = 42

# Hold out 20% of the rows for validation. Stratifying on the target keeps the
# class ratio of both parts equal to that of the full frame, which matters
# because the two classes are not evenly represented.
train_df, val_df = train_test_split(
    dataset,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=dataset['MULTIPLE_OFFENSE'],
)

# Separate the target from the features. `pop` removes the column in place,
# so train_df / val_df contain only feature columns from here on.
train_labels = np.array(train_df.pop('MULTIPLE_OFFENSE'))
bool_train_labels = train_labels != 0  # boolean mask of positive training rows
val_labels = np.array(val_df.pop('MULTIPLE_OFFENSE'))

# Plain NumPy matrices for the scaler and the model.
train_features = np.array(train_df)
val_features = np.array(val_df)

In [None]:
def plot_hist(df, i):
    """Draw a histogram of column ``X_<i>`` and print that column's unique values.

    Args:
        df: Object indexable by column name whose columns provide ``hist()``
            and ``unique()`` (a pandas DataFrame in this notebook).
        i: Value substituted into the column name pattern ``"X_{}"``.

    Returns:
        None. The histogram is drawn and the unique values are printed.
    """
    column_name = "X_{}".format(i)
    df[column_name].hist()
    print(df[column_name].unique())

In [None]:
# Distribution of X_12 restricted to rows where MULTIPLE_OFFENSE equals 1.
plot_hist(dataset.loc[dataset["MULTIPLE_OFFENSE"] == 1], 12)

In [None]:
# Shape of the sub-frame whose X_12 value is missing (first entry = number of such rows).
dataset.loc[dataset["X_12"].isna()].shape

In [None]:
# Preview the first rows of the working frame.
dataset.head()

In [None]:
# Read the test CSV into a DataFrame and preview its first rows.
test_df = pd.read_csv("./input/Dataset/Test.csv")
test_df.head()

In [None]:
# Remove the DATE and INCIDENT_ID columns, mirroring what was done for the
# training frame. `drop(..., errors="ignore")` with reassignment leaves
# `test_df` in the same end state as two in-place `pop` calls, but the cell
# can be re-run without raising KeyError once the columns are already gone.
test_df = test_df.drop(columns=["DATE", "INCIDENT_ID"], errors="ignore")

# Plain NumPy matrix of the remaining columns for the scaler and the model.
test_features = np.array(test_df)

In [None]:
# Standardise the features with statistics fitted on the training split only,
# then clamp every scaled value to the range [-5, 5].
# NOTE(review): StandardScaler and np.clip both pass NaN through; if X_12 has
# missing values (see the null check earlier) they will still be present in
# these arrays -- confirm they are handled before training.
# NOTE(review): this cell overwrites its own inputs, so running it twice
# scales already-scaled data; re-run the split cell first if it must be repeated.
scaler = StandardScaler()
train_features = np.clip(scaler.fit_transform(train_features), -5, 5)
val_features = np.clip(scaler.transform(val_features), -5, 5)
test_features = np.clip(scaler.transform(test_features), -5, 5)

# Shapes of the label vectors. No label array is built for the test frame in
# the cells shown here, so there is no test-label shape to report.
print('Training labels shape:', train_labels.shape)
print('Validation labels shape:', val_labels.shape)

# Shapes of the three feature matrices.
print('Training features shape:', train_features.shape)
print('Validation features shape:', val_features.shape)
print('Test features shape:', test_features.shape)

In [None]:
# Rebuild labelled DataFrames from the scaled training matrix, one per class,
# reusing the feature column names of train_df.
pos_df = pd.DataFrame(train_features[ bool_train_labels], columns = train_df.columns)
neg_df = pd.DataFrame(train_features[~bool_train_labels], columns = train_df.columns)

# Joint distribution of X_3 vs. X_4 for each class. The axis limits match the
# [-5, 5] clipping range applied to the scaled features.
# x / y are passed by keyword: recent seaborn releases no longer accept them
# as positional arguments, while the keyword form works on old and new versions.
sns.jointplot(data=pos_df, x='X_3', y='X_4',
              kind='hex', xlim = (-5,5), ylim = (-5,5))
plt.suptitle("Positive distribution")

sns.jointplot(data=neg_df, x='X_3', y='X_4',
              kind='hex', xlim = (-5,5), ylim = (-5,5))
# Assign to `_` so the returned Text object is not echoed as cell output.
_ = plt.suptitle("Negative distribution")

In [None]:
# Metrics reported during training and evaluation. The `name` of each metric
# is the key under which Keras logs it (validation values get a `val_` prefix).
METRICS = [
    # Raw confusion-matrix counts.
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    # Rates derived from predictions.
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
]

def make_model(metrics = METRICS, output_bias=None):
  """Build and compile the binary classifier used in this notebook.

  The network is a single 16-unit ReLU hidden layer, 50% dropout, and one
  sigmoid output unit. The input width is taken from the module-level
  ``train_features`` array, which must exist before this is called.

  Args:
    metrics: Keras metrics passed to ``compile``. Defaults to the
      module-level ``METRICS`` list.
    output_bias: Optional constant used to initialise the bias of the output
      layer. When ``None``, the Dense layer's own default bias initialiser
      is used.

  Returns:
    The compiled ``keras.Sequential`` model.
  """
  # Only override the bias initialiser when a value was supplied; otherwise
  # leave Dense's default ('zeros') in place instead of passing None through.
  output_layer_kwargs = {}
  if output_bias is not None:
    output_layer_kwargs['bias_initializer'] = tf.keras.initializers.Constant(output_bias)

  model = keras.Sequential([
      keras.layers.Dense(
          16, activation='relu',
          input_shape=(train_features.shape[-1],)),
      keras.layers.Dropout(0.5),
      keras.layers.Dense(1, activation='sigmoid',
                         **output_layer_kwargs),
  ])

  model.compile(
      # `learning_rate` is the supported argument name; the old `lr` alias is
      # deprecated and rejected by current Keras releases.
      optimizer=keras.optimizers.Adam(learning_rate=1e-3),
      loss=keras.losses.BinaryCrossentropy(),
      metrics=metrics)

  return model

In [None]:
# Training budget: upper bound on epochs and the mini-batch size.
EPOCHS = 100
BATCH_SIZE = 2048

# Watch the validation AUC (logged as 'val_auc' because the AUC metric is
# named 'auc'); higher is better, so mode is 'max'. Training stops after 10
# epochs without improvement and the best weights seen are restored.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
# Build the model with the default metrics and no output-bias override,
# then print its layer summary.
model = make_model()
model.summary()