<a href="https://colab.research.google.com/github/SiddharthaPand4/DataExploration/blob/master/NovartisChallengeSolution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

import os
import zipfile

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn import tree

In [None]:
# Path of the dataset archive (a zip file path, despite the "_DIR" suffix).
DATA_ZIP_DIR = "/tmp/dataset.zip"
# Directory the archive is extracted into.
DATASET_DIR = "/tmp/"

In [None]:
# Unpack every member of the dataset archive into DATASET_DIR.
with zipfile.ZipFile(DATA_ZIP_DIR, 'r') as zip_ref:
    zip_ref.extractall(DATASET_DIR)

In [None]:
# CSV files read below as the training and test sets. These are file paths
# (despite the "_DIR_PATH" suffix); presumably they are produced by extracting
# the dataset archive.
TRAIN_DIR_PATH = "/tmp/Dataset/Train.csv"
TEST_DIR_PATH = "/tmp/Dataset/Test.csv"

In [None]:
# Load the training set and shuffle its rows.
data = pd.read_csv(TRAIN_DIR_PATH)
# DataFrame.sample returns a new frame, so the shuffled result has to be
# assigned back; calling it as a bare statement leaves the rows in file order.
# The index is reset so it matches the new row order. Pass random_state to
# sample() if a reproducible shuffle is needed.
data = data.sample(frac=1).reset_index(drop=True)
# Load the test set (left in file order).
test_data = pd.read_csv(TEST_DIR_PATH)

In [None]:
# Preview the first rows of the training data.
data.head()

In [None]:
# Preview the first rows of the test data.
test_data.head()

In [None]:
# Summary statistics for the training data.
data.describe()

In [None]:
# Per-column count of missing values: training set first, then test set.
train_missing = data.isna().sum()
test_missing = test_data.isna().sum()
print(train_missing, test_missing, sep="\n")

In [None]:
# Frequency of each X_12 value in the training and test sets.
# Both results are printed explicitly: written as bare expressions, only the
# last one in a notebook cell is displayed, so the training-set counts were
# silently dropped.
print(data.X_12.value_counts())
print(test_data.X_12.value_counts())

In [None]:
# Replace missing X_12 values with the sentinel 9999 in both sets.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on the attribute-accessed Series: that chained in-place
# form acts on an intermediate object and does not update the parent frame
# under pandas Copy-on-Write.
data["X_12"] = data["X_12"].fillna(9999)
test_data["X_12"] = test_data["X_12"].fillna(9999)

In [None]:
# For each feature column X_1..X_15 print, one value per line: the number of
# distinct values shared by train and test, the distinct count in train, the
# distinct count in test, followed by a blank separator line.
for i in range(1, 16):
    column = f"X_{i}"
    train_unique = data[column].unique()
    test_unique = test_data[column].unique()
    intersection = len(set(train_unique).intersection(test_unique))
    data_len = len(train_unique)
    test_len = len(test_unique)
    print(intersection, data_len, test_len, "", sep="\n")

In [None]:
def remove_faltu_features(df):
    """Return a new frame without the ``INCIDENT_ID`` and ``DATE`` columns.

    The input frame is left untouched; a ``KeyError`` is raised if either
    column is absent.
    """
    return df.drop(columns=["INCIDENT_ID", "DATE"])

In [None]:
def preprocess_data(df, is_train=True):
    """Convert a raw frame into a feature array and (optionally) a label array.

    Args:
        df: Frame containing the feature columns plus ``INCIDENT_ID`` and
            ``DATE``; when ``is_train`` is true it must also contain the
            ``MULTIPLE_OFFENSE`` label column.
        is_train: Whether to extract the ``MULTIPLE_OFFENSE`` labels.

    Returns:
        A ``(features, y)`` tuple of NumPy arrays. ``y`` is ``None`` when
        ``is_train`` is false.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    y = None
    if is_train:
        # Read the labels and drop the column on a new frame rather than
        # popping it: DataFrame.pop removes the column from the caller's
        # frame in place, which made a second call on the same frame fail.
        y = np.array(df["MULTIPLE_OFFENSE"])
        df = df.drop(columns=["MULTIPLE_OFFENSE"])
    features = np.array(remove_faltu_features(df))
    return features, y

In [None]:
# Training features with labels, and label-free test features.
features, y = preprocess_data(data)
test_features, _ = preprocess_data(test_data, is_train=False)

# Report the array shapes, one per line.
print(features.shape, test_features.shape, y.shape, sep="\n")

In [None]:
# Default fraction of rows assigned to the training part.
split_ratio = .8


def split_data(features, labels, ratio=None):
    """Split features and labels positionally into train and validation parts.

    The first ``ratio`` fraction of rows forms the training part and the
    remaining rows form the validation part. No shuffling happens here.

    Args:
        features: Array with samples along its first axis (must expose
            ``.shape`` and support slicing).
        labels: Sliceable sequence of labels aligned with ``features``.
        ratio: Fraction of rows used for training, in ``[0, 1]``. When
            ``None`` the module-level ``split_ratio`` is read at call time.

    Returns:
        ``(X_train, y_train, X_val, y_val)``.

    Raises:
        ValueError: If ``ratio`` is outside ``[0, 1]`` or the number of
            labels differs from the number of feature rows.
    """
    if ratio is None:
        ratio = split_ratio
    if not 0 <= ratio <= 1:
        raise ValueError(f"ratio must be within [0, 1], got {ratio!r}")

    n_samples = features.shape[0]
    if len(labels) != n_samples:
        raise ValueError(
            f"features has {n_samples} rows but labels has {len(labels)}"
        )

    train_size = int(n_samples * ratio)
    X_train = features[:train_size]
    X_val = features[train_size:]
    y_train = labels[:train_size]
    y_val = labels[train_size:]
    return X_train, y_train, X_val, y_val

In [None]:
# Positional train/validation split of the prepared arrays.
X_train, y_train, X_val, y_val = split_data(features, y)

# Report the shape of each part, one per line.
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, sep="\n")

In [None]:
# Metrics tracked while training/evaluating: confusion-matrix counts plus
# accuracy, precision, recall and AUC.
METRICS = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
]


def make_model(metrics=METRICS, output_bias=None):
    """Build and compile the binary classifier.

    The network is Dense(1024, relu) -> Dropout(0.5) -> Dense(128, relu) ->
    Dropout(0.5) -> Dense(1, sigmoid). The input width is taken from the last
    dimension of the module-level ``X_train``.

    Args:
        metrics: Metrics passed to ``model.compile``; defaults to ``METRICS``.
        output_bias: Optional constant used to initialise the bias of the
            output unit. When ``None`` the layer keeps its own default bias
            initializer.

    Returns:
        The compiled ``keras.Sequential`` model (Adam, learning rate 1e-3,
        binary cross-entropy loss).
    """
    # Only override the bias initializer when a value is supplied. Passing
    # bias_initializer=None explicitly would replace Dense's documented
    # default ('zeros') with None.
    output_layer_kwargs = {}
    if output_bias is not None:
        output_layer_kwargs['bias_initializer'] = (
            tf.keras.initializers.Constant(output_bias)
        )

    model = keras.Sequential([
        keras.layers.Dense(
            1024, activation='relu',
            input_shape=(X_train.shape[-1],)),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(1, activation='sigmoid', **output_layer_kwargs),
    ])

    model.compile(
        # `learning_rate` is the supported argument name; `lr` is a deprecated
        # alias that newer Keras releases no longer accept.
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics)

    return model

In [None]:
# Build the classifier with the default metrics and print its layer summary.
model = make_model()
model.summary()

In [None]:
# Train for 700 epochs with batches of 2048 samples, evaluating on the
# validation split after each epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=2048,
    epochs=700,
    validation_data=(X_val, y_val),
    verbose=1,
)

In [None]:
# Model outputs for the test features.
preds = model.predict(test_features)

In [None]:
# Inspect the shape of the raw model outputs.
preds.shape

In [None]:
# Threshold the model outputs at 0.5 to obtain 0/1 class labels.
# reshape(-1) always yields a 1-D array; np.squeeze would collapse a
# single-row (1, 1) prediction into a 0-d array instead of a length-1 vector.
predictions = (preds > 0.5).astype('int64').reshape(-1)
print(predictions.shape)

In [None]:
# Re-read the test CSV to get the incident identifiers for the submission.
test_csv = pd.read_csv(TEST_DIR_PATH)
# pop() removes the column from test_csv and returns it.
IDs = np.array(test_csv.pop('INCIDENT_ID'))
IDs.shape

In [None]:
# Pair each incident ID with its predicted label.
output = pd.DataFrame({'INCIDENT_ID': IDs, 'MULTIPLE_OFFENSE': predictions})

In [None]:
# Write the submission file without the DataFrame index column.
output.to_csv('/tmp/my_submission.csv', index=False)