In [None]:
import numpy as np              #Linear algebra
import pandas as pd             #Data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #
import seaborn as sns
import sklearn

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.model_selection import KFold, train_test_split

import optuna
from optuna.samplers import TPESampler

from pyod.models.copod import COPOD

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.utils import to_categorical

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
# Load the training set from the working directory and preview its first rows.
train = pd.read_csv('train.csv')
train.head()

In [None]:
# Concise summary of the frame: column dtypes and non-null counts.
train.info()

In [None]:
# Descriptive statistics of the frame's columns.
train.describe()

In [None]:
# Pairwise correlation of the numeric columns only. At this point the frame
# still holds string-valued columns (Gender, Vehicle_Age, Vehicle_Damage are
# only encoded further down), and pandas >= 2.0 raises on them instead of
# silently skipping them.
train.select_dtypes(include='number').corr()

In [None]:
# Share of missing values per column, in percent, rounded to two decimals.
missing_pct = train.isnull().sum() / train.shape[0] * 100
print('Percent of Missing Values :')
print("-" * 25)
print(missing_pct.round(2))

In [None]:
# Class balance of the target. The column is passed by keyword: recent seaborn
# releases bind the first positional argument to `data`, not `x`.
sns.countplot(x='Response', data=train)

In [None]:
# Absolute number of rows per target value.
train['Response'].value_counts()

In [None]:
# Distribution of customer age.
# NOTE(review): distplot is deprecated in recent seaborn releases; histplot/displot are the successors.
sns.distplot(train['Age'])

In [None]:
# Box plot of Age to eyeball its spread and potential outliers.
sns.boxplot(data=train, y='Age')

In [None]:
# Annual premium against age, one point per row.
sns.scatterplot(data=train, x='Age', y='Annual_Premium')

In [None]:
# Number of rows per gender. The column is passed by keyword: recent seaborn
# releases bind the first positional argument to `data`, not `x`.
sns.countplot(x='Gender', data=train)

In [None]:
# Row count for every (Gender, Response) combination, as a tidy frame.
df = (
    train.groupby(['Gender', 'Response'])['id']
    .count()
    .reset_index(name='Count')
)
df.head()

In [None]:
# Counts per gender as bars, one panel per Response value.
sns.catplot(data=df, kind="bar", x="Gender", y="Count", col="Response");

In [None]:
# Row count for every (Gender, Driving_License) combination.
df = (
    train.groupby(['Gender', 'Driving_License'])['id']
    .count()
    .reset_index(name='Count')
)
df

In [None]:
# Counts per gender as bars, one panel per Driving_License value.
sns.catplot(data=df, kind="bar", x="Gender", y="Count", col="Driving_License");

In [None]:
# Number of rows per Previously_Insured value. Passed by keyword: recent seaborn
# releases bind the first positional argument to `data`, not `x`.
sns.countplot(x='Previously_Insured', data=train)

In [None]:
# Number of rows per Vehicle_Age category. Passed by keyword: recent seaborn
# releases bind the first positional argument to `data`, not `x`.
sns.countplot(x='Vehicle_Age', data=train)

In [None]:
# Row count for every (Vehicle_Age, Response) combination.
df = (
    train.groupby(['Vehicle_Age', 'Response'])['id']
    .count()
    .reset_index(name='count')
)
df

In [None]:
# Counts per vehicle-age category as bars, one panel per Response value.
g = sns.catplot(
    data=df, kind="bar",
    x="Vehicle_Age", y="count", col="Response",
    height=4, aspect=.7,
)

In [None]:
# Number of rows per Vehicle_Damage value. Passed by keyword: recent seaborn
# releases bind the first positional argument to `data`, not `x`.
sns.countplot(x='Vehicle_Damage', data=train)

In [None]:
# Row count for every (Vehicle_Damage, Response) combination.
df = (
    train.groupby(['Vehicle_Damage', 'Response'])['id']
    .count()
    .reset_index(name='count')
)
df

In [None]:
# Counts per vehicle-damage flag as bars, one panel per Response value.
sns.catplot(
    data=df, kind="bar",
    x="Vehicle_Damage", y="count", col="Response",
    height=4, aspect=.7,
)

In [None]:
# Distribution of the annual premium.
# NOTE(review): distplot is deprecated in recent seaborn releases; histplot/displot are the successors.
sns.distplot(train['Annual_Premium'])

In [None]:
# Box plot of Annual_Premium to eyeball its spread and potential outliers.
sns.boxplot(data=train, y='Annual_Premium')

In [None]:
# Distribution of Vintage.
# NOTE(review): distplot is deprecated in recent seaborn releases; histplot/displot are the successors.
sns.distplot(train['Vintage'])

Data Preprocessing

In [None]:
# Drop the row identifier so it is not carried into the feature matrix built
# below. errors='ignore' makes the cell safe to re-run: the drop used to be
# commented out after a manual one-off run, so a fresh Restart & Run All kept
# `id` as a feature.
# NOTE(review): the neural network further down declares 7 inputs, which
# presumably assumes `id` is gone — confirm against the dataset's columns.
train = train.drop(columns=['id'], errors='ignore')

# Correlation heatmap of the numeric columns. The string-valued columns are
# excluded explicitly because DataFrame.corr() raises on them in pandas >= 2.0.
plt.figure(figsize=(10,10))
plt.title("Correlation plot")
sns.heatmap(train.select_dtypes(include='number').corr(),linewidths=5, annot=True, square=True,annot_kws={'size': 10},cmap='YlGnBu')

In [None]:
# Preview the frame before the categorical columns are encoded.
train.head()

In [None]:
# Replace the string labels of the categorical columns with integer codes.
# Each rule only rewrites rows that still hold the original label, so running
# the cell again leaves already-encoded rows untouched.
encodings = {
    'Gender': {'Male': 1, 'Female': 0},
    'Vehicle_Age': {'> 2 Years': 2, '1-2 Year': 1, '< 1 Year': 0},
    'Vehicle_Damage': {'Yes': 1, 'No': 0},
}
for column, codes in encodings.items():
    for label, code in codes.items():
        train.loc[train[column] == label, column] = code

train.head()

In [None]:
# Cast every column of the frame to 32-bit integers in one pass.
train = train.astype(np.int32)

train

In [None]:
# Correlation of each feature column with the target.
target = train['Response']
for col in train.columns.drop('Response'):
    print(col, "\t\t", train[col].corr(target))

In [None]:
# Separate the target vector from the feature matrix.
y = train['Response']
X = train.drop(columns=['Response'])

In [None]:
from sklearn.cluster import KMeans

# Unsupervised baseline: partition the feature rows into two clusters.
kmeans = KMeans(n_clusters=2, random_state=669)      #random_state of avg of GR numbers
kmeans.fit(X)

In [None]:
# Store the cluster id assigned to each row next to the original columns.
train = train.assign(cluster=kmeans.labels_)
train

In [None]:
# Number of rows assigned to each K-means cluster.
train['cluster'].value_counts()

In [None]:
# K-means cluster ids are arbitrary: cluster 0 may just as well be the group
# that lines up with Response == 1. Score both possible labelings and report
# the better-matching one, so the numbers do not depend on which id the
# algorithm happened to hand out.
kmeans_labels = train['cluster']
flipped_labels = 1 - kmeans_labels
if accuracy_score(train['Response'], flipped_labels) > accuracy_score(train['Response'], kmeans_labels):
    kmeans_labels = flipped_labels
print('Kmeans accuracy: ', accuracy_score(train['Response'], kmeans_labels))
print('Kmeans f1_score: ', f1_score(train['Response'], kmeans_labels))

In [None]:
# Keep the target aside, then remove it and the K-means labels from the frame
# so that only the remaining columns are passed to the next model.
response = train['Response']
train = train.drop(columns=['Response', 'cluster'])

In [None]:
# COPOD outlier detector configured with a contamination of 0.15.
clf = COPOD(contamination=0.15)
clf.fit(train)

In [None]:
# Predict on the feature-only frame first, then attach the detector's labels
# and restore the target column.
# NOTE(review): presumably a label of 1 marks an outlier — confirm with the pyod docs.
cluster = clf.predict(train)
train = train.assign(cluster=cluster, Response=response)
train

In [None]:
# Number of rows per COPOD label.
train['cluster'].value_counts()

In [None]:
# Treat the COPOD labels as predictions of Response and score them.
copod_truth, copod_labels = train['Response'], train['cluster']
print('COPOD accuracy: ', accuracy_score(copod_truth, copod_labels))
print('COPOD f1_score: ', f1_score(copod_truth, copod_labels))

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=666,
)

In [None]:
# Check that the positive class is similarly represented in both splits.
n_pos_test = int((y_test == 1).sum())
n_pos_train = int((y_train == 1).sum())
print('Positive cases % in validation set: ', round(100 * n_pos_test / len(y_test), 3), '%')
print('Positive cases % in train set: ', round(100 * n_pos_train / len(y_train), 3), '%')

In [None]:
# Baseline: logistic regression with default settings on the current X_train.
model = LogisticRegression(random_state=666)
model.fit(X_train, y_train)

In [None]:
# Score the baseline on the held-out split.
preds = model.predict(X_test)
accuracy, f1 = accuracy_score(y_test, preds), f1_score(y_test, preds)
print('Simple Logistic Regression accuracy: ', accuracy)
print('Simple Logistic Regression f1_score: ', f1)

In [None]:
def plot_confusion_matrix(y_real, y_pred):
    """Draw the confusion matrix of `y_pred` against `y_real` as an annotated heatmap.

    The matrix is computed with sklearn's `confusion_matrix` and rendered on
    the axes returned by `plt.subplot()`; the y axis is labelled with the true
    labels and the x axis with the predicted ones. Nothing is returned.
    """
    axes = plt.subplot()
    matrix = confusion_matrix(y_real, y_pred)
    # fmt='g' prints the cell counts in general number format.
    sns.heatmap(matrix, ax=axes, annot=True, fmt='g')

    axes.set_ylabel('True labels')
    axes.set_xlabel('Predicted labels')

In [None]:
# Confusion matrix of the most recent `preds` (logistic regression baseline when run in order).
plot_confusion_matrix(y_test, preds)

In [None]:
# Remove three features from both splits before refitting the models.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
dropped_features = ['Region_Code', 'Vintage', 'Driving_License']
X_train = X_train.drop(columns=dropped_features, errors='ignore')
X_test = X_test.drop(columns=dropped_features, errors='ignore')

In [None]:
# Refit the logistic regression baseline on the current (reduced) X_train.
model = LogisticRegression(random_state=666)
model.fit(X_train, y_train)

In [None]:
# Score the refitted baseline on the held-out split.
preds = model.predict(X_test)
accuracy, f1 = accuracy_score(y_test, preds), f1_score(y_test, preds)
print('Simple Logistic Regression accuracy: ', accuracy)
print('Simple Logistic Regression f1_score: ', f1)

In [None]:
# Confusion matrix of the most recent `preds` (refitted logistic regression when run in order).
plot_confusion_matrix(y_test, preds)

In [None]:
# LightGBM classifier with default hyper-parameters as a second baseline.
model = LGBMClassifier(random_state=666)
model.fit(X_train, y_train)

preds = model.predict(X_test)
print('Simple LGBM accuracy: ', accuracy_score(y_test, preds))
# Label corrected: this is an LGBM classifier, the word "Regression" was a
# leftover from the logistic-regression cell.
print('Simple LGBM f1_score: ', f1_score(y_test, preds))

In [None]:
np.random.seed(666)
# Seeded sampler so that an Optuna search is reproducible.
sampler = TPESampler(seed=0)

def create_model(trial):
    """Build an XGBClassifier from hyper-parameters sampled by an Optuna trial.

    Args:
        trial: Optuna trial used to draw max_depth, n_estimators,
            learning_rate, gamma and scale_pos_weight.

    Returns:
        An unfitted XGBClassifier with random_state=0.
    """
    max_depth = trial.suggest_int("max_depth", 2, 20)
    n_estimators = trial.suggest_int("n_estimators", 1, 400)
    # suggest_float is the replacement for the deprecated suggest_uniform;
    # with no `log`/`step` argument it samples uniformly over the same range.
    learning_rate = trial.suggest_float('learning_rate', 0.0000001, 1)
    gamma = trial.suggest_float('gamma', 0.0000001, 1)
    scale_pos_weight = trial.suggest_int("scale_pos_weight", 1, 20)
    model = XGBClassifier(
        learning_rate=learning_rate, 
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        gamma=gamma, 
        scale_pos_weight=scale_pos_weight, 
        random_state=0
    )
    return model

def objective(trial):
    """Fit the trial's model on the training split and return its F1 score.

    NOTE(review): the score is computed on X_test/y_test, so any search driven
    by this objective tunes on the same split that is later used for reporting.
    """
    model = create_model(trial)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    score = f1_score(y_test, preds)
    return score

# The search itself is disabled; the parameters below are hard-coded instead.
#study = optuna.create_study(direction="maximize", sampler=sampler)
#study.optimize(objective, n_trials=500)

#xgb_params = study.best_params
xgb_params = {
    'max_depth': 4, 
    'n_estimators': 372, 
    'learning_rate': 0.09345905554110154, 
    'gamma': 0.6641238000625036, 
    'scale_pos_weight': 4
}
# Same seed as the models built inside create_model.
xgb_params['random_state'] = 0
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)
preds = xgb.predict(X_test)
print('Optimized XGBClassifier accuracy: ', accuracy_score(y_test, preds))
print('Optimized XGBClassifier f1-score', f1_score(y_test, preds))

In [None]:
# Confusion matrix of the most recent `preds` (XGBClassifier when run in order).
plot_confusion_matrix(y_test, preds)

In [None]:
# NOTE: create_model/objective are redefined here and shadow the definitions
# from the XGBoost cell; the names are kept unchanged on purpose.
def create_model(trial):
    """Build a RandomForestClassifier from hyper-parameters sampled by an Optuna trial.

    Args:
        trial: Optuna trial used to draw max_depth, n_estimators and
            min_samples_leaf.

    Returns:
        An unfitted RandomForestClassifier with random_state=0.
    """
    max_depth = trial.suggest_int("max_depth", 2, 7)
    n_estimators = trial.suggest_int("n_estimators", 2, 200)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    model = RandomForestClassifier(
        min_samples_leaf=min_samples_leaf, 
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        random_state=0
    )
    return model

def objective(trial):
    """Fit the trial's model on the training split and return its F1 score.

    NOTE(review): the score is computed on X_test/y_test, so the search tunes
    on the same split that is used for the final report below.
    """
    model = create_model(trial)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    score = f1_score(y_test, preds)
    return score

study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=100)
# Refit with the same seed the trials used: without random_state the final
# forest is neither reproducible nor the model whose score was optimized.
rf_params = dict(study.best_params, random_state=0)
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
print('Optimized RF accuracy: ', accuracy_score(y_test, preds))
print('Optimized RF f1-score:', f1_score(y_test, preds))

In [None]:
# Confusion matrix of the most recent `preds` (random forest when run in order).
plot_confusion_matrix(y_test, preds)

In [None]:
def recall_score(y_true, y_pred):
    """Recall computed with Keras backend ops.

    The element-wise product and `y_true` are clipped to [0, 1] and rounded
    before being summed. K.epsilon() in the denominator guards against a
    division by zero when `y_true` contains no positives.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_score(y_true, y_pred):
    """Precision computed with Keras backend ops.

    The element-wise product and `y_pred` are clipped to [0, 1] and rounded
    before being summed. K.epsilon() in the denominator guards against a
    division by zero when nothing is predicted positive.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def keras_f1_score(y_true, y_pred):
    """Harmonic mean of `precision_score` and `recall_score`, usable as a Keras metric.

    NOTE(review): the underlying sums run over every element of the tensors.
    With two-column one-hot targets and softmax outputs (how the model in this
    notebook is trained) this appears to track accuracy rather than the F1 of
    the positive class — confirm before relying on the logged value.
    """
    p = precision_score(y_true, y_pred)
    r = recall_score(y_true, y_pred)
    numerator = p * r
    denominator = p + r + K.epsilon()  # epsilon avoids 0/0 when both are zero
    return 2 * (numerator / denominator)

In [None]:
def create_model(n_features=None):
    """Build and compile the small feed-forward classifier.

    Args:
        n_features: Number of input columns. When omitted, the width of the
            notebook-level `X_train` is used, so the input layer always matches
            the features that are actually fed to the model instead of a
            hard-coded 7.

    Returns:
        A compiled `tf.keras.Sequential` model with a 2-unit softmax output,
        binary cross-entropy loss, Adam (learning rate 0.001) and
        `keras_f1_score` as metric.
    """
    if n_features is None:
        n_features = X_train.shape[1]
    model = tf.keras.Sequential([
        # Shape is given as a tuple (the documented form) rather than a bare int.
        tf.keras.layers.Input(shape=(n_features,)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(30, activation="relu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(2, activation='softmax')
    ])
    model.compile(
        loss=tf.keras.losses.binary_crossentropy, 
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=[keras_f1_score]
    )
    return model

In [None]:
# One-hot encode the training target for the network's 2-unit softmax output.
y_nn_train = to_categorical(y_train)

In [None]:
# Per-class weights passed to model.fit: class 1 is weighted 8x relative to class 0.
class_weight = {0: 1., 1: 8.}

In [None]:
# Train the network; 20% of the training rows are held back by Keras for validation.
model = create_model()
model.fit(
    X_train,
    y_nn_train,
    validation_split=0.2,
    epochs=35,
    batch_size=256,
    verbose=2,
    class_weight=class_weight,
)

In [None]:
# Turn the two softmax outputs into hard labels by taking the most probable class.
class_probabilities = model.predict(X_test)
preds = np.argmax(class_probabilities, axis=1)

In [None]:
# Score the network's hard predictions on the held-out split.
nn_accuracy, nn_f1 = accuracy_score(y_test, preds), f1_score(y_test, preds)
print('NN accuracy: ', nn_accuracy)
print('NN f1-score', nn_f1)

In [None]:
# Confusion matrix of the most recent `preds` (neural network when run in order).
plot_confusion_matrix(y_test, preds)