In [None]:
import time
import numpy as np
import pandas as pd

In [None]:
# Read the training and test splits from CSV files in the working directory.
train_csv, test_csv = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

In [None]:
# Preview the first three rows of the training frame.
train_csv.head(3)

In [None]:
# Preview the first three rows of the test frame.
test_csv.head(3)

In [None]:
# Sanity summary for both frames: shape, whether any value is missing,
# and the number of fully duplicated rows.
(
    train_csv.shape,
    test_csv.shape,
    train_csv.isnull().values.any(),
    test_csv.isnull().values.any(),
    train_csv.duplicated().sum(),
    test_csv.duplicated().sum(),
)

In [None]:
# Class balance of the target: share of rows for each Response value.
train_csv.Response.value_counts(normalize=True)

In [None]:
# Show floats with three decimals in every pandas display from here on
# (this is a global option, not limited to the describe() call below).
pd.set_option('display.float_format', '{:.3f}'.format)
train_csv.describe()

In [None]:
def features_info(df):
    """Summarise every column of ``df`` by dtype and number of distinct values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be described.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, indexed by column name (index named
        ``feature``), with columns ``dtype`` and ``unique_val``, ordered by
        ``unique_val`` from highest to lowest.
    """
    column_names = df.columns
    summary = pd.DataFrame({
        'feature': column_names,
        'dtype': [df[name].dtype for name in column_names],
        'unique_val': [df[name].nunique() for name in column_names],
    })
    return summary.set_index('feature').sort_values(by='unique_val', ascending=False)

# dtype and distinct-value count of every training column, most varied first.
features_info(train_csv)

In [None]:
# value_counts() sorts by frequency (descending), so tail(2) shows the two
# least frequent Region_Code values.
train_csv.Region_Code.value_counts().tail(2)

In [None]:
# Remove the training rows whose Region_Code equals 39.2.
region_39_2_rows = train_csv.index[train_csv.Region_Code.eq(39.2)]
train_csv = train_csv.drop(index=region_39_2_rows)

In [None]:
def data_clean(raw_data):
    """Prepare a raw frame for modelling, modifying it in place.

    Steps applied to ``raw_data``:
      * the ``id`` column becomes the index,
      * the ``Driving_License`` column is removed,
      * ``Previously_Insured`` is converted to strings.

    The function is idempotent: calling it again on an already cleaned frame
    (e.g. when the notebook cell is re-run) leaves the frame unchanged instead
    of raising ``KeyError``.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Frame to clean. It is mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after cleaning.
    """
    # After the first call 'id' is already the index, so only move it when it
    # is still a regular column.
    if 'id' in raw_data.columns:
        raw_data.set_index('id', inplace=True)
    # errors='ignore' makes the drop a no-op once the column is gone.
    raw_data.drop(columns=['Driving_License'], errors='ignore', inplace=True)
    raw_data['Previously_Insured'] = raw_data['Previously_Insured'].astype(str)
    return raw_data

# data_clean modifies train_csv in place and also returns it, so the frame
# displayed here is the cleaned train_csv itself.
data_clean(train_csv)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# One count plot per object-typed column, each bar annotated with its share
# of all training rows.
object_columns = train_csv.select_dtypes(include='object').columns
total = len(train_csv)

fig, axes = plt.subplots(1, 4, figsize=(10, 3), sharey=True)

# zip() pairs the four axes with the object columns and stops at the shorter
# of the two.
for ax, feature in zip(axes, object_columns):
    bar_order = train_csv[feature].value_counts().index
    sns.countplot(x=feature, data=train_csv, ax=ax, order=bar_order)

    for bar in ax.patches:
        height = bar.get_height()
        x_center = bar.get_x() + bar.get_width() / 2.
        ax.text(x_center, height + 0.1, f'{height/total:.2%}', ha='center')

    ax.set_title(f'{feature}', fontsize=12)
    ax.set_xlabel('')
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Balance the classes by undersampling: keep every Response == 1 row and draw
# an equally sized random subset of the Response == 0 rows.
# random_state is fixed so the balanced set (and everything derived from it)
# is identical after Restart Kernel -> Run All.
train_1 = train_csv[train_csv.Response == 1]
train_0 = train_csv[train_csv.Response == 0].sample(n=train_1.shape[0], random_state=42)
df_train = pd.concat([train_0, train_1])
print(df_train.shape)
df_train.Response.value_counts(normalize=True)

In [None]:
from sklearn.model_selection import train_test_split
from scipy.stats import mannwhitneyu, chi2_contingency

# Separate target and predictors, then hold out 20 % of the balanced data as
# a test split, stratified on the target.
y = df_train['Response']
X = df_train.drop('Response', axis=1)
# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)
# NOTE: df_train is rebound to the training part only (predictors + target).
# Re-running this cell without re-running the cell that builds the balanced
# df_train would split the already reduced frame again.
df_train = pd.concat([X_train, y_train], axis=1)

In [None]:
# dtype and distinct-value count of every predictor in the training split.
features_info(X_train)

In [None]:
# Partition the predictors into four groups, one per preprocessing strategy.
# Groups are derived from dtypes, with explicit exclusions by column name.
float_columns = X_train.select_dtypes(include='float64')
non_object_columns = X_train.select_dtypes(exclude='object')

continous_features = float_columns.drop(columns=['Policy_Sales_Channel', 'Region_Code']).columns
nominal_features = X_train.select_dtypes(include='object').columns
ordinal_features = non_object_columns.drop(columns=['Annual_Premium', 'Region_Code']).columns
target_encoded_feature = float_columns.drop(columns=['Annual_Premium', 'Policy_Sales_Channel']).columns

print(f"continous_features: {continous_features}")
print(f"nominal_features: {nominal_features}")
print(f"ordinal_features: {ordinal_features}")
print(f"target_encoded_feature: {target_encoded_feature}")

# Coverage check: the group sizes should add up to the number of predictors.
feature_groups = (continous_features, target_encoded_feature, nominal_features, ordinal_features)
sum(len(group) for group in feature_groups), X_train.shape[1]

In [None]:
# Univariate screening of every predictor against the target at the 5 % level.
numeric_features = continous_features.append(ordinal_features)

# Numeric predictors: two-sided Mann-Whitney U test between the two
# Response groups.
for feature in numeric_features:
    group_0 = df_train[df_train['Response']==0][feature]
    group_1 = df_train[df_train['Response']==1][feature]
    stat, p = mannwhitneyu(group_0, group_1, alternative='two-sided')
    if p < 0.05:
        print('*Feature {} is statistically significant'.format(feature))

# Nominal predictors: chi-square test of independence on the
# feature x Response contingency table.
for feature in nominal_features:
    contingency_table = pd.crosstab(df_train[feature], df_train['Response'])
    chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)
    # Test this feature's own p-value. The previous code compared the stale
    # `p` left over from the Mann-Whitney loop, so every nominal feature got
    # the same verdict regardless of its chi-square result.
    if p_val < 0.05:
        print('+Feature {} is statistically significant'.format(feature))
print('\n')

In [None]:
# Draw 8800 training rows to use for hyper-parameter search and remove them
# from df_train. random_state is fixed so the same rows are drawn on every run.
sample = df_train.sample(n=8800, random_state=42)
df_train = df_train.drop(sample.index)
print(sample.shape)
sample.Response.value_counts(normalize=True)

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, TargetEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV

# Shared preprocessing: one transformer per feature group.
prep = make_column_transformer((StandardScaler(), continous_features),
                               (TargetEncoder(), target_encoded_feature),
                               (OneHotEncoder(drop='first'), nominal_features),
                               (MinMaxScaler(), ordinal_features))

# Candidate models, each behind the same preprocessing step.
# The forest used to be built as RandomForestClassifier(LogisticRegression()),
# which passes an estimator object as the positional `n_estimators` argument;
# it only worked because the grid below always overrides n_estimators.
# random_state makes the forest's results repeatable.
pipelines = {
    'logreg': make_pipeline(prep, LogisticRegression()),
    'forest': make_pipeline(prep, RandomForestClassifier(random_state=42)),
    'knn': make_pipeline(prep, KNeighborsClassifier())}

# Search space per model; keys follow make_pipeline's
# "<lowercased class name>__<parameter>" naming.
hypergrid = {
    'logreg': {
        'logisticregression__C': [1.4, 2, 2.6, 4, 5.7, 7.8],
        'logisticregression__solver': ['lbfgs', 'liblinear']
    },
    'forest': {
        'randomforestclassifier__n_estimators': [72, 96, 124, 164, 221],
        'randomforestclassifier__criterion': ['gini', 'entropy'],
        'randomforestclassifier__min_samples_leaf': [12, 18, 36, 64, 88]
    },
    'knn': {
        'kneighborsclassifier__n_neighbors': list(range(5, 51, 5)),
        'kneighborsclassifier__weights': ['uniform', 'distance']
    }}

print('Training successfully Begun.\n')

# Fit one 10-fold, ROC-AUC-scored grid search per model on the tuning sample
# and keep the fitted searches by name.
sample_X = sample.drop('Response', axis=1)
sample_y = sample.Response

models = {}
for algo, pipeline in pipelines.items():
    print('* Starting training for {}...'.format(algo))
    start = time.time()

    model = GridSearchCV(pipeline, hypergrid[algo], cv=10, scoring='roc_auc')
    model.fit(sample_X, sample_y)
    models[algo] = model

    end = time.time()
    print('  {} model fitted. ({:.2f} s)'.format(algo, end-start))
print('\n')

In [None]:
from sklearn.metrics import roc_auc_score, confusion_matrix

# Score every tuned model on the held-out test split.
# The rows are collected first and the frame is built once, rather than
# growing an empty DataFrame cell by cell through .loc.
summary_rows = {}

for algo in pipelines.keys():
    fitted_search = models[algo]

    # labels=[0, 1] pins the matrix to 2x2 so the unpacking below is safe even
    # if one class is missing from the predictions.
    cm = confusion_matrix(y_test, fitted_search.predict(X_test), labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()

    # Probability of the positive class, used for the threshold-free AUC.
    positive_scores = fitted_search.predict_proba(X_test)[:, 1]

    summary_rows[algo] = {
        'Sensitivity': round(TP/float(TP + FN), 3),
        'Specificity': round(TN/float(TN + FP), 3),
        'AUC': roc_auc_score(y_test, positive_scores),
    }

y_test_summary = pd.DataFrame.from_dict(summary_rows, orient='index')

print('Predictions perspectives (sklearn algos):')
y_test_summary.sort_values(by='AUC', ascending=False)

In [None]:
# Hyper-parameters selected by the grid search for the random forest pipeline.
models['forest'].best_params_

In [None]:
# Refit a fresh forest pipeline on the full training split using the
# hyper-parameters chosen by the grid search.
best_params = models['forest'].best_params_

prep = make_column_transformer((StandardScaler(), continous_features),
                               (TargetEncoder(), target_encoded_feature),
                               (OneHotEncoder(drop='first'), nominal_features),
                               (MinMaxScaler(), ordinal_features))

# best_params keys already carry the 'randomforestclassifier__' step prefix
# that make_pipeline assigns, so they can be applied to the pipeline directly.
pipe = make_pipeline(prep, RandomForestClassifier())
pipe.set_params(**best_params)

pipe.fit(X_train, y_train)

In [None]:
# Evaluate the refitted forest pipeline on the held-out test split.
# labels=[0, 1] pins the matrix to 2x2 so the ravel() unpacking cannot fail
# when one class is absent from y_test or from the predictions.
cm = confusion_matrix(y_test, pipe.predict(X_test), labels=[0, 1])
TN, FP, FN, TP = cm.ravel()


Sensitivity = round(TP/float(TP + FN), 3)
Specificity = round(TN/float(TN + FP), 3)
Precision = round(TP/float(TP + FP), 3)
print('Sensitivity: {}, Specificity: {}, Precision: {}'.format(Sensitivity, Specificity, Precision))

from sklearn.metrics import roc_curve, auc


# ROC curve from the predicted probability of the positive class.
y_pred_proba = pipe.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)


fig, axes = plt.subplots(1, 2, figsize=(11, 4))


# Left panel: confusion matrix at the default decision rule of predict().
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Predicted 0', 'Predicted 1'],
            yticklabels=['Actual 0', 'Actual 1'], cbar=False, ax=axes[0])
axes[0].set_title('Confusion Matrix', fontsize=21)
axes[0].set_xlabel('Predicted Label', fontsize=16)
axes[0].set_ylabel('True Label', fontsize=16)
axes[0].tick_params(axis='both', which='major', labelsize=14)


# Right panel: ROC curve with the chance diagonal.
axes[1].plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
axes[1].plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')  # Diagonal line
axes[1].set_xlim([0.0, 1.0])
axes[1].set_ylim([0.0, 1.05])
axes[1].set_xlabel('(1 - Specificity)', fontsize=18)
axes[1].set_ylabel('Sensitivity', fontsize=18)
axes[1].set_title('ROC Curve', fontsize=18)

# roc_curve returns thresholds in decreasing order, so the last index whose
# threshold is still above 0.5 is the curve point for the default 0.5 cut-off.
default_threshold_index = np.where(thresholds > 0.5)[0][-1]
axes[1].plot(fpr[default_threshold_index], tpr[default_threshold_index], 'ro', label='Threshold = 0.5')
# Build the legend once, after every labelled artist has been added (the
# earlier duplicate call was immediately superseded by this one).
axes[1].legend(loc="lower right", fontsize=14)

plt.tight_layout()
plt.show()

In [None]:
%%time
# Re-read the raw test file, clean it the same way as the training data, and
# score it with the final pipeline.
test = pd.read_csv('test.csv')

# data_clean returns the cleaned frame, whose index is the 'id' column.
test_clean = data_clean(test)
y_hat = pipe.predict_proba(test_clean)[:, 1]

# Submission layout: one row per test id with the predicted probability of
# the positive class.
submission = test_clean.reset_index()[['id']].assign(Response=y_hat)
submission

In [None]:
# Write the submission to 'kaggle_submission.csv' in the working directory;
# index=False keeps the DataFrame index out of the file.
submission.to_csv('kaggle_submission.csv', index=False) 