In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split
from sklearn import metrics
from matplotlib import pyplot as plt
import xgboost as xgb
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

import torch.nn.functional as F
import torch
import torch.nn as nn
from torch.utils import data as data_utils

In [None]:
# Load the two competition CSV files: the training table and the test table used for the submission
df, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/skill-task2/train.csv/train.csv',
        '../input/skill-task2/test.csv/test.csv',
    )
)

In [None]:
# Work on a copy so that the frame read from disk (`df`) is not modified by the cleaning steps
data = df.copy()

In [None]:
# Preview the first rows of the training frame
data.head()

In [None]:
# Print the frame summary (column dtypes, non-null counts, memory usage)
data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
data.describe()

In [None]:
# Summary of the non-numeric (object) columns: count, number of unique values, top value, frequency.
# Observation recorded by the author: all values are unique.
data.describe(include=['object'])

In [None]:
# List the distinct values of the target column `y`
data['y'].unique()

In [None]:
# Check if any columns are empty.
# Note: `Series.empty` is True only when the column has no elements at all;
# a column consisting solely of NaN values is NOT reported by this check.
col_names = data.columns
for col in col_names:
    if data[col].empty:
        # Report the loop variable `col` (the original referenced an undefined `name`).
        print(f'DataFrame column {col} is empty!')

In [None]:
# Check class distribution: one bar per label value.
# The Series is passed explicitly as `x`; recent seaborn releases interpret the
# first positional argument as `data`, so the positional form is unreliable.
sns.countplot(x=data['y'], label="Sum")
plt.show()

In [None]:
# Map +/-inf to NaN first so that a single fillna(0) zeroes both infinities and missing values.
# TODO (carried over from the original note): check mean.
data = data.replace([np.inf, -np.inf], np.nan).fillna(0)

In [None]:
# True if at least one cell of the frame is still null after the cleaning step
data.isnull().values.any()

In [None]:
# Verify finiteness over every row (the first column is skipped, as before);
# the original only inspected `.head()`, i.e. the first five rows.
print("All values are finite: ", np.all(np.isfinite(data.iloc[:,1:])))

### Normalize features and split data

In [None]:
# Target vector
y = data['y']
# Feature matrix: every column except the sample identifier and the target
X = data.drop(columns=['sample_id', 'y'])

In [None]:
# Display the feature matrix
X

In [None]:
# Features normalization
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the train/test split
# performed afterwards, so the held-out rows also contribute to the scaling statistics.
features_norm = StandardScaler() 
X_std = features_norm.fit_transform(X) 


In [None]:
# Split data in train/test: 20% of the rows are held out, with a fixed seed for repeatability.
# Naming convention used below: capitalised X_train/Y_train = training part, lower-case x_test/y_test = held-out part.
X_train, x_test, Y_train, y_test = train_test_split(X_std, y, test_size=0.2, random_state=42)

### Create torch tensors

In [None]:
# Feature arrays become float tensors (train part, then held-out part)
X_train_tensor = torch.tensor(X_train, dtype=torch.float)
x_test_tensor = torch.tensor(x_test, dtype=torch.float)

# Label Series become flat tensors; their dtype is whatever the underlying values carry
Y_train_tensor = torch.tensor(Y_train.values).flatten()
y_test_tensor = torch.tensor(y_test.values).flatten()

### Train base models using cross-validation

In [None]:
# Baseline: random forest with library-default hyper-parameters
rf = RandomForestClassifier()

# Metrics evaluated on each fold; results are stored under 'test_<metric>'
scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']

# 5-fold cross-validation on the standardized features
scores_rf = cross_validate(rf, X_std, y, scoring=scoring, cv=5)

# Reduce each per-fold array to its mean
(forest_fit_time, forest_score_time, forest_accuracy, forest_precision,
 forest_recall, forest_f1, forest_roc) = (
    scores_rf[metric_key].mean()
    for metric_key in ('fit_time', 'score_time', 'test_accuracy',
                       'test_precision_macro', 'test_recall_macro',
                       'test_f1_weighted', 'test_roc_auc')
)

In [None]:
# Baseline: XGBoost classifier with library-default hyper-parameters
xgb_clf = XGBClassifier()

# Metrics evaluated on each fold; results are stored under 'test_<metric>'
scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']

# 5-fold cross-validation on the standardized features
scores_xgb = cross_validate(xgb_clf, X_std, y, scoring=scoring, cv=5)

# Reduce each per-fold array to its mean
(XGB_fit_time, XGB_score_time, XGB_accuracy, XGB_precision,
 XGB_recall, XGB_f1, XGB_roc) = (
    scores_xgb[metric_key].mean()
    for metric_key in ('fit_time', 'score_time', 'test_accuracy',
                       'test_precision_macro', 'test_recall_macro',
                       'test_f1_weighted', 'test_roc_auc')
)

In [None]:
# Baseline: support vector classifier; probability=True enables predict_proba
SVM = SVC(probability = True)

# Metrics evaluated on each fold; results are stored under 'test_<metric>'
scoring = ['accuracy','precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']

# 5-fold cross-validation on the standardized features
scores_svm = cross_validate(SVM, X_std, y, scoring=scoring, cv=5)

# Reduce each per-fold array to its mean
(SVM_fit_time, SVM_score_time, SVM_accuracy, SVM_precision,
 SVM_recall, SVM_f1, SVM_roc) = (
    scores_svm[metric_key].mean()
    for metric_key in ('fit_time', 'score_time', 'test_accuracy',
                       'test_precision_macro', 'test_recall_macro',
                       'test_f1_weighted', 'test_roc_auc')
)

In [None]:
# Baseline: k-nearest neighbours with library-default hyper-parameters
KNN = KNeighborsClassifier()

# Metrics evaluated on each fold; results are stored under 'test_<metric>'
scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']

# 5-fold cross-validation on the standardized features
scores_knn = cross_validate(KNN, X_std, y, scoring=scoring, cv=5)

# Reduce each per-fold array to its mean
(KNN_fit_time, KNN_score_time, KNN_accuracy, KNN_precision,
 KNN_recall, KNN_f1, KNN_roc) = (
    scores_knn[metric_key].mean()
    for metric_key in ('fit_time', 'score_time', 'test_accuracy',
                       'test_precision_macro', 'test_recall_macro',
                       'test_f1_weighted', 'test_roc_auc')
)

In [None]:
# Side-by-side table of the baseline models: one row per model, one column per averaged metric
models_initial = pd.DataFrame(
    [
        ['Support Vector Machine', SVM_fit_time, SVM_score_time, SVM_accuracy,
         SVM_precision, SVM_recall, SVM_f1, SVM_roc],
        ['Random Forest', forest_fit_time, forest_score_time, forest_accuracy,
         forest_precision, forest_recall, forest_f1, forest_roc],
        ['XGBClassifier', XGB_fit_time, XGB_score_time, XGB_accuracy,
         XGB_precision, XGB_recall, XGB_f1, XGB_roc],
        ['KNN', KNN_fit_time, KNN_score_time, KNN_accuracy,
         KNN_precision, KNN_recall, KNN_f1, KNN_roc],
    ],
    columns=['Model', 'Fitting time', 'Scoring time', 'Accuracy',
             'Precision', 'Recall', 'F1_score', 'ROC_AUC'],
)

# Best ROC AUC first
models_initial.sort_values(by='ROC_AUC', ascending=False)

In [None]:
# Bar chart of the mean cross-validated ROC AUC per baseline model, best model first
ax = models_initial.sort_values(by='ROC_AUC', ascending=False).plot.bar(x='Model', y='ROC_AUC', rot=30)
ax.set_ylabel("ROC_AUC")

Recap: Random forest has the lowest accuracy, so we will not use it in stacking

### Principal Component Analysis

In [None]:
# Fit a PCA without a component limit on the standardized features.
# (PCA is already imported in the first cell; this import repeats it.)
from sklearn.decomposition import PCA
pca = PCA()
pca.fit_transform(X_std)

In [None]:
# Per-component share of the total variance. The ratio (not the raw variance) is
# stored so that it matches the 'Explained variance ratio' axis label of the bar
# chart that plots this variable.
explained_variance = pca.explained_variance_ratio_

In [None]:
# Plot individual explained variance over components:
# one bar per principal component, in the order of the `explained_variance` array
plt.figure(figsize=(10, 6))

plt.bar(range(len(explained_variance)), explained_variance, alpha=0.5, align='center',
        label='individual explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.tight_layout()

In [None]:
# Refit PCA keeping only the first 190 principal components
pca = PCA(n_components=190)
pca.fit_transform(X_std)

In [None]:
# Total fraction of variance retained by the components kept in `pca`
print('Explained variance: %.4f' % pca.explained_variance_ratio_.sum())

In [None]:
# Project the standardized features with the `pca` object currently in scope and
# scatter the samples on the first two principal components.
# NOTE(review): the original note stated that roughly the first 15 components retain
# more than 99% of the variance and that only 15 would be kept, but no 15-component
# PCA is built here; `pca` was last created with n_components=190. Confirm the intent.
# K-means clustering is done in a separate cell, not in this one.
X_std_pca = pca.fit_transform(X_std)
plt.figure(figsize = (5,5))
plt.scatter(X_std_pca[:,0],X_std_pca[:,1])
plt.show()

In [None]:
from sklearn.cluster import KMeans

# Partition the PCA-projected samples into two clusters (fixed seed for repeatability)
kmeans = KMeans(n_clusters=2, random_state=5)
X_clustered = kmeans.fit_predict(X_std_pca)

# One matplotlib colour code per cluster id
LABEL_COLOR_MAP = {0: 'g', 1: 'y'}
label_color = [LABEL_COLOR_MAP[cluster_id] for cluster_id in X_clustered]

# Same scatter as before (first two principal components), coloured by cluster
plt.figure(figsize=(5, 5))
plt.scatter(X_std_pca[:, 0], X_std_pca[:, 1], c=label_color)
plt.show()

In [None]:
# Random forest (default hyper-parameters) evaluated on the PCA-projected features
rf = RandomForestClassifier()

# Metrics evaluated on each fold; results are stored under 'test_<metric>'
scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']

# 5-fold cross-validation on the PCA projection
scores_rf = cross_validate(rf, X_std_pca, y, scoring=scoring, cv=5)

# Reduce each per-fold array to its mean (overwrites the values of the earlier baseline run)
(forest_fit_time, forest_score_time, forest_accuracy, forest_precision,
 forest_recall, forest_f1, forest_roc) = (
    scores_rf[metric_key].mean()
    for metric_key in ('fit_time', 'score_time', 'test_accuracy',
                       'test_precision_macro', 'test_recall_macro',
                       'test_f1_weighted', 'test_roc_auc')
)

In [None]:
# XGBoost classifier (default hyper-parameters) evaluated on the PCA-projected features
xgb_clf = XGBClassifier()

# Metrics evaluated on each fold; results are stored under 'test_<metric>'
scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']

# 5-fold cross-validation on the PCA projection
scores_xgb = cross_validate(xgb_clf, X_std_pca, y, scoring=scoring, cv=5)

# Reduce each per-fold array to its mean (overwrites the values of the earlier baseline run)
(XGB_fit_time, XGB_score_time, XGB_accuracy, XGB_precision,
 XGB_recall, XGB_f1, XGB_roc) = (
    scores_xgb[metric_key].mean()
    for metric_key in ('fit_time', 'score_time', 'test_accuracy',
                       'test_precision_macro', 'test_recall_macro',
                       'test_f1_weighted', 'test_roc_auc')
)

In [None]:
# Support vector classifier (probability estimates enabled) evaluated on the PCA-projected features
SVM = SVC(probability = True)

# Metrics evaluated on each fold; results are stored under 'test_<metric>'
scoring = ['accuracy','precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']

# 5-fold cross-validation on the PCA projection
scores_svm = cross_validate(SVM, X_std_pca, y, scoring=scoring, cv=5)

# Reduce each per-fold array to its mean (overwrites the values of the earlier baseline run)
(SVM_fit_time, SVM_score_time, SVM_accuracy, SVM_precision,
 SVM_recall, SVM_f1, SVM_roc) = (
    scores_svm[metric_key].mean()
    for metric_key in ('fit_time', 'score_time', 'test_accuracy',
                       'test_precision_macro', 'test_recall_macro',
                       'test_f1_weighted', 'test_roc_auc')
)

In [None]:
# K-nearest neighbours (default hyper-parameters) evaluated on the PCA-projected features
KNN = KNeighborsClassifier()

# Metrics evaluated on each fold; results are stored under 'test_<metric>'
scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']

# 5-fold cross-validation on the PCA projection
scores_knn = cross_validate(KNN, X_std_pca, y, scoring=scoring, cv=5)

# Reduce each per-fold array to its mean (overwrites the values of the earlier baseline run)
(KNN_fit_time, KNN_score_time, KNN_accuracy, KNN_precision,
 KNN_recall, KNN_f1, KNN_roc) = (
    scores_knn[metric_key].mean()
    for metric_key in ('fit_time', 'score_time', 'test_accuracy',
                       'test_precision_macro', 'test_recall_macro',
                       'test_f1_weighted', 'test_roc_auc')
)

In [None]:
# Side-by-side table of the models trained on PCA features: one row per model, one column per averaged metric
models_pca = pd.DataFrame(
    [
        ['Support Vector Machine', SVM_fit_time, SVM_score_time, SVM_accuracy,
         SVM_precision, SVM_recall, SVM_f1, SVM_roc],
        ['Random Forest', forest_fit_time, forest_score_time, forest_accuracy,
         forest_precision, forest_recall, forest_f1, forest_roc],
        ['XGBClassifier', XGB_fit_time, XGB_score_time, XGB_accuracy,
         XGB_precision, XGB_recall, XGB_f1, XGB_roc],
        ['KNN', KNN_fit_time, KNN_score_time, KNN_accuracy,
         KNN_precision, KNN_recall, KNN_f1, KNN_roc],
    ],
    columns=['Model', 'Fitting time', 'Scoring time', 'Accuracy',
             'Precision', 'Recall', 'F1_score', 'ROC_AUC'],
)

# Best ROC AUC first
models_pca.sort_values(by='ROC_AUC', ascending=False)

AUC decreased after applying PCA.

### Voting classifier

In [None]:
# Base estimators for the soft-voting ensemble; SVC needs probability=True to expose predict_proba
models = [SVC(probability = True), XGBClassifier(), RandomForestClassifier()]

# Metric names in cross_validate format (the ensemble cell that follows computes its metrics directly instead)
scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']

In [None]:
# VotingClassifier expects (name, estimator) pairs
models_ens = [
    (short_name, estimator)
    for short_name, estimator in zip(['SVM', 'XGB', 'RF'], models)
]

# Soft voting combines the members' predicted class probabilities
model_ens = VotingClassifier(estimators=models_ens, voting='soft')
model_ens.fit(X_train, Y_train)

# Hard labels and positive-class probabilities on the held-out split
pred = model_ens.predict(x_test)
prob_voting = model_ens.predict_proba(x_test)[:, 1]

# Label-based metrics use `pred`; ROC AUC uses the probabilities
acc_soft, prec_soft, recall_soft, f1_soft = (
    metric_fn(y_test, pred)
    for metric_fn in (accuracy_score, precision_score, recall_score, f1_score)
)
roc_auc_soft = roc_auc_score(y_test, prob_voting)

In [None]:
# One-row summary table for the soft-voting ensemble
models_ensembling = pd.DataFrame(
    [{
        'Model': 'Ensembling_soft',
        'Accuracy': acc_soft,
        'Precision': prec_soft,
        'Recall': recall_soft,
        'F1_score': f1_soft,
        'ROC_AUC': roc_auc_soft,
    }],
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1_score', 'ROC_AUC'],
)

models_ensembling.sort_values(by='ROC_AUC', ascending=False)

### Feature selection using LinearSVC

In [None]:
# Fit a linear SVM on the training split and let SelectFromModel pick features from it;
# prefit=True makes the selector reuse the already-fitted `lsvc` instead of fitting again.
lsvc = LinearSVC().fit(X_train, Y_train)
model = SelectFromModel(lsvc, prefit=True)
# Reduce the training matrix to the selected columns and show its new shape
X_train_svc = model.transform(X_train)
X_train_svc.shape

In [None]:
# Apply the same feature selection to the held-out split and show its new shape
x_test_svc = model.transform(x_test)
x_test_svc.shape

In [None]:
# Fresh, unfitted base estimators for the second ensemble (trained on the selected features)
models = [SVC(probability = True), XGBClassifier(), RandomForestClassifier()]

# Metric names in cross_validate format (the ensemble cell that follows computes its metrics directly instead)
scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']

In [None]:
# Pair each base estimator with a short name for VotingClassifier.
# The third estimator in `models` is a RandomForestClassifier, so it is named 'RF'
# (the original label 'KNN' did not match the estimator it was attached to).
models_ens_svc = list(zip(['SVM', 'XGB', 'RF'], models))

# Soft-voting ensemble trained on the LinearSVC-selected features
model_ens_svc = VotingClassifier(estimators = models_ens_svc, voting = 'soft')
model_ens_svc.fit(X_train_svc, Y_train)

# Hard labels and positive-class probabilities on the held-out split
pred = model_ens_svc.predict(x_test_svc)
prob = model_ens_svc.predict_proba(x_test_svc)[:,1]

# Label-based metrics use `pred`; ROC AUC uses the probabilities
acc_soft = accuracy_score(y_test, pred)
prec_soft = precision_score(y_test, pred)
recall_soft = recall_score(y_test, pred)
f1_soft = f1_score(y_test, pred)
roc_auc_soft = roc_auc_score(y_test, prob)

In [None]:
# One-row summary table for the soft-voting ensemble trained on the selected features
models_ensembling_features_svc = pd.DataFrame(
    [{
        'Model': 'Ensembling_soft',
        'Accuracy': acc_soft,
        'Precision': prec_soft,
        'Recall': recall_soft,
        'F1_score': f1_soft,
        'ROC_AUC': roc_auc_soft,
    }],
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1_score', 'ROC_AUC'],
)

models_ensembling_features_svc.sort_values(by='ROC_AUC', ascending=False)

### General comparison


In [None]:
# Put the (Model, ROC_AUC) column pairs of the four experiments next to each other
model_general = pd.concat(
    [
        result_table[column_name]
        for result_table in (models_initial, models_pca,
                             models_ensembling, models_ensembling_features_svc)
        for column_name in ('Model', 'ROC_AUC')
    ],
    axis=1,
)

# Give every column pair a label that identifies its experiment
model_general.columns = [
    'Base models', 'AUC 1',
    'Models PCA', 'AUC 2',
    'Models ensembling', 'AUC 3',
    'Features sel. SVC (ensembl)', 'AUC 4',
]

model_general.sort_values(by='AUC 1', ascending=False)

### Plotting ROC-AUC curve for voting classifier (AUC 3)

In [None]:
from sklearn.metrics import roc_curve, auc
# ROC curve points of the first voting ensemble, computed from its held-out probabilities (`prob_voting`)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, prob_voting)
# Area under that curve
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize=(10,10))
plt.title('Receiver Operating Characteristic')
# ROC curve, with the AUC value shown in the legend
plt.plot(false_positive_rate,true_positive_rate, color='red',label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line
plt.plot([0, 1], [0, 1],linestyle='--')
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

### Submission classic ML


In [None]:
# Replace inf values with np.nan, then replace nan with 0 (same cleaning as for the training frame)
df_test = df_test.replace([np.inf, -np.inf], np.nan).fillna(0)

# Features
X_submission = df_test.drop(['sample_id'], axis=1)

# Scale with the statistics already learned by `features_norm` on the training features.
# `transform` (not `fit_transform`) is required here: re-fitting on the test table would
# scale it differently from the data the ensemble was trained on and would also overwrite
# the fitted scaler.
X_submission_std = features_norm.transform(X_submission)

# Positive-class probability from the soft-voting ensemble
answ = model_ens.predict_proba(X_submission_std)[:,1]

# Two-column submission file: sample_id, y
submission = pd.DataFrame(df_test["sample_id"], index=None)
submission["y"] = answ
submission.to_csv("submission_voting_classifier.csv", sep=",", index=False)
submission.head()

## Neural network

In [None]:
# Mini-batch size shared by both loaders
batch_size = 128

# Pair feature and label tensors into datasets (train part, then held-out part)
train_dataset = data_utils.TensorDataset(X_train_tensor, Y_train_tensor)
eval_dataset = data_utils.TensorDataset(x_test_tensor, y_test_tensor)

# Both loaders reshuffle their samples on every pass
train_loader = data_utils.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_loader = data_utils.DataLoader(eval_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Check batch sizes: print the shapes of the first evaluation batch only.
# The loop variables are deliberately not called `data`, so the cleaned DataFrame
# of that name is not overwritten by a tensor.
for batch_features, batch_labels in eval_loader:
    print(batch_features.size())
    print(batch_labels.size())
    break

In [None]:
# Class must extend nn.Module
class MyClassifier(nn.Module):
    """Feed-forward binary classifier that returns one raw logit per sample.

    Layer order in ``forward``:
    BatchNorm -> Linear -> ReLU -> Dropout -> BatchNorm -> Linear -> ReLU ->
    Dropout -> BatchNorm -> Linear.

    No sigmoid is applied inside the network; apply ``torch.sigmoid`` to the
    output (or use a logits-based loss) to obtain probabilities.

    Args:
        n_features: Number of input features per sample (default 1612, the
            value that was previously hard-coded).
        hidden1: Width of the first hidden layer (default 200).
        hidden2: Width of the second hidden layer (default 100).
    """

    def __init__(self, n_features=1612, hidden1=200, hidden2=100):
        super(MyClassifier,self).__init__()

        # Three linear layers: two hidden layers and a single-unit output layer
        self.fc1 = nn.Linear(n_features, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.layer_out = nn.Linear(hidden2, 1)

        # One dropout module (library-default drop probability) reused after both hidden layers
        self.dropout = nn.Dropout()

        # Batch normalization applied to the input of each linear layer
        self.bn0 = nn.BatchNorm1d(n_features)
        self.bn1 = nn.BatchNorm1d(hidden1)
        self.bn_out = nn.BatchNorm1d(hidden2)

    def forward(self,x):
        """Map a 2-D batch with ``n_features`` columns to a column of logits, one row per sample."""
        # Normalize the raw inputs
        x = self.bn0(x)

        # First hidden layer: linear transformation, non-linear activation, dropout
        x = F.relu(self.fc1(x))
        x = self.dropout(x)

        # Second hidden layer, same pattern
        x = self.bn1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)

        # Output layer: normalize, then project to a single logit (no activation)
        x = self.bn_out(x)
        x = self.layer_out(x)

        return x
        
    
    
    
    

In [None]:
# Initialize the model        
network = MyClassifier()
# Define loss criterion: binary cross-entropy computed on raw logits (the network outputs no sigmoid)
criterion = nn.BCEWithLogitsLoss()
# Define the optimizer: Adam over all network parameters with learning rate 1e-3
optimizer = torch.optim.Adam(network.parameters(), lr=1e-3)

In [None]:
# Display the module structure of the network
network

In [None]:
def train_model(model, optim, criterion, train_dl):
    """Run one training epoch and return the sample-weighted mean loss.

    Args:
        model: Network to train; it is switched to training mode.
        optim: Optimizer that updates the parameters of ``model``.
        criterion: Loss called as ``criterion(output, target)``.
        train_dl: Iterable of ``(x, y)`` batches, where ``y`` is a 1-D label tensor.

    Returns:
        Mean loss per sample over the epoch, as a Python float.
    """
    model.train()
    total = 0
    sum_loss = 0
    for x, y in train_dl:
        batch = y.shape[0]
        output = model(x)
        # Shape the labels like the (batch, 1) output and give them the output dtype:
        # losses such as BCEWithLogitsLoss reject integer targets.
        target = y.unsqueeze(1).to(output.dtype)
        loss = criterion(output, target)
        optim.zero_grad()
        loss.backward()
        optim.step()

        # Weight each batch loss by its size so a smaller last batch does not skew the mean
        total += batch
        sum_loss += batch*(loss.item())
    return sum_loss/total

In [None]:
#This function takes an input and predicts the class, (0 or 1)        
def predict(x, model):
    """Return the predicted class (0.0 or 1.0) for every row of ``x``.

    The model is put into evaluation mode for the forward pass, so dropout is
    disabled and batch-norm layers use their running statistics; the previous
    training/evaluation mode is restored afterwards.

    Args:
        x: Input tensor accepted by ``model``.
        model: Network that returns raw logits.

    Returns:
        A float64 tensor shaped like the model output, holding the rounded
        sigmoid of the logits.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            y_pred = model(x)
            y_pred_tag = torch.round(torch.sigmoid(y_pred))
    finally:
        # Leave the model in the mode the caller had it in
        model.train(was_training)
    # Convert dtype directly instead of re-wrapping an existing tensor with torch.tensor()
    return y_pred_tag.to(torch.float64)

def predict_proba(x, model):
    """Return the predicted positive-class probability for every row of ``x``.

    The model is put into evaluation mode for the forward pass, so dropout is
    disabled and batch-norm layers use their running statistics; the previous
    training/evaluation mode is restored afterwards.

    Args:
        x: Input tensor accepted by ``model``.
        model: Network that returns raw logits.

    Returns:
        A float64 tensor shaped like the model output, holding the sigmoid of
        the logits.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            y_pred = model(x)
            prob = torch.sigmoid(y_pred)
    finally:
        # Leave the model in the mode the caller had it in
        model.train(was_training)
    # Convert dtype directly instead of re-wrapping an existing tensor with torch.tensor()
    return prob.to(torch.float64)

### Train model

In [None]:
# Number of passes over the training loader
epochs = 150
# Mean training loss of every epoch, in order (plotted afterwards)
train_losses = []
for i in range(epochs):
    epoch_loss = train_model(model=network, optim=optimizer, criterion=criterion, train_dl=train_loader)
    train_losses.append(epoch_loss)
    # Report progress on every 10th epoch (printed epoch numbers are 1-based)
    if i % 10 == 0:
        print("Epoch {0}, Loss {1}".format(i+1, epoch_loss))

In [None]:
# Training loss per epoch; the trailing semicolon suppresses the text repr of the last call
plt.plot(range(epochs), train_losses, label='Train loss')
plt.ylabel('Loss')
plt.xlabel('epoch');

In [None]:
# ROC AUC of the network on the held-out split, scored from predicted probabilities
# (the label tensor is cast to integers before scoring)
print("AUC ", roc_auc_score(y_test_tensor.long(), predict_proba(x_test_tensor, model=network)))

### Submission NN

In [None]:
# Replace inf values with np.nan, then replace nan with 0 (same cleaning as for the training frame)
df_test = df_test.replace([np.inf, -np.inf], np.nan).fillna(0)

# Features
X_submission = df_test.drop(['sample_id'], axis=1)

# Scale with the statistics already learned by `features_norm` on the training features.
# `transform` (not `fit_transform`) is required here: re-fitting on the test table would
# scale it differently from the data the network was trained on and would also overwrite
# the fitted scaler.
X_submission_std = features_norm.transform(X_submission)

X_submission_tensor = torch.tensor(X_submission_std, dtype=torch.float)

# The network returns one column of probabilities; flatten it to one value per row
# so it can be assigned as a single DataFrame column.
a = predict_proba(X_submission_tensor, model=network).numpy().ravel()

# Two-column submission file: sample_id, y
submission_network = pd.DataFrame(df_test["sample_id"], index=None)
submission_network["y"] = a
submission_network.to_csv("submission_NN.csv", sep=",", index=False)
submission_network.head()

### Recap: final submission - `submission_voting_classifier.csv` 

Stacking classic ML models => AUC = 89%+; Neural network => (AUC not recorded here)