# Import Libraries

In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import os
import torch
import torch.nn as nn
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings("ignore")
import pandas as pd


# Read the data

In [8]:

# Single place to edit when the competition files live somewhere other than
# this Google Drive mount.
DATA_DIR = "/content/drive/MyDrive/bank-scoring-case"

# Training features and labels are read with pandas' default positional index.
df_train = pd.read_csv(f"{DATA_DIR}/X_train.csv")
df_y_train = pd.read_csv(f"{DATA_DIR}/y_train.csv")
# The test-side files use their "index" column as the row index; it is written
# back out as the "index" column of the submission.
y_test_sample = pd.read_csv(f"{DATA_DIR}/y_test_sample.csv", index_col="index")
df_test = pd.read_csv(f"{DATA_DIR}/X_test.csv", index_col="index")

# Wrappers for the test and train set

In [9]:
class train_set(Dataset):
    '''
    Dataset wrapper for the train and validation splits, so that a DataLoader
    can draw (features, label) pairs from them.

    Parameters
    ----------
    df : DataFrame containing at least the columns listed in `features`.
    features : list of column names to feed to the model.
    labels : DataFrame, Series or array with one row per row of `df`.

    Raises
    ------
    ValueError : if `labels` does not have the same number of rows as `df`.
    '''
    def __init__(self, df, features, labels):
        # Convert to float32 arrays once, the same way test_set stores `.values`;
        # indexing an array per item is much cheaper than a pandas `.iloc` lookup.
        self.val = df[features].to_numpy(dtype=np.float32)

        label_array = np.asarray(labels, dtype=np.float32)
        if label_array.ndim == 1:
            # Keep one column per target so every item has shape (1,), matching
            # the (batch, 1) output of the network. Passing a scalar label to
            # torch.FloatTensor would instead allocate an uninitialised tensor.
            label_array = label_array[:, None]
        if len(label_array) != len(self.val):
            raise ValueError(
                f"features have {len(self.val)} rows but labels have {len(label_array)}"
            )
        self.labels = label_array

    def __len__(self):
        return len(self.val)

    def __getitem__(self, idx):
        # torch.tensor copies, so the returned tensors never alias the stored arrays.
        feat_tensors = torch.tensor(self.val[idx], dtype=torch.float32)
        labels_tensors = torch.tensor(self.labels[idx], dtype=torch.float32)
        return feat_tensors, labels_tensors

    
class test_set(Dataset):
    '''
    Dataset wrapper for the test features; there are no labels, so each item
    is just one row of feature values as a float tensor.
    '''
    def __init__(self, df, feature):
        selected_columns = df[feature]
        self.val = selected_columns.values

    def __len__(self):
        n_rows = len(self.val)
        return n_rows

    def __getitem__(self, idx):
        row = self.val[idx]
        return torch.FloatTensor(row)

# Neural Network

In [10]:
class Net(nn.Module):
    #the following neural network was taken from https://towardsdatascience.com/pytorch-tabular-binary-classification-a0368da5bb89
    #I was just searching for what might work best for tabular data, I will probably change this NN in the following submissions
    #but I think just trying an existing NN is a good point to start.
    '''
    Two-hidden-layer MLP for binary classification on tabular data.

    The network returns raw logits of shape (batch, 1); apply a sigmoid (or use
    nn.BCEWithLogitsLoss) to turn them into probabilities.

    Parameters
    ----------
    in_features : number of input columns (default 8, the original fixed width).
    hidden_1 : width of the first hidden layer (default 64).
    hidden_2 : width of the second hidden layer (default 32).
    dropout : dropout probability applied after the second hidden layer (default 0.1).
    '''
    def __init__(self, in_features=8, hidden_1=64, hidden_2=32, dropout=0.1):
        super().__init__()
        # Layers are created in the same order as before so that a given seed
        # still produces the same initial weights with the default arguments.
        self.layer_1 = nn.Linear(in_features, hidden_1)
        self.layer_2 = nn.Linear(hidden_1, hidden_2)

        self.layer_out = nn.Linear(hidden_2, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)

        self.batchnorm1 = nn.BatchNorm1d(hidden_1)
        self.batchnorm2 = nn.BatchNorm1d(hidden_2)

    def forward(self, inputs):
        # Block 1: linear -> ReLU -> batch norm
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)

        # Block 2: linear -> ReLU -> batch norm -> dropout
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)

        # Single logit per row; no sigmoid here.
        x = self.layer_out(x)

        return x

# For Neural Networks, I used 5 seeds.

In [11]:


def seed_everything(seed=42):
    #this function was taken from one of the notebooks of kaggle competition moa prediction
    '''
    Seed every random number generator the notebook uses (Python `random`,
    NumPy, PyTorch CPU and CUDA) and ask cuDNN for deterministic behaviour.

    Parameters
    ----------
    seed : integer seed shared by all generators (default 42).
    '''
    random.seed(seed)
    # Only affects Python processes started after this point; the hash seed of
    # the already-running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not just the current one (silently ignored without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # With benchmarking on, cuDNN may pick a different algorithm from run to run,
    # which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False



# Train, validate and test (inference)

In [12]:
def train_model(model, train_loader, optimizer, criterion):
    '''
    Run one training epoch.

    Parameters
    ----------
    model : network returning one logit per row.
    train_loader : iterable of (features, targets) batches that supports len().
    optimizer : optimizer bound to the parameters of `model`.
    criterion : loss taking (logits, targets).

    Returns
    -------
    (mean batch loss as a float, mean batch accuracy in percent)
    '''
    loss_epoch = 0
    acc = 0
    model.train()
    for X, y in train_loader:
        optimizer.zero_grad()
        predicted = model(X)

        loss = criterion(predicted, y)
        loss.backward()

        optimizer.step()

        with torch.no_grad():
            # Threshold the probabilities at 0.5 and compare with the targets.
            # Computed in torch so no tensor -> numpy conversion is needed;
            # for a single target column this equals sklearn's accuracy_score.
            pred = torch.round(torch.sigmoid(predicted))
            accuracy = (pred == y).float().mean().item() * 100

        loss_epoch += loss.item()
        acc += accuracy

    # Average the accumulated epoch loss. Dividing `loss` here would return
    # only the last batch's loss tensor, scaled down by the number of batches.
    return loss_epoch/len(train_loader), acc/len(train_loader)
        

def validate_model(model, val_loader, criterion):
    '''
    Evaluate the model on a validation loader without updating any weights.

    Parameters
    ----------
    model : network returning one logit per row.
    val_loader : iterable of (features, targets) batches that supports len().
    criterion : loss taking (logits, targets).

    Returns
    -------
    (mean batch loss as a float, mean batch accuracy in percent)
    '''
    model.eval()

    loss_epoch = 0
    acc = 0

    # Nothing here needs gradients, so the whole pass (loss included) runs
    # under no_grad.
    with torch.no_grad():
        for X, y in val_loader:
            y_pred = model(X)
            loss = criterion(y_pred, y)

            # Threshold the probabilities at 0.5 and compare with the targets;
            # for a single target column this equals sklearn's accuracy_score.
            predicted = torch.round(torch.sigmoid(y_pred))
            accuracy = (predicted == y).float().mean().item() * 100

            acc += accuracy
            loss_epoch += loss.item()

    return loss_epoch/len(val_loader), acc/len(val_loader)

def test_model(model, test_loader):
    model.eval()
    
    y_pred_list = []
    
    for X in test_loader:
        
        with torch.no_grad():
            y_pred = model(X)
            y_pred = torch.sigmoid(y_pred)
            #predicted = torch.round(torch.sigmoid(y_pred))
            #y_pred_list.append(predicted.cpu().numpy())
            y_pred_list.append(y_pred.cpu().numpy())
    y_pred_list = np.concatenate(y_pred_list)        
    return y_pred_list 

# Run different seeds
The idea is the following: split the data into 5 folds using StratifiedKFold. For each fold, train and validate the model for 10 epochs; the checkpoint with the lowest validation loss is used to predict the test set, and these predictions (probabilities) are divided by 5 (since there are 5 folds). Doing this for all 5 folds and summing gives the fold-averaged prediction for one seed. We repeat this for 5 seeds, and the per-seed predictions are again summed and divided by 5, since there are 5 seeds.

In [13]:
from sklearn.model_selection import StratifiedKFold

# Knobs of the experiment, named once instead of repeated as bare numbers.
SEEDS = [1, 2, 3, 4, 5]
N_SPLITS = 5
N_EPOCHS = 10
BATCH_SIZE = 64

features = df_train.columns 
nan_cols = ['monthly_income', 'family_members']  # columns that contain missing values
# The columns with missing values are dropped entirely because, even after imputing them, the result got worse.
# For monthly income I tried mean and median imputation, and also an extra is_Imputed column (1 if yes, 0 if no),
# but it turns out we are doing better without them.
features = [i for i in features if i not in nan_cols]

# The splitter has a fixed random_state, so every seed sees the same folds.
skf = StratifiedKFold(n_splits = N_SPLITS, random_state = 42, shuffle = True)

# The test loader depends on neither seed nor fold, so it is built once.
test_data = test_set(df_test, features)
test_loader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE, shuffle = False, pin_memory = True)

# Running average of the test probabilities over all seeds.
predicted_seed = np.zeros((len(df_test), 1))

for seed in SEEDS:
    print(f'Seed: {seed}')
    seed_everything(seed = seed)
    # Running average of the test probabilities over the folds of this seed.
    predicted = np.zeros((len(df_test), 1))
    for fold, (train_index, val_index) in enumerate(skf.split(df_train, df_y_train)):
        # split() yields positional indices, so select rows with .iloc;
        # .loc only happens to work while the frames keep a default RangeIndex.
        train_data = train_set(df_train.iloc[train_index], features, df_y_train.iloc[train_index])
        val_data = train_set(df_train.iloc[val_index], features, df_y_train.iloc[val_index])

        train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
        val_loader = DataLoader(dataset=val_data, batch_size=BATCH_SIZE)

        print(f'fold: {fold}') 
        checkpoint_path = f"model_{seed}_{fold}.pth"
        model = Net()
        criterion = nn.BCEWithLogitsLoss()
        optimizer = optim.Adam(model.parameters(), lr = 0.001)
        best_val = np.inf
        for i in range(N_EPOCHS):
            train_loss, acc = train_model(model, train_loader, optimizer, criterion)
            val_loss, val_acc = validate_model(model, val_loader, criterion)
            print(f'{i}: train: {acc:.2f} and val {val_acc:.2f}')
            # Checkpoint on the lowest validation loss (not on accuracy).
            if val_loss < best_val:
                best_val = val_loss
                print('Torch is saving..')
                torch.save(model.state_dict(), checkpoint_path)

        # Reload the best checkpoint of this fold into a fresh network.
        model = Net()
        model.load_state_dict(torch.load(checkpoint_path))

        _predicted = test_model(model, test_loader)
        predicted += _predicted / N_SPLITS
    predicted_seed += predicted / len(SEEDS)


Seed: 1
fold: 0
0: train: 90.55 and val 93.39
Torch is saving..
1: train: 93.44 and val 93.31
2: train: 93.53 and val 93.29
Torch is saving..
3: train: 93.54 and val 93.57
Torch is saving..
4: train: 93.55 and val 93.49
5: train: 93.53 and val 93.56
6: train: 93.56 and val 93.46
7: train: 93.56 and val 93.36
8: train: 93.54 and val 93.49
9: train: 93.62 and val 93.24
fold: 1
0: train: 90.75 and val 93.29
Torch is saving..
1: train: 93.43 and val 93.42
Torch is saving..
2: train: 93.51 and val 93.36
Torch is saving..
3: train: 93.53 and val 93.47
Torch is saving..
4: train: 93.50 and val 93.69
Torch is saving..
5: train: 93.57 and val 93.42
6: train: 93.53 and val 93.44
Torch is saving..
7: train: 93.61 and val 93.52
8: train: 93.60 and val 93.55
9: train: 93.57 and val 93.44
fold: 2
0: train: 91.86 and val 93.35
Torch is saving..
1: train: 93.45 and val 93.37
Torch is saving..
2: train: 93.51 and val 93.39
3: train: 93.50 and val 93.24
Torch is saving..
4: train: 93.54 and val 93.43
To

# Read the data again (for XGBoost and RandomForest)

In [14]:

# Re-read the same four files so the tree-model cells below start from freshly
# loaded frames. The directory is named once here so this cell can be run on
# its own.
DATA_DIR = "/content/drive/MyDrive/bank-scoring-case"

df_train = pd.read_csv(f"{DATA_DIR}/X_train.csv")
df_y_train = pd.read_csv(f"{DATA_DIR}/y_train.csv")
# The test-side files use their "index" column as the row index.
y_test_sample = pd.read_csv(f"{DATA_DIR}/y_test_sample.csv", index_col="index")
df_test = pd.read_csv(f"{DATA_DIR}/X_test.csv", index_col="index")

# GridSearch for best hyperparameters

In [None]:
# Disabled hyperparameter search, kept for reference only. The whole cell is one
# string literal, so none of it runs. Its last lines are hand-written notes of
# the best parameters found (not valid Python); those values are the ones
# hard-coded in the XGBoost / RandomForest cell below.
# NOTE(review): GridSearchCV is not imported in the import cell, so it has to be
# imported from sklearn.model_selection before this search can be re-enabled.
'''
random_tree = RandomForestClassifier(random_state=42)

features = df_train.columns 
nan_cols = ['monthly_income', 'family_members']
features = [i for i in features if i not in nan_cols]

param_grid = { 
    'n_estimators': [450, 500],
    'max_depth' : [i for i in range(4, 16)],
}


grid = GridSearchCV(estimator=random_tree, param_grid=param_grid, cv= 5)
grid.fit(df_train[features], df_y_train)


xgb = XGBClassifier()
param_grid2 = { 
    'n_estimators': [450, 500],
    'max_depth' : [i for i in range(4, 16)],
}


grid2 = GridSearchCV(estimator=xgb, param_grid=param_grid2, cv= 5)
grid2.fit(df_train[features], df_y_train)


grid2.best_params_['n_estimators'] ---- > 450
grid2.best_params_['max_depth'] ---->4

grid.best_params_['n_estimators'] ---- > 450
grid.best_params_['max_depth'] ---->10
'''



# XGBoost and RandomForest

In [15]:
features = df_train.columns 
nan_cols = ['monthly_income', 'family_members']  # columns that contain missing values
features = [i for i in features if i not in nan_cols]

from sklearn.model_selection import StratifiedKFold

# Settings named once; the tree counts and depths are the grid-search results.
N_SPLITS = 5
SEED = 42
RF_N_ESTIMATORS, RF_MAX_DEPTH = 450, 10
XGB_N_ESTIMATORS, XGB_MAX_DEPTH = 450, 4

skf = StratifiedKFold(n_splits = N_SPLITS, random_state = SEED, shuffle = True)

# Fold-averaged test probabilities: `predicted` for the random forest,
# `predicted_xgb` for XGBoost.
predicted = np.zeros((len(df_test), 1))
predicted_xgb = np.zeros((len(df_test), 1))

# Same for every fold, so select the test columns once.
test_features = df_test[features]

for fold, (train_index, val_index) in enumerate(skf.split(df_train, df_y_train)):
    # split() yields positional indices, so select rows with .iloc;
    # .loc only happens to work while the frames keep a default RangeIndex.
    train = df_train.iloc[train_index][features]
    val = df_train.iloc[val_index][features]
    # Flatten the labels to 1-D, which is what the estimators' fit() expects.
    # Assumes df_y_train holds a single target column.
    y_train = df_y_train.iloc[train_index].values.ravel()
    y_val = df_y_train.iloc[val_index].values.ravel()

    # random_state makes the forest reproducible from run to run.
    random_tree = RandomForestClassifier(n_estimators = RF_N_ESTIMATORS, max_depth = RF_MAX_DEPTH,
                                         n_jobs = -1, random_state = SEED)

    model = XGBClassifier(n_estimators = XGB_N_ESTIMATORS, max_depth = XGB_MAX_DEPTH, random_state = SEED)
    model.fit(train, y_train)

    random_tree.fit(train, y_train)
    accuracy = accuracy_score(y_val, random_tree.predict(val))
    # Report the number of trees actually trained (the old message said 500).
    print(f'accuracy using random forest with {RF_N_ESTIMATORS} trees is {accuracy}')
    # Column 1 of predict_proba is the probability of the positive class.
    predicted += random_tree.predict_proba(test_features)[:,1].reshape(-1,1) / N_SPLITS
    predicted_xgb += model.predict_proba(test_features)[:,1].reshape(-1,1) / N_SPLITS

accuracy using random forest with 500 trees is 0.9374464846351441
accuracy using random forest with 500 trees is 0.9372562077823233
accuracy using random forest with 500 trees is 0.9371610693559128
accuracy using random forest with 500 trees is 0.936970792503092
accuracy using random forest with 500 trees is 0.9382522239665096


I multiplied the predictions from random forest and XGBoost by 2, since each of them individually gave better results than the NN.

In [2]:
# Blend weights: the two tree models count double compared with the neural network.
RF_WEIGHT, NN_WEIGHT, XGB_WEIGHT = 2, 1, 2

# Store the blend under its own name instead of overwriting `predicted`
# (the random-forest predictions), so re-running this cell gives the same result.
blended = (
    RF_WEIGHT*predicted + NN_WEIGHT*predicted_seed + XGB_WEIGHT*predicted_xgb
) / (RF_WEIGHT + NN_WEIGHT + XGB_WEIGHT)


# One row per test sample: the original test index and the blended probability.
submission = pd.DataFrame({
        "index": df_test.index,
        "target": blended.reshape(-1,)
    })




In [None]:
# Write the submission to submission.csv in the working directory. index=False
# leaves out the DataFrame's own row index; the test index is already stored in
# the "index" column.
submission.to_csv('submission.csv', index=False)

