# **Setup**

In [90]:
import os, warnings
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import auc, classification_report, roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.simplefilter('ignore')

### Load Data

In [91]:
train = pd.read_csv('../data/Train.csv')
test = pd.read_csv('../data/Test.csv')
submission = pd.read_csv('data/SampleSubmission.csv')

In [92]:
#show train data 

train.head() 

Unnamed: 0,ID,country_code,region,age,FQ1,FQ2,FQ3,FQ4,FQ5,FQ6,...,FQ27,FQ28,FQ29,FQ30,FQ31,FQ32,FQ33,FQ34,FQ37,Target
0,ID_000J8GTZ,1,6,35.0,2,,,2,,,...,,,1.0,,,,1.0,1.0,0,0
1,ID_000QLXZM,32,7,70.0,2,,,2,,,...,,,2.0,,,,1.0,2.0,0,0
2,ID_001728I2,71,7,22.0,2,1.0,,2,,,...,,,2.0,,,,2.0,1.0,1,0
3,ID_001R7IDN,48,3,27.0,1,,,2,,2.0,...,,,,,,2.0,1.0,1.0,1,0
4,ID_0029QKF8,25,0,79.0,2,,,2,,,...,,,2.0,,,2.0,1.0,1.0,1,0


In [93]:
# check columns information 

train.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108446 entries, 0 to 108445
Data columns (total 42 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   ID            108446 non-null  object 
 1   country_code  108446 non-null  int64  
 2   region        108446 non-null  int64  
 3   age           108124 non-null  float64
 4   FQ1           108446 non-null  int64  
 5   FQ2           49124 non-null   float64
 6   FQ3           46218 non-null   float64
 7   FQ4           108446 non-null  int64  
 8   FQ5           21185 non-null   float64
 9   FQ6           60659 non-null   float64
 10  FQ7           60620 non-null   float64
 11  FQ8           108446 non-null  int64  
 12  FQ9           108446 non-null  int64  
 13  FQ10          108446 non-null  int64  
 14  FQ11          83876 non-null   float64
 15  FQ12          108446 non-null  int64  
 16  FQ13          108446 non-null  int64  
 17  FQ14          108446 non-null  int64  
 18  FQ15

In [94]:
# shape of the train data

train.shape 

(108446, 42)

In [95]:
# show test data

test.head()

Unnamed: 0,ID,country_code,region,age,FQ1,FQ2,FQ3,FQ4,FQ5,FQ6,...,FQ26,FQ27,FQ28,FQ29,FQ30,FQ31,FQ32,FQ33,FQ34,FQ37
0,ID_000YI58E,39,2,22.0,2,,,2,,1.0,...,2,,,,,,2.0,1.0,1.0,0
1,ID_001SP4JF,30,2,62.0,1,,,2,,1.0,...,2,,,2.0,,1.0,1.0,1.0,1.0,0
2,ID_001VOF6S,65,4,35.0,2,1.0,,1,1.0,,...,2,,,,,,,1.0,,0
3,ID_0030LULG,123,0,24.0,2,1.0,,2,,1.0,...,2,,,2.0,,,,1.0,1.0,1
4,ID_0037PZ3R,67,2,25.0,2,,,1,,,...,2,,,1.0,,,,2.0,1.0,1


In [96]:
# shape of the test data

test.shape 

(46477, 41)

In [97]:
#check if you have any missing data in the train data

train.isnull().sum() 

ID                   0
country_code         0
region               0
age                322
FQ1                  0
FQ2              59322
FQ3              62228
FQ4                  0
FQ5              87261
FQ6              47787
FQ7              47826
FQ8                  0
FQ9                  0
FQ10                 0
FQ11             24570
FQ12                 0
FQ13                 0
FQ14                 0
FQ15                 0
FQ16                 0
FQ17             97099
FQ18                 0
FQ19             47407
FQ20             24679
FQ21             24635
FQ22                 0
FQ23                 0
FQ24             70014
FQ35             82557
FQ36             96963
FQ25                 0
FQ26                 0
FQ27            105246
FQ28            106940
FQ29             24534
FQ30            106331
FQ31            107577
FQ32             47650
FQ33                 2
FQ34             31794
FQ37                 0
Target               0
dtype: int64

In [98]:
#check if you have any missing data in the train data

train.isnull().sum() 

ID                   0
country_code         0
region               0
age                322
FQ1                  0
FQ2              59322
FQ3              62228
FQ4                  0
FQ5              87261
FQ6              47787
FQ7              47826
FQ8                  0
FQ9                  0
FQ10                 0
FQ11             24570
FQ12                 0
FQ13                 0
FQ14                 0
FQ15                 0
FQ16                 0
FQ17             97099
FQ18                 0
FQ19             47407
FQ20             24679
FQ21             24635
FQ22                 0
FQ23                 0
FQ24             70014
FQ35             82557
FQ36             96963
FQ25                 0
FQ26                 0
FQ27            105246
FQ28            106940
FQ29             24534
FQ30            106331
FQ31            107577
FQ32             47650
FQ33                 2
FQ34             31794
FQ37                 0
Target               0
dtype: int64

In [99]:
# function rate nans

def nans_rate(train,col) :
    return train[col].isna().sum() / train.shape[0]

# function to remove nans
def remove_nans(train,thresh) :
    for col in train.columns : 
        if nans_rate(train,col) >= thresh :
            train.drop(col,axis=1,inplace=True)
    return train

In [100]:
# set threshod = 0.8

train = remove_nans(train,thresh=0.8)
test = test[train.columns[:-1]]

In [101]:
#check again missing values 

train.isnull().sum() 

ID                  0
country_code        0
region              0
age               322
FQ1                 0
FQ2             59322
FQ3             62228
FQ4                 0
FQ6             47787
FQ7             47826
FQ8                 0
FQ9                 0
FQ10                0
FQ11            24570
FQ12                0
FQ13                0
FQ14                0
FQ15                0
FQ16                0
FQ18                0
FQ19            47407
FQ20            24679
FQ21            24635
FQ22                0
FQ23                0
FQ24            70014
FQ35            82557
FQ25                0
FQ26                0
FQ29            24534
FQ32            47650
FQ33                2
FQ34            31794
FQ37                0
Target              0
dtype: int64

In [102]:
# function to clean data

def clean_data(data):
    data["age"].fillna(data.age.mean(),inplace=True)
    
    FQ = data.filter(like= 'FQ').columns
    
    for column in FQ:
        data[column] = data[column].fillna(-1)
        data[column] = data[column].astype('int')
    
    return data
    

In [103]:
# clean train and test data

train = clean_data(train)
test = clean_data(test)

In [104]:
#check again missing data in train

train.isnull().sum() 

ID              0
country_code    0
region          0
age             0
FQ1             0
FQ2             0
FQ3             0
FQ4             0
FQ6             0
FQ7             0
FQ8             0
FQ9             0
FQ10            0
FQ11            0
FQ12            0
FQ13            0
FQ14            0
FQ15            0
FQ16            0
FQ18            0
FQ19            0
FQ20            0
FQ21            0
FQ22            0
FQ23            0
FQ24            0
FQ35            0
FQ25            0
FQ26            0
FQ29            0
FQ32            0
FQ33            0
FQ34            0
FQ37            0
Target          0
dtype: int64

In [105]:
#check again missing data in test

train.isnull().sum() 

ID              0
country_code    0
region          0
age             0
FQ1             0
FQ2             0
FQ3             0
FQ4             0
FQ6             0
FQ7             0
FQ8             0
FQ9             0
FQ10            0
FQ11            0
FQ12            0
FQ13            0
FQ14            0
FQ15            0
FQ16            0
FQ18            0
FQ19            0
FQ20            0
FQ21            0
FQ22            0
FQ23            0
FQ24            0
FQ35            0
FQ25            0
FQ26            0
FQ29            0
FQ32            0
FQ33            0
FQ34            0
FQ37            0
Target          0
dtype: int64

In [106]:
#remove ID and country_code on both train and test 

train = train.drop(['ID','country_code'],axis=1)
test = test.drop(['ID','country_code'],axis=1)

In [107]:
#split train dada into target and feature

X = train.drop(['Target'],axis=1)
y = train.Target

In [108]:
# process the data 
scaler = StandardScaler() 

scaler.fit(X)

#transform train set 
X_scaled = scaler.transform(X)


#transform test set 
test_scaled = scaler.transform(test)

In [109]:
# split train data into train and validate

X_train, X_valid, y_train, y_valid = train_test_split(
    X_scaled,
    y,
    test_size=0.20,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [110]:
# Train classifier 

classifier = RandomForestClassifier()

classifier.fit(X_train,y_train)

RandomForestClassifier()

In [112]:
# measure classifier performance
roc_auc_score(y_valid, classifier.predict_proba(X_valid)[:, 1])

0.49915395859065026

In [113]:
# make prediction on the test set 
test_preds = classifier.predict_proba(test)[:, 1]

In [114]:
# create submission dataframe
Submission = pd.DataFrame({'ID' :submission.ID ,'Target' :test_preds})

In [115]:
# show sample of the submission data
Submission.head()

Unnamed: 0,ID,Target
0,ID_000YI58E,0.44
1,ID_001SP4JF,0.43
2,ID_001VOF6S,0.48
3,ID_0030LULG,0.39
4,ID_0037PZ3R,0.45


In [116]:
# save submission into scv file
Submission.to_csv('data/first_submission.csv',index=False)