# Hey Kagglers!! Today I am going to walk you through a binary classification problem in simple and clear steps

In [None]:
from IPython import display
# Show the cover illustration stored at this path with the requested width/height.
display.Image("../input/classification-cover/classification_cover.png", width=1400,height=600,)

**First, let's get our tools ready**

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix , precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the train and test CSVs of the competition into two DataFrames.
train_data, test_data = (
    pd.read_csv('../input/tabular-playground-series-mar-2021/' + split + '.csv')
    for split in ('train', 'test')
)

In [None]:
# Keep an untouched copy of the test frame: test_data loses its 'id' column
# further down, while the submission cell still reads test.id.
test = test_data.copy()  #lets just keep a safe version

In [None]:
# Preview the raw training frame.
train_data

In [None]:
# Preview the raw test frame.
test_data

In [None]:
# Stack the train rows on top of the test rows so that feature edits
# (drop / rename / combine) can be applied to both sets in one place.
# NOTE(review): test_data presumably has no 'target' column, so its rows get
# NaN there after the concat - confirm. The row index is not reset, which is
# why later cells split the frame back by position (iloc).
all_data = pd.concat([train_data , test_data]) 
all_data    # convenient, but be careful about data leakage

**I created this dataframe "all_data" in order to apply operations on the features (Xs) (e.g. drop, replace, rename) to the train and test data sets at once. But we must be cautious about data leakage!**

> **train_data = all_data.iloc[0:300000, :]**             
> **test_data = all_data.iloc[300000 : , :].drop(['target'],axis=1)**

# **Data Preparation**

### **Data preparation is a very very important phase of any ML project, we can basically divide this phase into two parts : EDA & preprocessing**

>#  Exploratory Data Analysis - EDA

>**At this stage our approach is to analyze the data set to summarize its main characteristics and see if there are any red flags**

In [None]:
# Show the illustration for the EDA section with the requested width/height.
display.Image("../input/explore/explore.jpeg",width=650, height=500) 

In [None]:
# (rows, columns) of the training frame.
train_data.shape

In [None]:
# First rows of the training frame.
train_data.head()

In [None]:
# The id column is not used as a feature, so remove it from every working frame.
# Re-assigning with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
# The ids needed for the submission file stay available in `test`.
all_data = all_data.drop(columns='id', errors='ignore')
train_data = train_data.drop(columns='id', errors='ignore')
test_data = test_data.drop(columns='id', errors='ignore')


In [None]:
# Column dtypes and non-null counts of the training frame.
train_data.info()

In [None]:
# Number of missing values per column.
train_data.isnull().sum()

**There are 300k data points and no missing values!**

Let's see the unique categories in each of the categorical columns.

In [None]:
# Print the distinct values of every categorical feature.
# The columns are discovered by their "cat" prefix instead of a hard-coded
# range(18), which stopped at cat17 and would skip any catN column with
# N >= 18 (e.g. cat18, if the data set has one).
for col in [name for name in train_data.columns if name.startswith("cat")]:
    # col[len("cat"):] is the numeric suffix, so the label stays "category<N>"
    print("category{}".format(col[len("cat"):]) , train_data[col].unique() ,"\n")


We can see that some categorical columns (like cat10 and cat5) have a lot of categories; this will cause a huge increase in the number of columns if we decide to use one-hot encoding!

In [None]:
# Number of distinct values in cat10.
train_data["cat10"].unique().size

Note: I tried dropping "cat10" since it has so many categories and such a skewed distribution (one category accounts for most of the rows), but the accuracy of the model actually decreased a bit, so I kept it.

Lets do a quick summary statistic for numerical data

In [None]:
# Summary statistics, computed on the training rows only.
train_data.describe()   #only for train data !

In [None]:
# Number of rows per target class.
train_data['target'].value_counts()

As we can see from the target label, this is a binary classification problem. We can also see that there is a class imbalance between the positive and negative classes; we might want to handle that with something like class weights.

Let's plot a bar graph for the target variable.

In [None]:
# Bar chart of the class counts of the target.
train_data["target"].value_counts().plot.bar(color='red')

**We can always use a heat map to explore the correlation between features**

In [None]:
# Pearson correlation is only defined for numeric columns, so select them
# explicitly: recent pandas versions raise on non-numeric columns (the cat*
# features here) instead of silently dropping them.
corr_matrix = train_data.select_dtypes(include='number').corr()
plt.figure(figsize = (10,10))
# Annotated heat map; both axes are labelled with the column names.
sns.heatmap(corr_matrix,xticklabels=corr_matrix.columns.values,yticklabels=corr_matrix.columns.values,annot = True)

In [None]:
# Correlation of every numeric column with the label.
correlation_with_target=corr_matrix['target']
# Rank by absolute value so strong negative correlations count as much as positive ones.
correlation_with_target.abs().sort_values(ascending = False)

**Let's separate categorical and numerical columns**

In [None]:
# dtype names that are treated as numeric features
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
# every numeric column of the combined frame, except the label itself
num_columns = [
    name
    for name, dtype in all_data.dtypes.items()
    if (dtype in numerics) and (name != "target")
]
num_columns

In [None]:
# every column of the combined frame whose dtype is not in the numeric list
cat_columns = [
    name
    for name, dtype in all_data.dtypes.items()
    if dtype not in numerics
]
cat_columns

***Frequency distribution for numeric columns***

In [None]:
# One histogram figure per numeric feature (training rows only).
for column in train_data[num_columns].columns:
    fig, ax = plt.subplots()
    ax.hist(train_data[column])
    ax.set_xlabel(column)
    ax.set_ylabel('frequency')

***Frequency distribution for categorical columns***

In [None]:
# One frequency chart per categorical feature (training rows only).
# plt.hist() is meant for continuous data: it spreads the category positions
# over its default number of bins, so columns with more categories than bins
# get several categories merged into one bar. Counting each category and
# drawing a bar per category shows the real frequencies.
for column in cat_columns:
    plt.figure()
    train_data[column].value_counts().plot(kind='bar')
    plt.xlabel(column)
    plt.ylabel('frequency')

****Check out this fancy visualization !****

In [None]:
#thanks to @ANDRESHG

# One row per numeric feature: class-conditional density curves on the left,
# a stacked histogram split by target on the right.
num_rows, num_cols = len(num_columns),2
# squeeze=False keeps `axes` two-dimensional even when there is a single feature
f, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(15, 15), squeeze=False)
f.suptitle('Distribution of Features', fontsize=16)

for index, column in enumerate(num_columns):
    kde_ax, hist_ax = axes[index, 0], axes[index, 1]
    # fill= is the current name of kdeplot's deprecated shade= keyword
    sns.kdeplot(train_data.loc[train_data['target'] == 0, column], color="r", fill=True, ax=kde_ax)
    sns.kdeplot(train_data.loc[train_data['target'] == 1, column], color="g", fill=True, ax=kde_ax)
    sns.histplot(data=train_data,x=column,hue='target', kde=False, palette='Paired_r', bins=10, ax=hist_ax,multiple='stack')

# **Data Preprocessing**

In [None]:
# Show the illustration for the preprocessing section with the requested width/height.
display.Image("../input/cleaning/datapreproc.png",width=500, height=500)

*luckily, the data doesn't need any cleaning*

### **Let's check for multicollinearity between independent variables**

*Actually we already saw the correlation between the independent variables in the heat map before ;)*

In [None]:
# Keep only off-diagonal correlations above 0.8; every other cell is shown as "OK".
strong_pair = (corr_matrix > 0.8) & (corr_matrix != 1)
corr_matrix.where(strong_pair).fillna("OK")

**cont1 is highly correlated with cont2, and cont0 is highly correlated with cont10 ... is that a big deal?**  
>**Actually, for some algorithms, yes it is. Some algorithms, like logistic regression (which we will use in a moment), assume the absence of multicollinearity.  
So multicollinearity misleads the model into inflating the effect of those correlated features (note: check out VIF, which "quantifies the severity of multicollinearity").  
It is fair to say that some other models, like tree-based models, are not affected by this multicollinearity.**

**Lets fix this using some basic feature engineering!**

In [None]:
# Replace each highly correlated pair by a single product feature.
# Maps the new column name to the two columns it is built from.
correlated_pairs = {'cont1_2': ('cont1', 'cont2'), 'cont0_10': ('cont0', 'cont10')}
for new_col, (left, right) in correlated_pairs.items():
    # Guard on the source columns so re-running the cell after they were
    # dropped is a no-op instead of a KeyError.
    if left in all_data.columns and right in all_data.columns:
        all_data[new_col] = all_data[left] * all_data[right]
        all_data = all_data.drop(columns=[left, right])
# The column set changed, so rebuild the numeric / categorical column lists.
num_columns = [col for col in all_data.columns if (all_data[col].dtype in numerics) and (col != "target") ]
cat_columns = [col for col in all_data.columns if (all_data[col].dtype not in numerics)]

In [None]:
# t = number of training rows; the first t rows of all_data are the training set.
t = train_data.shape[0]
# Re-derive train_data from the combined frame so it picks up the engineered columns.
train_data = all_data.iloc[:t]

**We could also compute a principal component, i.e. an axis that explains most of the variance of the two correlated features, and then use this new component in place of the old features**

That process will look like this :

***====Just for demonstration====***

In [None]:
# pca = PCA( n_components =1)
# cont1_2 = pca.fit_transform(all_data[['cont1','cont2']])
# print(pca.explained_variance_ratio_)
##From the explained variance ratio we can basically have an intuition about the percentage of lost information after transformation

In [None]:
# all_data["cont1_2"] = cont1_2
# all_data.drop('cont1',axis=1,inplace=True)
# all_data.drop('cont2',axis=1,inplace=True)

# we have already dealt with the correlated features

In [None]:
# pca1 = PCA( n_components =1)
# cont0_10 = pca1.fit_transform(all_data[['cont0','cont10']])
# print(pca1.explained_variance_ratio_) 

In [None]:
# all_data["cont0_10"] = cont0_10
# all_data.drop('cont10',axis=1,inplace=True)
# all_data.drop('cont0',axis=1,inplace=True)
# all_data.drop('cat10',axis=1,inplace=True)

# we have already dealt with the correlated features

**===================**

**Let's check the correlation between features again now that we have dealt with the correlated features!**

In [None]:
# Recompute the correlations on the engineered training frame (numeric columns
# only; recent pandas versions raise on non-numeric columns).
corr_matrix_after = train_data.select_dtypes(include='number').corr()
# The mask must come from the NEW matrix. Masking with the old corr_matrix
# cannot match the engineered columns (cont1_2, cont0_10), so they would be
# reported as "OK" no matter how correlated they actually are.
still_correlated = (corr_matrix_after > 0.8) & (corr_matrix_after != 1)
corr_matrix_after.where(still_correlated).fillna("OK")

**That's Looking Better !!**

In [None]:
# Split the combined frame back by position: the first t rows are the training
# features, the remaining rows are the test features. The label is removed from both.
train_data_x = all_data.iloc[:t].drop(columns=['target'])
test_data = all_data.iloc[t:].drop(columns=['target'])

In [None]:
# First rows of the training features (label removed).
train_data_x.head()

In [None]:
# Numeric branch of the preprocessing.
# There is no missing data in this problem, so the imputer is not strictly
# needed; it is kept to show what a typical pipeline looks like.
numeric_steps = [('imputer', SimpleImputer(strategy="median"))]
# ('std_scaler', StandardScaler()) is left out on purpose: the tree-based
# models used for most of this notebook do not require standardization.
num_pipeline = Pipeline(steps=numeric_steps)

In [None]:
# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' lets the encoder accept categories it did not see during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown ='ignore')),
]
cat_pipeline = Pipeline(steps=categorical_steps)

In [None]:
num_attribs, cat_attribs = num_columns, cat_columns

# Route the numeric columns through num_pipeline and the categorical columns
# through cat_pipeline; the outputs are concatenated column-wise.
full_pipeline = ColumnTransformer(transformers=[
    ("numerical", num_pipeline, num_attribs),
    ("categorical", cat_pipeline, cat_attribs),
])

In [None]:
# Fit the preprocessing on the training features only, and transform them.
train_data_x_prep = full_pipeline.fit_transform(train_data_x)   #fit_transform for training data

In [None]:
# Apply the already-fitted preprocessing to the test features (no re-fitting).
test_data_prep = full_pipeline.transform(test_data)   #transform only for test data !

In [None]:
# Shape of the preprocessed training matrix.
train_data_x_prep.shape

In [None]:
# Shape of the preprocessed test matrix.
test_data_prep.shape

In [None]:
# The label column used as y by every model below.
train_data.target

Data is ready

**Now, Lets get started with the model itself !**

In [None]:
from sklearn.model_selection import StratifiedKFold
# 4 shuffled, stratified folds with a fixed seed; passed as cv= to the
# cross_val_score calls below so the models are scored on the same splits.
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' weights for each distinct label value, stored as {class: weight}.
classes = np.unique(train_data.target)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_data.target)
class_weights1 = {label: weight for label, weight in zip(classes, weights)}
# This dict is what gets passed to CatBoost; for LGBM, for example, we can
# simply pass class_weight='balanced'.

**The idea behind StratifiedKFold is simply to ensure that the distribution of classes (their ratios) in each fold is the same as in the entire data set**  
**So basically, if the ratio of class A to class B is 0.4 in the data set, StratifiedKFold will ensure that the ratio between them is 0.4 in each of the folds**

# Baseline Model

In [None]:
# Show the illustration for the baseline section.
display.Image("../input/baseline/baseline.png")

**We just want to create an initial point of reference, to be sure that the problem is solvable and that we are making progress in the next steps**

## Logistic Regression

Let's start by creating a simple logistic regression model.

In [None]:
# Baseline: class-weighted logistic regression, scored by ROC AUC on the skf folds.
lr = LogisticRegression(
    class_weight='balanced',
    max_iter=700,
    n_jobs=-1,
)
cv = cross_val_score(lr, train_data_x_prep, train_data['target'], scoring='roc_auc', cv=skf)
# per-fold scores and their mean
cv, cv.mean()

Good start out of the box, lets see if we can do any better !

# Upscaling: Develop a model that overfits !

In [None]:
# Show the illustration for the overfitting section.
display.Image("../input/overfit/overfit.png")

In [None]:
# Fit an unconstrained decision tree and score it on the very rows it was
# trained on, to demonstrate overfitting.
# random_state pins the tree's tie-breaking so reruns give the same result.
dt = DecisionTreeClassifier(random_state=1)
dt.fit(train_data_x_prep,train_data.target)
# (the original also called dt.predict() here and discarded the result;
# score() already predicts internally, so that extra pass was dropped)
dt.score(train_data_x_prep,train_data.target)

100% score!! That's great! ... right? Actually, no. sklearn's DecisionTreeClassifier has "max_depth=None" by default, which means that it will gladly keep splitting and adding leaves until it totally overfits the data! Want proof? Let's test the model using the validation error (train the model on part of the data and then test it on another part that it didn't see before).

In [None]:
# Same unconstrained tree, now scored on held-out folds. No scoring= is given,
# so the estimator's default score is used, which keeps the number comparable
# to dt.score() on the training data.
# random_state makes the cross-validated scores reproducible across reruns.
dt = DecisionTreeClassifier(random_state=1)
cv = cross_val_score(dt ,train_data_x_prep,train_data.target,cv=skf)
cv, cv.mean()

This huge drop in performance is due to the model overfitting the folds it is trained on each time, and therefore performing poorly on the held-out fold.

**We can do better !**

# Developing models that perform better than the baseline

## Random Forest

Random Forest is basically a bootstrap aggregation (bagging) of different DTs: essentially, we get a bunch of DTs that are trained on different samples of the data (drawn with replacement) and even on different subsets of the features. This form of ensembling really helps the model generalize (it acts as regularization). Let's see that in action and see how RF outperforms a single DT on the validation error.

In [None]:
# Class-weighted random forest with 300 trees, scored by ROC AUC on the skf folds.
# random_state fixes the bootstrap / feature sampling so the scores are
# reproducible across reruns (the folds themselves are already seeded).
rf = RandomForestClassifier(n_estimators= 300,n_jobs=-1 ,class_weight = 'balanced', random_state=1)
cv = cross_val_score(rf ,train_data_x_prep,train_data.target,cv=skf,scoring='roc_auc')
cv, cv.mean()

Way better !

In [None]:
# Show the LightGBM illustration with the requested width/height.
display.Image("../input/lightgbm/lightgbm.png",width=300,height=300)

Now lets try Microsoft's LightGBM and see how it compares

In [None]:
# LightGBM with hand-picked settings, scored by ROC AUC on the skf folds.
lbg = LGBMClassifier(
    n_estimators=110,
    num_leaves=300,
)
cv = cross_val_score(lbg, train_data_x_prep, train_data['target'], scoring='roc_auc', cv=skf)
# per-fold scores and their mean
cv, cv.mean()

That's good, Lets see if we can do any better with hyperparameter tuning using a grid search !

In [None]:
def clf_performance(classifier, model_name):
    """Print a short report for a fitted hyper-parameter search.

    classifier: object exposing ``best_score_`` and ``best_params_``.
    model_name: heading printed on the first line of the report.
    Returns None; the report is written to standard output.
    """
    print(model_name)
    report_lines = (
        ('Best Score: ', 'best_score_'),
        ('Best Parameters: ', 'best_params_'),
    )
    # attributes are looked up one at a time, right before they are printed
    for label, attribute in report_lines:
        print(label, getattr(classifier, attribute))
    

In [None]:
# Candidate values for the LightGBM grid search (4 x 3 = 12 combinations).
n_estimators = [90, 100, 110, 120]
num_leaves = [250, 300, 350]
param_grid = dict(n_estimators=n_estimators, num_leaves=num_leaves)

The grid search will try all the possible combinations and come back with the best performer!

In [None]:
# Exhaustive search over param_grid for a class-weighted LightGBM model.
lbg = LGBMClassifier(class_weight = 'balanced')
# cv=skf: tune on the same shuffled, seeded, stratified folds that every other
# model in this notebook is scored on (cv=None would silently fall back to a
# different, unshuffled default split, making the scores not comparable).
clf_lbg = GridSearchCV(lbg, param_grid = param_grid,  verbose = 0,scoring ='roc_auc', n_jobs = -1,cv=skf)
# fit() returns the search object itself, so best_clf_lbg is the fitted
# GridSearchCV; the winning model is available as best_clf_lbg.best_estimator_.
best_clf_lbg = clf_lbg.fit(train_data_x_prep,train_data.target) 
clf_performance(best_clf_lbg,'LGBM')

So the best combination of the parameters within this search is n_estimators= 90 & num_leaves = 250

In [None]:
# Cross-validate the tuned model itself. best_clf_lbg is the fitted
# GridSearchCV object, so passing it directly would clone the search and
# re-run the entire grid inside every fold; best_estimator_ is the single
# LGBMClassifier configured with the winning parameters.
cv = cross_val_score(best_clf_lbg.best_estimator_ ,train_data_x_prep,train_data.target,cv=skf,scoring='roc_auc')
cv, cv.mean() 

That's good from LGBM , lets try another competitor !

In [None]:
# Show the CatBoost illustration with the requested width/height.
display.Image("../input/catboost/catboost.png",width=300,height=300)

In [None]:
# CatBoost with the precomputed 'balanced' class weights, scored by ROC AUC on
# the skf folds.
# NOTE(review): 10000 iterations are trained once per fold (4 folds), so this
# cell is presumably the slowest in the notebook - consider timing it.
cat = CatBoostClassifier(verbose=False,class_weights = class_weights1,iterations = 10000)
cv = cross_val_score(cat,train_data_x_prep,train_data.target,cv=skf,scoring='roc_auc')
# per-fold scores and their mean
cv, cv.mean()

Wow, just by increasing the number of iterations, we get amazing results !  Looks like we have a winner !!

**note: in a case like this I would use something like a voting classifier, but since the required output of this problem is probability rather than actual results ... lets stop here and go evaluate our output**

In [None]:
# cross_val_score only fits clones, so train the final model on all training rows.
cat.fit(train_data_x_prep,train_data.target)

In [None]:
# Class probabilities for the preprocessed test rows (one column per class).
prob_pred = cat.predict_proba(test_data_prep)

In [None]:
# Keep only the second column.
# NOTE(review): presumably the probability of the positive class (target == 1) - confirm against cat.classes_.
prob_pred=prob_pred[:,1]

In [None]:
# The competition expects probabilities, so the submission pairs each test id
# (taken from the untouched `test` copy) with its predicted probability.
output = test[['id']].assign(target=prob_pred)
output.to_csv('Osama_new.csv', index=False)
output

In [None]:
# (rows, columns) of the submission frame.
output.shape

Lastly, let’s check our model's detailed performance (TP, FP, TN, FN), from which we can derive many important metrics such as accuracy, recall and precision.

In [None]:
# Confusion matrix of the final CatBoost model.
# NOTE(review): it is computed on the same training rows the model was fitted
# on, so it shows the fit to the training data, not held-out performance.
cm = confusion_matrix(train_data.target,cat.predict(train_data_x_prep)) 
sns.set(font_scale = 1)
# Transform to df for easier plotting
cm_df = pd.DataFrame(cm)

plt.figure(figsize=(6,4.5))
# fmt='g' prints the counts without scientific notation
sns.heatmap(cm_df, annot=True, fmt='g')
# rows are the true labels, columns the predicted labels
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

**Lets see how we did !**

In [None]:
# Show the image with the resulting score.
display.Image("../input/score/score.jpg")

Not bad !

**I really hope you have enjoyed this notebook, feel free to leave any comments**

In [None]:
# Show the closing illustration.
display.Image("../input/thanks/thanks.jpg")