<p style="background-color:#BCDBF9;color:black;font-size:25px;text-align:center;border-radius:10px 10px;font-weight:bold;">Tabular Playground Series - Nov 2021 ⚡</p>

<center><img src="https://media.giphy.com/media/xT9C25UNTwfZuk85WP/giphy-downsized-large.gif"></center>

<a id="section-one"></a>

##  <span style='font-size:22px;'>&#128311;</span>  Introduction

<font size="4">This dataset is synthetic, but it is based on a real dataset and was generated using a CTGAN. The original dataset deals with identifying spam emails via various features extracted from the email. Although the features are anonymized, they have properties relating to real-world features. The dataset has 100 features, and the response variable is binary.</font>


In [None]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import os
import warnings

# Print the full path of every file available under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# Silence all warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

In [None]:
# Read the competition training and test tables.
train = pd.read_csv('/kaggle/input/tabular-playground-series-nov-2021/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-nov-2021/test.csv')

# Report (rows, columns) for each table, train first.
for frame in (train, test):
    print(frame.shape)

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Show the dtype of every column in the training table.
train.dtypes

In [None]:
# Drop the row identifier; it is not a feature.
# Use the `columns=` keyword: passing the axis positionally (`drop([...], 1)`)
# is deprecated and rejected by pandas 2.x.
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

In [None]:
# Class balance of the binary target: printed counts plus a bar chart.
from collections import Counter
print(sorted(Counter(train['target']).items()))
# Pass the column by keyword; newer seaborn releases do not accept the
# data vector as a positional argument.
sns.countplot(x=train['target'], palette='OrRd')

In [None]:
# Pairwise correlation of all columns (features and target), drawn as a heatmap.
corr_matrix = train.corr()
sns.heatmap(corr_matrix)

#### The variables are not correlated; all the correlation values are very low.

In [None]:
# Boolean mask of missing cells, drawn as a heatmap (row labels and colour bar hidden).
missing_mask = train.isna()
sns.heatmap(missing_mask, yticklabels=False, cbar=False)

####  No missing values.

#### distributions before transformation

In [None]:
# 10 x 10 grid of per-feature density plots (f0..f99), split by target class.
fig, axes = plt.subplots(10, 10, figsize=(18, 12))
axes = axes.ravel()

for idx, ax in enumerate(axes):
    feature = f'f{idx}'

    # Only the first panel carries the legend, to keep the grid uncluttered.
    sns.kdeplot(
        data=train,
        x=feature,
        hue='target',
        fill=True,
        palette=['blue', 'red'],
        legend=(idx == 0),
        ax=ax,
    )

    # Strip ticks and axis labels; each panel is identified by its title.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.spines['left'].set_visible(False)
    ax.set_title(feature, loc='right', fontsize=10)

fig.supxlabel('Probability distribution', ha='center')
fig.tight_layout()
plt.show()

#### Before transforming data I will remove the target variable.

In [None]:
# Separate the predictors from the response.
# `columns=` replaces the deprecated positional axis argument (`drop([...], 1)`).
X = train.drop(columns=['target'])
y = train['target']
# Alias for the competition test features (same object, not a copy).
test_x = test

In [None]:
# Hold out 20% of the labelled rows for evaluation (fixed seed for repeatability).
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1234,
)

<a id="section-two"></a>

##  <span style='font-size:22px;'>&#127917;</span>  Data transformation

#### Quantile transforms are a technique for transforming numerical input or output variables to have a Gaussian or uniform probability distribution. Here I am using the normal (Gaussian) quantile transformation.

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Map every feature to an approximately normal distribution.
# random_state is fixed because the transformer subsamples rows when it
# estimates the quantiles, so an unseeded fit is not reproducible.
trans = QuantileTransformer(n_quantiles=100, output_distribution='normal', random_state=1234)

# Fit on the training split only, then apply the same mapping everywhere,
# so no information from the hold-out or competition test rows leaks in.
trans.fit(x_train)
x_train_transformed = trans.transform(x_train)
x_test_transformed = trans.transform(x_test)
test_transformedf = trans.transform(test_x)

In [None]:
# Wrap the transformed arrays back into DataFrames.
# Keep the original row index as well as the column names: the targets
# (y_train / y_test) still carry the original labels, and any index-aligned
# operation against a fresh RangeIndex would pair rows with the wrong labels.
x_train_final = pd.DataFrame(x_train_transformed, columns=x_train.columns, index=x_train.index)
x_test_final = pd.DataFrame(x_test_transformed, columns=x_test.columns, index=x_test.index)
test_final = pd.DataFrame(test_transformedf, columns=test.columns, index=test.index)

In [None]:
# Attach the labels by position, not by index: assigning the Series directly
# aligns on index labels, which mismatches rows (and introduces NaN) whenever
# x_train_final does not share y_train's index.
x_train_final['target'] = y_train.to_numpy()

#### distributions after transformations.

In [None]:
# Same 10 x 10 density grid as before, now on the quantile-transformed training split.
fig, axes = plt.subplots(10, 10, figsize=(18, 12))
axes = axes.ravel()

for idx, ax in enumerate(axes):
    feature = f'f{idx}'

    # Only the first panel carries the legend.
    sns.kdeplot(
        data=x_train_final,
        x=feature,
        hue='target',
        fill=True,
        palette=['blue', 'red'],
        legend=(idx == 0),
        ax=ax,
    )

    # Strip ticks and axis labels; each panel is identified by its title.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.spines['left'].set_visible(False)
    ax.set_title(feature, loc='right', fontsize=11)

fig.supxlabel('Probability distribution', ha='center')
fig.tight_layout()
plt.show()

In [None]:
# Preview the transformed training frame (features plus the attached target column).
x_train_final.head()

In [None]:
# Transformed training features without the target column that was added for plotting.
# `columns=` replaces the deprecated positional axis argument (`drop([...], 1)`).
x = x_train_final.drop(columns=['target'])
# From here on `y` refers to the training-split labels only.
y = y_train

#### Fitting every model on the full training set would be very time-consuming because the training set is large, so I will randomly sample a subset of it and compare accuracies on that. This should not reduce the accuracies much.

In [None]:
# Shapes of the transformed training features and of the (untransformed) hold-out features.
print(x.shape)
print(x_test.shape)

In [None]:
# Subsample for speed: test_size=0.80 leaves only 20% of the transformed training
# rows in x_train1 / y_train1, which is what the models below are fitted on.
# The 80% remainder (x_test1 / y_test1) is not referenced in the visible cells.
x_train1,x_test1,y_train1,y_test1=train_test_split(x,y,test_size=0.80,random_state=1234)

In [None]:
# Size of the subsample used for model fitting.
print(x_train1.shape)

<a id="section-three"></a>

##  <span style='font-size:22px;'>&#8987;</span> Model fitting

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

#libraries for model evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# ridge classifier: fit on the training subsample, evaluate on the transformed hold-out set
from sklearn.linear_model import RidgeClassifier
rc = RidgeClassifier()
model0 = rc.fit(x_train1, y_train1)
print("train accuracy:",model0.score(x_train1, y_train1),"\n","test accuracy:",model0.score(x_test_final,y_test))
rcpred = rc.predict(x_test_final)
print("\n")
print("classification report for ridge classifier")
# classification_report expects (y_true, y_pred); the reversed order swaps precision and recall.
print(classification_report(y_test, rcpred))
print("\n")
print("confusion matrix for ridge classifier")
displr = plot_confusion_matrix(rc, x_test_final, y_test, cmap=plt.cm.OrRd, values_format='d')

In [None]:
# logistic regression: fit on the training subsample, evaluate on the transformed hold-out set
lr = LogisticRegression(max_iter=20000, penalty='l2')
model1 = lr.fit(x_train1, y_train1)
print("train accuracy:",model1.score(x_train1, y_train1),"\n","test accuracy:",model1.score(x_test_final,y_test))
lrpred = lr.predict(x_test_final)
print("\n")
print("classification report for logistic regression")
# classification_report expects (y_true, y_pred); the reversed order swaps precision and recall.
print(classification_report(y_test, lrpred))
print("\n")
print("confusion matrix for logistic regression")
displr = plot_confusion_matrix(lr, x_test_final, y_test, cmap=plt.cm.OrRd, values_format='d')

In [None]:
# linear discriminant analysis: fit on the training subsample, evaluate on the transformed hold-out set
lda = LinearDiscriminantAnalysis()
model2 = lda.fit(x_train1, y_train1)
print("train accuracy:",model2.score(x_train1, y_train1),"\n","test accuracy:",model2.score(x_test_final,y_test))

ldapred = lda.predict(x_test_final)
print("\n")
print("classification report for linear discriminant analysis")
# classification_report expects (y_true, y_pred); the reversed order swaps precision and recall.
print(classification_report(y_test, ldapred))
print("\n")
print("confusion matrix for linear discriminant analysis")
displr = plot_confusion_matrix(lda, x_test_final, y_test, cmap=plt.cm.OrRd, values_format='d')

In [None]:
# decision tree classifier: fit on the training subsample, evaluate on the transformed hold-out set
# random_state is fixed so that re-running the cell reproduces the same tree.
dt = DecisionTreeClassifier(random_state=1234)
model3 = dt.fit(x_train1, y_train1)
print("train accuracy:",model3.score(x_train1, y_train1),"\n","test accuracy:",model3.score(x_test_final,y_test))

dtpred = dt.predict(x_test_final)
print("\n")
print("classification report for decision tree classifier")
# classification_report expects (y_true, y_pred); the reversed order swaps precision and recall.
print(classification_report(y_test, dtpred))
print("\n")
print("confusion matrix for decision tree classifier")
displr = plot_confusion_matrix(dt, x_test_final, y_test, cmap=plt.cm.OrRd, values_format='d')

In [None]:
# random forest classifier: fit on the training subsample, evaluate on the transformed hold-out set
# random_state is fixed so that re-running the cell reproduces the same forest.
rf = RandomForestClassifier(random_state=1234)
model4 = rf.fit(x_train1, y_train1)
print("train accuracy:",model4.score(x_train1, y_train1),"\n","test accuracy:",model4.score(x_test_final,y_test))

rfpred = rf.predict(x_test_final)
print("\n")
print("classification report for random forest classifier")
# classification_report expects (y_true, y_pred); the reversed order swaps precision and recall.
print(classification_report(y_test, rfpred))
print("\n")
print("confusion matrix for random forest classifier")
displr = plot_confusion_matrix(rf, x_test_final, y_test, cmap=plt.cm.OrRd, values_format='d')

In [None]:
# bagging classifier: fit on the training subsample, evaluate on the transformed hold-out set
# random_state is fixed so that the bootstrap samples are reproducible.
bg = BaggingClassifier(random_state=1234)
model5 = bg.fit(x_train1, y_train1)
print("train accuracy:",model5.score(x_train1, y_train1),"\n","test accuracy:",model5.score(x_test_final,y_test))

bgpred = bg.predict(x_test_final)
print("\n")
print("classification report for bagging classifier")
# classification_report expects (y_true, y_pred); the reversed order swaps precision and recall.
print(classification_report(y_test, bgpred))
print("\n")
print("confusion matrix for bagging classifier")
displr = plot_confusion_matrix(bg, x_test_final, y_test, cmap=plt.cm.OrRd, values_format='d')

<center><img src="https://media.giphy.com/media/3jmqIaBE8x86xwbSbD/giphy.gif"></center>

In [None]:
# gradient boost classifier: fit on the training subsample, evaluate on the transformed hold-out set
# random_state is fixed so that re-running the cell reproduces the same ensemble.
gbm = GradientBoostingClassifier(random_state=1234)
model6 = gbm.fit(x_train1, y_train1)
print("train accuracy:",model6.score(x_train1, y_train1),"\n","test accuracy:",model6.score(x_test_final,y_test))

gbmpred = gbm.predict(x_test_final)
print("\n")
print("classification report for gradient boosting classifier")
# classification_report expects (y_true, y_pred); the reversed order swaps precision and recall.
print(classification_report(y_test, gbmpred))
print("\n")
print("confusion matrix for gradient boosting classifier")
displr = plot_confusion_matrix(gbm, x_test_final, y_test, cmap=plt.cm.OrRd, values_format='d')

In [None]:
# adaboost classifier: fit on the training subsample, evaluate on the transformed hold-out set
# random_state is fixed so that re-running the cell reproduces the same ensemble.
ada = AdaBoostClassifier(random_state=1234)
model7 = ada.fit(x_train1, y_train1)
print("train accuracy:",model7.score(x_train1, y_train1),"\n","test accuracy:",model7.score(x_test_final,y_test))

adapred = ada.predict(x_test_final)
print("\n")
print("classification report for adaboost classifier")
# classification_report expects (y_true, y_pred); the reversed order swaps precision and recall.
print(classification_report(y_test, adapred))
print("\n")
print("confusion matrix for adaboost classifier")
displr = plot_confusion_matrix(ada, x_test_final, y_test, cmap=plt.cm.OrRd, values_format='d')

In [None]:
# extreme gradient boost classifier: fit on the training subsample, evaluate on the transformed hold-out set
xgb = XGBClassifier()
model8 = xgb.fit(x_train1, y_train1)
print("train accuracy:",model8.score(x_train1, y_train1),"\n","test accuracy:",model8.score(x_test_final,y_test))

xgbpred = xgb.predict(x_test_final)
print("\n")
print("classification report for extreme gradient boosting classifier")
# classification_report expects (y_true, y_pred); the reversed order swaps precision and recall.
print(classification_report(y_test, xgbpred))
print("\n")
print("confusion matrix for extreme gradient boosting classifier")
displr = plot_confusion_matrix(xgb, x_test_final, y_test, cmap=plt.cm.OrRd, values_format='d')

In [None]:
# extra tree classifier: fit on the training subsample, evaluate on the transformed hold-out set
# random_state is fixed so that the randomised splits are reproducible.
extree = ExtraTreesClassifier(random_state=1234)
model9 = extree.fit(x_train1, y_train1)
print("train accuracy:",model9.score(x_train1, y_train1),"\n","test accuracy:",model9.score(x_test_final,y_test))

extpred = extree.predict(x_test_final)
print("\n")
print("classification report for extra tree classifier")
# classification_report expects (y_true, y_pred); the reversed order swaps precision and recall.
print(classification_report(y_test, extpred))
print("\n")
print("confusion matrix for extra tree classifier")
displr = plot_confusion_matrix(extree, x_test_final, y_test, cmap=plt.cm.OrRd, values_format='d')

In [None]:
# voting classifier: soft vote over LDA and logistic regression
from sklearn.ensemble import VotingClassifier
clf1 = LinearDiscriminantAnalysis()
clf2 = LogisticRegression(max_iter=20000, penalty='l2')

# Estimator names match what they actually are (the LDA member was previously named 'ext').
vc = VotingClassifier(estimators=[('lda', clf1), ('lr', clf2)], voting='soft')
model10 = vc.fit(x_train1, y_train1)
print("train accuracy:",model10.score(x_train1, y_train1),"\n","test accuracy:",model10.score(x_test_final,y_test))

vcpred = vc.predict(x_test_final)
print("\n")
print("classification report for voting classifier")
# classification_report expects (y_true, y_pred); the reversed order swaps precision and recall.
print(classification_report(y_test, vcpred))
print("\n")
print("confusion matrix for voting classifier")
displr = plot_confusion_matrix(vc, x_test_final, y_test, cmap=plt.cm.OrRd, values_format='d')

In [None]:
# stacking classifier: LDA and logistic regression as base learners, each behind a StandardScaler
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Estimator names match what they actually are (previously 'rf' and 'ext').
estimators = [
    ('lda', make_pipeline(StandardScaler(), LinearDiscriminantAnalysis())),
    ('lr', make_pipeline(StandardScaler(), LogisticRegression(max_iter=20000, penalty='l2'))),
]
sc = StackingClassifier(estimators=estimators)

model11 = sc.fit(x_train1, y_train1)
print("train accuracy:",model11.score(x_train1, y_train1),"\n","test accuracy:",model11.score(x_test_final,y_test))

scpred = sc.predict(x_test_final)
print("\n")
print("classification report for stacking classifier")
# classification_report expects (y_true, y_pred); the reversed order swaps precision and recall.
print(classification_report(y_test, scpred))
print("\n")
print("confusion matrix for stacking classifier")
displr = plot_confusion_matrix(sc, x_test_final, y_test, cmap=plt.cm.OrRd, values_format='d')

In [None]:
# catboost classifier: fit on the training subsample, evaluate on the transformed hold-out set
from catboost import CatBoostClassifier

cc = CatBoostClassifier(silent=True)
model12 = cc.fit(x_train1, y_train1)
print("train accuracy:",model12.score(x_train1, y_train1),"\n","test accuracy:",model12.score(x_test_final,y_test))

ccpred = cc.predict(x_test_final)
print("\n")
print("classification report for catboost classifier")
# classification_report expects (y_true, y_pred); the reversed order swaps precision and recall.
print(classification_report(y_test, ccpred))
print("\n")
print("confusion matrix for catboost classifier")
displr = plot_confusion_matrix(cc, x_test_final, y_test, cmap=plt.cm.OrRd, values_format='d')

#### Since the CatBoost classifier has the highest accuracy, let's check it with the full training set.

In [None]:
# catboost classifier refit on the whole transformed training split (not the subsample)
ccf = CatBoostClassifier(silent=True)
model13 = ccf.fit(x, y_train)
print("train accuracy:",model13.score(x, y_train),"\n","test accuracy:",model13.score(x_test_final,y_test))

ccfpred = ccf.predict(x_test_final)
print("\n")
print("classification report for catboost classifier")
# classification_report expects (y_true, y_pred); the reversed order swaps precision and recall.
print(classification_report(y_test, ccfpred))
print("\n")
print("confusion matrix for catboost classifier")
displr = plot_confusion_matrix(ccf, x_test_final, y_test, cmap=plt.cm.OrRd, values_format='d')

<a id="section-four"></a>

##  <span style='font-size:22px;'>&#128202;</span>  Feature Importance plot

#### Feature importance plot from the catboost classifier model.

In [None]:
def plot_feature_importance(importance, names, model_type, top_n=None, figsize=(18, 60)):
    """Draw a horizontal bar chart of feature importances, largest first.

    Parameters
    ----------
    importance : array-like
        One importance score per feature.
    names : array-like
        Feature names, in the same order as ``importance``.
    model_type : str
        Label prepended to the chart title.
    top_n : int, optional
        If given, plot only the ``top_n`` most important features.
        Defaults to ``None``, which plots every feature.
    figsize : tuple of float, optional
        Figure size in inches. Defaults to ``(18, 60)``.

    Returns
    -------
    None
        The chart is drawn on a newly created figure.
    """
    # Pair each feature name with its importance score.
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    })

    # Sort into a new frame (no in-place mutation), most important first.
    fi_sorted = fi_df.sort_values(by=['feature_importance'], ascending=False)
    if top_n is not None:
        fi_sorted = fi_sorted.head(top_n)

    # Draw on an explicit Axes instead of relying on pyplot's current-figure state.
    fig, ax = plt.subplots(figsize=figsize)
    sns.barplot(x=fi_sorted['feature_importance'], y=fi_sorted['feature_names'], ax=ax)

    # Chart labels.
    ax.set_title(model_type + ' FEATURE IMPORTANCE')
    ax.set_xlabel('FEATURE IMPORTANCE', fontsize=22)
    ax.set_ylabel('FEATURE NAMES', fontsize=22)
    
# Plot the importances of the CatBoost model fitted on the whole transformed training split.
plot_feature_importance(ccf.get_feature_importance(),x.columns,'CATBOOST')

#### Now let's fit the model with only important variables.

In [None]:
# Hard-coded subset of 66 feature names kept for the reduced models below.
# NOTE(review): presumably read off the CatBoost importance plot; the list is not
# derived programmatically, so it will not follow a re-fit of the model.
cols=['f34','f55','f8','f43','f91','f71','f80','f27','f50','f97','f41','f57','f22','f25','f66','f96','f81','f82','f21','f24','f26','f54',
     'f60','f95','f20','f40','f98','f9','f33','f53','f94','f31','f17','f51','f3','f16','f62','f44','f30','f42','f5','f49','f48','f64',
     'f2','f56','f32','f93','f68','f4','f87','f10','f61','f1','f47','f75','f23','f84','f58','f70','f76','f83','f36','f99','f78','f88']

In [None]:
# Keep only the selected features of the transformed training split.
df = x.filter(cols)
# Apply the same selection to the transformed competition test set.
# This uses the DataFrame `test_final`: `test_transformedf` is a NumPy array
# and has no `.filter` method.
df_test = test_final.filter(cols)

In [None]:
# Preview the reduced feature frame.
df.head()

In [None]:
# (rows, selected features) of the reduced frame.
df.shape

In [None]:
# Split the reduced-feature training rows 80/20 for the models below.
# NOTE(review): `df` comes from the training split only, so these scores are measured on a
# different hold-out than the earlier models (which used x_test_final / y_test). If `cols`
# was chosen from the model fitted on all of `x`, this hold-out also informed the feature
# choice and the scores may be optimistic — confirm.
x_train_imp,x_test_imp,y_train_imp,y_test_imp=train_test_split(df,y,test_size=0.20,random_state=1234)

In [None]:
# catboost classifier on the reduced feature set
ccimp = CatBoostClassifier(silent=True)
model14 = ccimp.fit(x_train_imp, y_train_imp)
print("train accuracy:",model14.score(x_train_imp, y_train_imp),"\n","test accuracy:",model14.score(x_test_imp,y_test_imp))

ccimppred = ccimp.predict(x_test_imp)
print("\n")
print("classification report for catboost classifier")
# classification_report expects (y_true, y_pred); the reversed order swaps precision and recall.
print(classification_report(y_test_imp, ccimppred))
print("\n")
print("confusion matrix for catboost classifier")
displr = plot_confusion_matrix(ccimp, x_test_imp, y_test_imp, cmap=plt.cm.OrRd, values_format='d')

In [None]:
# ridge classifier on the reduced feature set
from sklearn.linear_model import RidgeClassifier
rcimp = RidgeClassifier()
model15 = rcimp.fit(x_train_imp, y_train_imp)
print("train accuracy:",model15.score(x_train_imp, y_train_imp),"\n","test accuracy:",model15.score(x_test_imp,y_test_imp))
rcimppred = rcimp.predict(x_test_imp)
print("\n")
print("classification report for ridge classifier")
# classification_report expects (y_true, y_pred); the reversed order swaps precision and recall.
print(classification_report(y_test_imp, rcimppred))
print("\n")
print("confusion matrix for ridge classifier")
displr = plot_confusion_matrix(rcimp, x_test_imp, y_test_imp, cmap=plt.cm.OrRd, values_format='d')

In [None]:
# stacking classifier on the reduced feature set: LDA and logistic regression base learners
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Estimator names match what they actually are (previously 'rf' and 'ext').
estimators = [
    ('lda', make_pipeline(StandardScaler(), LinearDiscriminantAnalysis())),
    ('lr', make_pipeline(StandardScaler(), LogisticRegression(max_iter=20000, penalty='l2'))),
]
scimp = StackingClassifier(estimators=estimators)

model16 = scimp.fit(x_train_imp, y_train_imp)
print("train accuracy:",model16.score(x_train_imp, y_train_imp),"\n","test accuracy:",model16.score(x_test_imp,y_test_imp))

scimppred = scimp.predict(x_test_imp)
print("\n")
print("classification report for stacking classifier")
# classification_report expects (y_true, y_pred); the reversed order swaps precision and recall.
print(classification_report(y_test_imp, scimppred))
print("\n")
print("confusion matrix for stacking classifier")
displr = plot_confusion_matrix(scimp, x_test_imp, y_test_imp, cmap=plt.cm.OrRd, values_format='d')

In [None]:
# voting classifier on the reduced feature set: soft vote over LDA and logistic regression
from sklearn.ensemble import VotingClassifier
clf1 = LinearDiscriminantAnalysis()
clf2 = LogisticRegression(max_iter=20000, penalty='l2')

vcimp = VotingClassifier(estimators=[('lda', clf1), ('lr', clf2)], voting='soft')
model17 = vcimp.fit(x_train_imp, y_train_imp)
print("train accuracy:",model17.score(x_train_imp, y_train_imp),"\n","test accuracy:",model17.score(x_test_imp,y_test_imp))

vcimppred = vcimp.predict(x_test_imp)
print("\n")
print("classification report for voting classifier")
# classification_report expects (y_true, y_pred); the reversed order swaps precision and recall.
print(classification_report(y_test_imp, vcimppred))
print("\n")
print("confusion matrix for voting classifier")
displr = plot_confusion_matrix(vcimp, x_test_imp, y_test_imp, cmap=plt.cm.OrRd, values_format='d')

##  <span style='font-size:22px;'>&#128142;</span>  Results

<font size="4"> Feature selection helped to reduce overfitting, but the expected performance could not be reached. The highest accuracy was obtained by <span style="color:red;">linear models and the CatBoost classifier</span>. The highest accuracy achieved is <span style="color:red;">71.65%</span></font>


<center><img src="https://media.giphy.com/media/j1Xyt3DHfJcmk/giphy.gif"></center>

<font size="4"><b> Any suggestions, questions or feedback regarding this notebook are highly appreciated. THANK YOU <span style='font-size:22px;'>&#128522;</span> </b></font>

<font size="5"> </font>