# Getting Started with Tabular Playground Series - Apr 2021

# <img src="https://thumbor.forbes.com/thumbor/960x0/https%3A%2F%2Fspecials-images.forbesimg.com%2Fdam%2Fimageserve%2F877330410%2F960x0.jpg%3Ffit%3Dscale">

1. # Importing Python Libraries 📕 📗 📘 📙

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, classification_report
from mlxtend.plotting import plot_confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

<div class="alert alert-block alert-danger">  
    <h1><strong>Loading training data</strong></h1>
    <i></i>
</div>

In [None]:
# Read the competition training CSV from the Kaggle input directory.
train_data = pd.read_csv("../input/tabular-playground-series-apr-2021/train.csv")

# Exploratory data analysis of train data

# Five top records of data

In [None]:
# Display the first five rows of the training frame.
train_data.head()

# Five last records of data

In [None]:
# Display the last five rows of the training frame.
train_data.tail()

# Columns/features in data

In [None]:
# Display the column labels of the training frame.
train_data.columns

# Length of data

In [None]:
# Print the number of rows in the training frame.
print('lenght of data is', len(train_data))

# Shape of data

In [None]:
# Display (rows, columns) of the training frame.
train_data.shape

# Data information

In [None]:
# Print the per-column summary (non-null counts and dtypes) of the training frame.
train_data.info()

# Data types of all columns

In [None]:
# Display the dtype of every training column.
train_data.dtypes

# Checking missing Values

In [None]:
# Boolean mask of rows holding at least one null, then preview the first matches.
train_rows_with_nan = train_data.isnull().any(axis=1)
train_data[train_rows_with_nan].head()

# Count of missing values

In [None]:
# Number of training rows that contain at least one null value.
train_data.isnull().any(axis=1).sum()

# Are there any missing values?

In [None]:
# True when any cell of the training frame is null.
train_data.isnull().to_numpy().any()

# Counts of missing values in each column

In [None]:
# Null count for every training column (isna is the alias of isnull).
train_data.isna().sum()

# Looking at the train data missing values.

In [None]:
# Collect (and print) every training column that has at least one null,
# together with its null count.
NANColumns = []
for column_name, null_count in train_data.isnull().sum().items():
    if null_count == 0:
        continue
    print(column_name, null_count)
    NANColumns.append(column_name)

# Frequency Distribution of pclass

In [None]:
# Count passengers per ticket class and draw the counts as a bar chart.
carrier_count = train_data["Pclass"].value_counts()
sns.set(style="darkgrid")
# x/y are passed by keyword: seaborn 0.12+ no longer accepts the data
# vectors positionally (older versions only emit a FutureWarning).
sns.barplot(x=carrier_count.index, y=carrier_count.values, alpha=0.9)
plt.title('Frequency Distribution of pclass')
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('pclass', fontsize=12)
plt.show()

In [None]:
# Pie chart of the (up to seven) most frequent Pclass values with percentage labels.
train_pclass_share = train_data["Pclass"].value_counts().head(7)
train_pclass_share.plot.pie(autopct='%1.1f%%', figsize=(8, 8)).legend()

# Frequency Distribution of survived

In [None]:
# Count passengers per Survived value and draw the counts as a bar chart.
carrier_count = train_data["Survived"].value_counts()
sns.set(style="darkgrid")
# x/y are passed by keyword: seaborn 0.12+ no longer accepts the data
# vectors positionally (older versions only emit a FutureWarning).
sns.barplot(x=carrier_count.index, y=carrier_count.values, alpha=0.9)
plt.title('Frequency Distribution of survived    ')
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('survived    ', fontsize=12)
plt.show()

In [None]:
# Pie chart of the Survived value frequencies with percentage labels.
train_survived_share = train_data["Survived"].value_counts().head(7)
train_survived_share.plot.pie(autopct='%1.1f%%', figsize=(8, 8)).legend()

# Frequency Distribution of sex

In [None]:
# Pie chart of the Sex value frequencies with percentage labels.
train_sex_share = train_data["Sex"].value_counts().head(7)
train_sex_share.plot.pie(autopct='%1.1f%%', figsize=(8, 8)).legend()

# Frequency Distribution of top 10 age

In [None]:
# Pie chart of the ten most frequent Age values with percentage labels.
train_age_share = train_data["Age"].value_counts().head(10)
train_age_share.plot.pie(autopct='%1.1f%%', figsize=(8, 8)).legend()

# Frequency Distribution of embarked

In [None]:
# Pie chart of the Embarked value frequencies with percentage labels.
train_embarked_share = train_data["Embarked"].value_counts().head(7)
train_embarked_share.plot.pie(autopct='%1.1f%%', figsize=(8, 8)).legend()

# Distribution of all train data features

In [None]:
# One histogram per numeric training column, 20 bins each.
train_data.hist(figsize=(15,12),bins = 20, color="#107009AA")
# DataFrame.hist draws a grid of subplots; plt.title would only label the
# last axes (replacing that feature's own title), so title the figure instead.
plt.suptitle("Features Distribution")
plt.show()

<div class="alert alert-block alert-danger">  
    <h1><strong>Loading testing data</strong></h1>
    <i></i>
</div>

In [None]:
# Read the competition test CSV and keep the passenger ids as a NumPy array
# before the id column is dropped further down.
test_data = pd.read_csv("../input/tabular-playground-series-apr-2021/test.csv")
ids_test_data = test_data['PassengerId'].to_numpy()

# Exploratory data analysis of test data

# Five top records of data

In [None]:
# Display the first five rows of the test frame.
test_data.head()

# Five last records of data

In [None]:
# Display the last five rows of the test frame.
test_data.tail()

# Columns/features in data

In [None]:
# Display the column labels of the test frame.
test_data.columns

# Length of data

In [None]:
# Print the number of rows in the test frame.
print('lenght of data is', len(test_data))

# Shape of data

In [None]:
# Display (rows, columns) of the test frame.
test_data.shape

# Data information

In [None]:
# Print the per-column summary (non-null counts and dtypes) of the test frame.
test_data.info()

# Data types of all columns

In [None]:
# Display the dtype of every test column.
test_data.dtypes

# Checking missing Values

In [None]:
# Boolean mask of rows holding at least one null, then preview the first matches.
test_rows_with_nan = test_data.isnull().any(axis=1)
test_data[test_rows_with_nan].head()

# Count of missing values

In [None]:
# Number of test rows that contain at least one null value.
test_data.isnull().any(axis=1).sum()

# Are there any missing values?

In [None]:
# True when any cell of the test frame is null.
test_data.isnull().to_numpy().any()

# Counts of missing values in each column

In [None]:
# Null count for every test column (isna is the alias of isnull).
test_data.isna().sum()

# Looking at the test data missing values.

In [None]:
# Collect (and print) every test column that has at least one null,
# together with its null count.
NANColumns = []
for column_name, null_count in test_data.isnull().sum().items():
    if null_count == 0:
        continue
    print(column_name, null_count)
    NANColumns.append(column_name)

# Frequency Distribution of pclass

In [None]:
# Count test passengers per ticket class and draw the counts as a bar chart.
carrier_count = test_data["Pclass"].value_counts()
sns.set(style="darkgrid")
# x/y are passed by keyword: seaborn 0.12+ no longer accepts the data
# vectors positionally (older versions only emit a FutureWarning).
sns.barplot(x=carrier_count.index, y=carrier_count.values, alpha=0.9)
plt.title('Frequency Distribution of pclass')
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('pclass', fontsize=12)
plt.show()

In [None]:
# Pie chart of the (up to seven) most frequent Pclass values with percentage labels.
test_pclass_share = test_data["Pclass"].value_counts().head(7)
test_pclass_share.plot.pie(autopct='%1.1f%%', figsize=(8, 8)).legend()

# Frequency Distribution of sex

In [None]:
# Pie chart of the Sex value frequencies with percentage labels.
test_sex_share = test_data["Sex"].value_counts().head(7)
test_sex_share.plot.pie(autopct='%1.1f%%', figsize=(8, 8)).legend()

# Frequency Distribution of top 10 age

In [None]:
# Pie chart of the ten most frequent Age values with percentage labels.
test_age_share = test_data["Age"].value_counts().head(10)
test_age_share.plot.pie(autopct='%1.1f%%', figsize=(8, 8)).legend()

# Frequency Distribution of embarked

In [None]:
# Pie chart of the Embarked value frequencies with percentage labels.
test_embarked_share = test_data["Embarked"].value_counts().head(7)
test_embarked_share.plot.pie(autopct='%1.1f%%', figsize=(8, 8)).legend()

# Distribution of all test data features

In [None]:
# One histogram per numeric test column, 20 bins each.
test_data.hist(figsize=(15,12),bins = 20, color="#107009AA")
# DataFrame.hist draws a grid of subplots; plt.title would only label the
# last axes (replacing that feature's own title), so title the figure instead.
plt.suptitle("Features Distribution")
plt.show()

# Looking at correlated features with Survived 

In [None]:
# Annotated heatmap of the pairwise Pearson correlations of the training columns.
colormap = plt.cm.RdBu
plt.figure(figsize=(14,12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this working on pandas >= 2.0, where DataFrame.corr() no longer drops
# string columns silently and raises instead.
numeric_train = train_data.select_dtypes(include='number')
sns.heatmap(numeric_train.corr(),linewidths=0.1,vmax=1.0,
            square=True, cmap=colormap, linecolor='white', annot=True)

<div class="alert alert-block alert-danger">  
<h2><center><strong>As we can see from the graph, the features are well correlated with Pclass</strong></center></h2>
        
</div>

# Correlation Survived with Pclass

In [None]:
# Mean of Survived (the survival rate) for each Pclass value, highest first.
pclass_survival = train_data[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean()
pclass_survival.sort_values(by='Survived', ascending=False)

- We can see that the mean survival rate is above 0.5 for Pclass=1, so Pclass is related to survival and we are going to use this feature in training

# Correlation Survived with SEX

In [None]:
# Mean of Survived (the survival rate) for each Sex value, highest first.
sex_survival = train_data[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean()
sex_survival.sort_values(by='Survived', ascending=False)

- We can see that the mean survival rate is above 0.5 for Sex=female, so Sex is related to survival and we are going to use this feature in training

# Correlation Survived with SibSp

In [None]:
# Mean of Survived (the survival rate) for each SibSp value, highest first.
sibsp_survival = train_data[['SibSp', 'Survived']].groupby(['SibSp'], as_index=False).mean()
sibsp_survival.sort_values(by='Survived', ascending=False)

- We can see that SibSp=1 is highly associated with survival, while the other SibSp values have lower (down to zero) survival rates

# Correlation Survived with Parch

In [None]:
# Mean of Survived (the survival rate) for each Parch value, highest first.
parch_survival = train_data[['Parch', 'Survived']].groupby(['Parch'], as_index=False).mean()
parch_survival.sort_values(by='Survived', ascending=False)

- We can see that Parch=1 and Parch=2 are highly associated with survival, while the other Parch values have lower (down to zero) survival rates

# Age plot

In [None]:
# One Age histogram (20 bins) per Survived value, laid out side by side.
age_grid = sns.FacetGrid(data=train_data, col='Survived')
age_grid.map(plt.hist, 'Age', bins=20)

- As we can see, most of the older passengers did not survive

# Pclass plot

In [None]:
# Age histograms split by Pclass (rows) and Survived (columns).
# `height` replaces the former `size` keyword of FacetGrid, which was renamed
# in seaborn 0.9 and is rejected by current releases.
grid = sns.FacetGrid(train_data, col='Survived', row='Pclass', height=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', alpha=.5, bins=20)
grid.add_legend();

- Pclass=3 had most passengers, however most did not survive.
- Infant passengers in Pclass=2 and Pclass=3 mostly survived. 
- Most passengers in Pclass=1 survived. 
- Pclass varies in terms of Age distribution of passengers.

# Embarked plot

In [None]:
# Mean Fare per Sex, split by Embarked (rows) and Survived (columns).
# `height` replaces the former `size` keyword of FacetGrid, which was renamed
# in seaborn 0.9 and is rejected by current releases.
grid = sns.FacetGrid(train_data, row='Embarked', col='Survived', height=2.2, aspect=1.6)
# Fix the category order explicitly: when a categorical plot is mapped onto a
# FacetGrid without `order`, each facet orders the bars by the values it
# happens to contain, so the same position can mean different categories.
sex_order = sorted(train_data['Sex'].dropna().unique())
# NOTE(review): `ci=None` is kept for compatibility with older seaborn; newer
# releases spell this `errorbar=None` - confirm against the installed version.
grid.map(sns.barplot, 'Sex', 'Fare', alpha=.5, ci=None, order=sex_order)
grid.add_legend()

- Higher fare paying passengers had better survival.
- Port of embarkation correlates with survival rates. 

<div class="alert alert-block alert-info">  
<h2><center><strong>Features engineering and preparation</strong></center></h2>
        
</div>

## Extract the Survived out from the train data

In [None]:
# Keep the target column separately; its length is used later to split the
# combined frame back into train and test rows.
y = train_data["Survived"]

## Combining the train and test dataset

In [None]:
# Stack train rows on top of test rows and renumber the index from 0.
all_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

## Drop the Survived & PassengerId  columns

In [None]:
# Remove the target and the identifier so only feature columns remain.
all_data = all_data.drop(columns=["Survived", "PassengerId"])

## A function for checking the missing values

In [None]:
def missing_value(df):
    """Summarise the columns of *df* that contain missing values.

    Returns a DataFrame indexed by column name with two columns: "Total"
    (number of null cells) and "Percentage" (null cells as a percentage of
    the row count). Only columns with at least one null are listed, and
    both inputs are sorted in descending order before being combined.
    """
    null_counts = df.isnull().sum()

    total = null_counts.sort_values(ascending=False)
    total = total[total > 0]

    share = null_counts * 100 / df.shape[0]
    share = share[share > 0].sort_values(ascending=False)

    return pd.concat([total, share], keys=["Total", "Percentage"], axis=1)
# Show the missing-value summary of the combined train + test features.
missing_value(all_data)

## Imputing the Missing Values of all data

### int = numerical features 
### object = categorical features 

In [None]:
## Fill missing Cabin values with the mode (the most frequently occurring value),
## which is a simple choice for a categorical feature.
cabin_mode = all_data["Cabin"].mode()[0]
all_data["Cabin"] = all_data["Cabin"].fillna(cabin_mode)

In [None]:
## Fill missing Embarked values with the mode (the most frequently occurring value),
## which is a simple choice for a categorical feature.
embarked_mode = all_data["Embarked"].mode()[0]
all_data["Embarked"] = all_data["Embarked"].fillna(embarked_mode)

In [None]:
# Map Age into 5 ordinal groups, 0 to 4:
#   0: <= 16, 1: (16, 32], 2: (32, 48], 3: (48, 64], 4: > 64
#
# The previous `all_data['Age'] = all_data.loc[mask, 'Age'] = k` statements
# were chained assignments: Python assigns k to every target from left to
# right, so the whole column was overwritten with the scalar before the
# masked assignment ran, and Age ended up as the constant 4 for every row.
#
# Missing ages are filled with the median first; pd.cut would otherwise leave
# them as NaN, which cannot be cast to int.
age_filled = all_data['Age'].fillna(all_data['Age'].median())
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]
# Intervals are right-closed by default, matching the `<=` upper bounds above;
# labels=False returns the integer bin index.
all_data['Age'] = pd.cut(age_filled, bins=age_edges, labels=False).astype(int)

In [None]:
# Map Fare into 4 ordinal groups, 0 to 3:
#   0: <= 7.91, 1: (7.91, 14.454], 2: (14.454, 31], 3: > 31
#
# The previous `all_data['Fare'] = all_data.loc[mask, 'Fare'] = k` statements
# were chained assignments: Python assigns k to every target from left to
# right, so the whole column was overwritten with the scalar before the
# masked assignment ran, and Fare ended up as the constant 3 for every row.
#
# Missing fares are filled with the median first; pd.cut would otherwise leave
# them as NaN, which cannot be cast to int.
fare_filled = all_data['Fare'].fillna(all_data['Fare'].median())
fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
# Intervals are right-closed by default, matching the `<=` upper bounds above;
# labels=False returns the integer bin index.
all_data['Fare'] = pd.cut(fare_filled, bins=fare_edges, labels=False).astype(int)

In [None]:
# Re-run the missing-value summary on the combined frame after the imputation/binning cells.
missing_value(all_data)

## Converting the categorical/object features into numeric form by applying the LabelEncoder function

In [None]:
# Display the dtype of every combined-frame column before encoding.
all_data.dtypes

In [None]:
# Replace each object column by integer codes from its own, freshly fitted
# LabelEncoder. The flag marks the columns that are cast to str before
# encoding (Ticket and Embarked); the others are encoded as they are.
columns_to_encode = [
    ('Name', False),
    ('Sex', False),
    ('Ticket', True),
    ('Cabin', False),
    ('Embarked', True),
]
for column_name, cast_to_str in columns_to_encode:
    raw_values = all_data[column_name]
    if cast_to_str:
        raw_values = raw_values.astype(str)
    all_data[column_name] = preprocessing.LabelEncoder().fit_transform(raw_values)

## Now splitting the data for training and testing with same index ID's

In [None]:
# The first len(y) rows of the combined frame are the training rows (they were
# concatenated first); everything after them belongs to the test set.
n_train = len(y)
train_data = all_data.iloc[:n_train]
test_data = all_data.iloc[n_train:]

<div class="alert alert-block alert-info">  
<h2><center><strong> Building the models for training and testing</strong></center></h2>
        
</div>

<div class="alert alert-block alert-danger">  
<h2><center><strong> Applying Cross Validation on each algorithm</strong></center></h2>
        
</div>

In [None]:
# Plain NumPy copies of the feature matrix and the target, so the fold index
# arrays produced by KFold can be used for positional indexing.
X = train_data.to_numpy(copy=True)
# np.array is kept for y because it accepts both a Series and an ndarray.
y = np.array(y)

# Random Forest Machine Algorithm

In [None]:
rf = RandomForestClassifier(min_samples_leaf=1, min_samples_split=2)
kf = KFold(n_splits=5)
outcomes1 = []
ClassR=0
ConM=0
fold = 0
i=0
conf_matrix_list_of_arrays = []
for train_index, test_index in kf.split(X,y):
    i=i+1
    print("KFold Split:",i)
    print('\n')
    fold += 1
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    print('Running time of algorithm')
    %time rf.fit(Xtrain, ytrain)
    predictions = rf.predict(Xtest)
    accuracy = accuracy_score(ytest, predictions)
    outcomes1.append(accuracy)
    print("Accuracy of KFold ",i, "is: ",accuracy)
    print('\n')
    print("Classification Report of KFold ",i," is following:")
    print('\n')
    CR=classification_report(ytest, predictions)
    print(CR)
    print('\n')
    print("Confusion Matrix of KFold ",i," is following:")
    print('\n')
    CM=confusion_matrix(ytest, predictions)
    conf_matrix_list_of_arrays.append(CM)
    print(CM)
    print('\n')
    print('\n')

print('\n')
print('Average Confusion Matrix')
aa = np.mean(conf_matrix_list_of_arrays, axis=0)

aaa = np.ceil(aa)

b=pd.DataFrame(aaa)
b=b.astype(int)
labels =['Not Survived','Survived']

c=np.array(b)

fig, ax = plot_confusion_matrix(conf_mat=c,figsize=(10, 10),
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True)
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
plt.show()
print('\n')
print('\n')
mean_outcome1 = np.mean(outcomes1)
print("Total Average Accuracy of Random Forest Classifier is : {0}".format(mean_outcome1)) 

# KNN Machine Algorithm

In [None]:
rf = KNeighborsClassifier(n_neighbors=2)
kf = KFold(n_splits=5)
outcomes2 = []
ClassR=0
ConM=0
fold = 0
i=0
conf_matrix_list_of_arrays = []
for train_index, test_index in kf.split(X,y):
    i=i+1
    print("KFold Split:",i)
    print('\n')
    fold += 1
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    print('Running time of algorithm')
    %time rf.fit(Xtrain, ytrain)
    predictions = rf.predict(Xtest)
    accuracy = accuracy_score(ytest, predictions)
    outcomes2.append(accuracy)
    print("Accuracy of KFold ",i, "is: ",accuracy)
    print('\n')
    print("Classification Report of KFold ",i," is following:")
    print('\n')
    CR=classification_report(ytest, predictions)
    print(CR)
    print('\n')
    print("Confusion Matrix of KFold ",i," is following:")
    print('\n')
    CM=confusion_matrix(ytest, predictions)
    conf_matrix_list_of_arrays.append(CM)
    print(CM)
    print('\n')
    print('\n')

print('\n')
print('Average Confusion Matrix')
aa = np.mean(conf_matrix_list_of_arrays, axis=0)

aaa = np.ceil(aa)

b=pd.DataFrame(aaa)
b=b.astype(int)
labels =['Not Survived','Survived']

c=np.array(b)

fig, ax = plot_confusion_matrix(conf_mat=c,figsize=(10, 10),
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True)
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
plt.show()
print('\n')
print('\n')
mean_outcome2 = np.mean(outcomes2)
print("Total Average Accuracy of KNN Classifier is : {0}".format(mean_outcome2)) 

# Decision Trees Machine Algorithm

In [None]:
rf = DecisionTreeClassifier(random_state=10)
kf = KFold(n_splits=5)
outcomes3 = []
ClassR=0
ConM=0
fold = 0
i=0
conf_matrix_list_of_arrays = []
for train_index, test_index in kf.split(X,y):
    i=i+1
    print("KFold Split:",i)
    print('\n')
    fold += 1
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    print('Running time of algorithm')
    %time rf.fit(Xtrain, ytrain)
    predictions = rf.predict(Xtest)
    accuracy = accuracy_score(ytest, predictions)
    outcomes3.append(accuracy)
    print("Accuracy of KFold ",i, "is: ",accuracy)
    print('\n')
    print("Classification Report of KFold ",i," is following:")
    print('\n')
    CR=classification_report(ytest, predictions)
    print(CR)
    print('\n')
    print("Confusion Matrix of KFold ",i," is following:")
    print('\n')
    CM=confusion_matrix(ytest, predictions)
    conf_matrix_list_of_arrays.append(CM)
    print(CM)
    print('\n')
    print('\n')

print('\n')
print('Average Confusion Matrix')
aa = np.mean(conf_matrix_list_of_arrays, axis=0)

aaa = np.ceil(aa)

b=pd.DataFrame(aaa)
b=b.astype(int)
labels =['Not Survived','Survived']

c=np.array(b)

fig, ax = plot_confusion_matrix(conf_mat=c,figsize=(10, 10),
                                show_absolute=True,
                                show_normed=True,
                                colorbar=True)
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
plt.show()
print('\n')
print('\n')
mean_outcome3 = np.mean(outcomes3)
print("Total Average Accuracy of Decision Trees Classifier is : {0}".format(mean_outcome3)) 

# Comparison of all algorithms Results

In [None]:
# Gather the per-fold accuracies of the three models in one frame
# (one column per model, one row per fold) and plot them as lines.
a = pd.DataFrame({
    'outcomes1': outcomes1,
    'outcomes2': outcomes2,
    'outcomes3': outcomes3,
})

plt.figure(figsize=(25, 10))
plt.subplot(1,1,1)
series_styles = [
    ('outcomes1', 'blue', 'Random Forest'),
    ('outcomes2', 'green', 'KNN'),
    ('outcomes3', 'red', 'Decision Trees'),
]
for column_name, line_color, legend_label in series_styles:
    plt.plot(a[column_name].to_numpy(), color=line_color, label=legend_label)
plt.title('Algorithms Comparison')
plt.xlabel('Number of time')
plt.ylabel('Accuracy')
plt.legend(bbox_to_anchor=(1, 1))
plt.show()

In [None]:
# Give the accuracy columns readable model names, then draw grouped bars
# (one group per fold, one bar per model).
readable_names = {
    'outcomes1': 'Random Forest',
    'outcomes2': 'KNN',
    'outcomes3': 'Decision Tree',
}
a = a.rename(columns=readable_names)
a.plot.bar(figsize=(25, 10))

# Comparison of all algorithms Results

In [None]:
# Display the table of per-fold accuracies (one column per model).
a

<div class="alert alert-block alert-danger">  
<h2><center><strong> Best Model is Random Forest as we can see that it performed well on cross validation</strong></center></h2>
        
</div>

<div class="alert alert-block alert-info">  
<h1><center><strong> You can use my code and apply more robust techniques to get better results. </strong></center></h1>
<h1><center><strong> I hope you like my efforts for Kaggle Community. Thanks 😍</strong></center></h1>
        
</div>