In [None]:
import warnings
warnings.filterwarnings("ignore")
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
import seaborn as sns
from matplotlib import rcParams

from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report
from sklearn.metrics import confusion_matrix,roc_auc_score,roc_curve
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder,normalize
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

Read Data

In [None]:
# Load the train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/titanic/train.csv', '../input/titanic/test.csv')
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Frame shapes, followed by the value distribution of the categorical columns.
for label, frame in (('Train data size', train), ('Test data size', test)):
    print(label, frame.shape)

print(train['Survived'].value_counts(), '\n')
print(Counter(train['Sex']), '\n')
for column in ('SibSp', 'Parch', 'Embarked'):
    print(train[column].value_counts(), '\n')

In [None]:
# Bar chart of how many passengers fall into each Survived class.
survival_counts = train['Survived'].value_counts()

rcParams['figure.figsize'] = (10, 5)
sb.barplot(x=survival_counts.index, y=survival_counts.values)
plt.title('Survival counts')
plt.xlabel('Survived')
plt.ylabel('No of passengers')
plt.show()

In [None]:
# Cross-tabulate survival against sex and draw it as grouped bars.
gender = pd.crosstab(index=train['Survived'], columns=train['Sex'])
gender.plot.bar(title='No of passengers survived')
plt.show()

In [None]:
# Histogram of passenger ages (15 bins).
rcParams['figure.figsize'] = (10, 5)
ax = train['Age'].hist(bins=15, alpha=0.9, color='green')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
ax.set_title('Visualization of Ages')
plt.show()

In [None]:
# Correlation heatmap of the numeric columns.
# `train` still holds the not-yet-encoded text columns at this point (they are
# label-encoded further down), and DataFrame.corr() raises on non-numeric data
# in pandas >= 2.0, so the frame is restricted to numeric dtypes first.
rcParams['figure.figsize'] = 10,10
numeric_corr = train.select_dtypes(include='number').corr()
sb.heatmap(numeric_corr,annot = True,square = True,linewidths = 2,linecolor = 'black')
plt.show()

Data Preparation

In [None]:
# Target column, taken before any feature processing.
y = train["Survived"]

# Label-encode the text columns into integer codes.
#
# The encoder is fitted once per column on the union of the train and test
# values, so a given string receives the same code in both frames. Fitting it
# separately on each frame (as was done before) assigns unrelated codes to the
# same value, which makes the encoded test features meaningless to a model
# trained on the train codes.
enc = LabelEncoder()

for column in ['Sex', 'Name', 'Cabin', 'Embarked', 'Ticket']:
    # astype('str') turns missing entries into the literal string 'nan',
    # which then gets a code of its own instead of breaking the encoder.
    train_text = train[column].astype('str')
    test_text = test[column].astype('str')
    enc.fit(pd.concat([train_text, test_text]))
    train[column] = enc.transform(train_text)
    test[column] = enc.transform(test_text)

# NOTE: these are aliases, not copies. In-place changes made through X_test
# are also visible through `test`, which a later cell passes to model.predict.
X = train
X_test = test

In [None]:
# Move PassengerId out of the columns and into the index of both frames (in place).
for frame in (X, X_test):
    frame.set_index(['PassengerId'], inplace=True)

# Remove the target from the training features; this rebinds X to a new frame.
X = X.drop(columns=['Survived'])

In [None]:
# Inspect the last rows of the training features.
X.tail()

In [None]:
# Inspect the first rows of the test features.
X_test.head()

In [None]:
# Min-max scale the feature columns listed below.
#
# The minimum and range of each column are computed on the training frame X
# and applied to both X and X_test, so a given raw value maps to the same
# scaled value in both frames. Scaling X_test with its own min/max would put
# the two frames on different scales.

no = 1  # upper bound of the scaled range: training values end up in [0, no]

columns_to_scale = [
    "Age", "Name", "Ticket", "Fare", "Cabin",
    "SibSp", "Embarked", "Pclass", "Parch",
]

for column in columns_to_scale:
    train_min = X[column].min()
    train_span = X[column].max() - train_min
    X[column] = ((X[column] - train_min) / train_span) * no
    # Column assignment changes X_test in place. Test values outside the
    # training range land outside [0, no], which is expected.
    X_test[column] = ((X_test[column] - train_min) / train_span) * no

# Show a small sample rather than dumping (almost) the whole frame.
X.tail()

In [None]:
# Count missing values per column in both feature frames.
train_nulls = X.isnull().sum()
test_nulls = X_test.isnull().sum()

print(train_nulls, '\n')
print(test_nulls)

There is missing data in the Age column.

In [None]:
# Impute missing values with the per-column medians of the training features.
# The same training medians are used for X_test so both frames are filled
# consistently (previously X used its median while X_test used its own mean).
#
# inplace=True is kept on purpose: X_test is the same object as `test`, and a
# later cell passes `test` to model.predict, so the fill must reach that object.
train_medians = X.median()
X.fillna(train_medians, inplace=True)
X_test.fillna(train_medians, inplace=True)

# Confirm that no missing values remain.
print(X.isnull().sum(),'\n')
print(X_test.isnull().sum())

Modeling

In [None]:
# Hold out 20% of the labelled rows for evaluation.
# NOTE: this rebinds X_test to the hold-out split; the prepared submission
# features are still reachable through `test`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [None]:
# Compare candidate classifiers by their mean cross-validated score on the
# training split. cross_val_score runs with its defaults here (neither cv nor
# scoring is overridden).
CV_SEED = 10  # fixed seed so the stochastic estimators give repeatable scores

candidate_models = {
    "Logistic Regression": LogisticRegression(),
    "SVC": SVC(),
    "Random Forest": RandomForestClassifier(random_state=CV_SEED),
    "GaussianNB": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=CV_SEED),
    "KNeighbors": KNeighborsClassifier(),
    "MLP": MLPClassifier(random_state=CV_SEED),
    "XGB-TREE": XGBClassifier(booster='gbtree'),
    "XGB-DART": XGBClassifier(booster='dart'),
}

for model_name, estimator in candidate_models.items():
    print(f"{model_name}:", cross_val_score(estimator, X_train, y_train).mean())

1. **I will use Random Forest**

In [None]:
# Fit the chosen model on the training split only (X_train / y_train); the
# hold-out rows from train_test_split are not seen during fitting.
# random_state is fixed so repeated runs produce the same forest.
model = RandomForestClassifier(random_state=10)

model.fit(X_train, y_train)

In [None]:
# Predict on X_test, which at this point is the hold-out split produced by
# train_test_split (not the submission features).
y_pred = model.predict(X_test)

In [None]:
# Hold-out evaluation: accuracy, confusion table and per-class report.
# accuracy_score / classification_report are imported in the top import cell.
accuracy = accuracy_score(y_test, y_pred)
# Format instead of round(4)*100, which can print float artefacts.
print(f"Accuracy: {accuracy:.2%}", '\n')

print(pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted']), '\n')

print(classification_report(y_test, y_pred), '\n')

In [None]:
# ROC AUC on the hold-out split. The score passed in is the predicted
# probability of the second class in model.classes_ (column 1 of
# predict_proba) rather than hard labels, which would collapse the ROC curve
# to a single operating point.
auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print('Random Forest ROC AUC : %.2f'%auc)

In [None]:
# Predict on `test`: the same object that was prepared in place through the
# X_test alias before X_test was rebound by train_test_split.
test_predict = model.predict(test)

In [None]:
# Wrap the predictions in a Series so they can be concatenated with the ids below.
test_predict = pd.Series(test_predict)

In [None]:
# Turn the PassengerId index back into a regular column (in place).
# NOTE(review): in-place and order-dependent. This cell is meant to run once,
# after the model.predict(test) cell above.
test.reset_index(inplace = True)
test.head()

In [None]:
# Start the submission from the PassengerId column of the test frame.
predict = test['PassengerId']

In [None]:
# Place the ids and the predictions side by side as two columns.
predict = pd.concat(
    objs=[predict, test_predict],
    axis="columns",
)

In [None]:
# The prediction column carries the label 0 after the concat; name it "Survived".
predict = predict.rename(columns={0: "Survived"})

In [None]:
# Write the submission file to the working directory, without the row index.
predict.to_csv("submission.csv",index=False)

In [None]:
# Distribution of the predicted classes in the submission.
# The column is passed explicitly as `x`: recent seaborn versions treat the
# first positional argument as `data`, not as the variable to count.
sb.countplot(x=predict["Survived"])
plt.title('Predicted survival counts')
plt.show()

In [None]:
# Confusion matrix of the hold-out labels (y_test) against the predictions (y_pred).
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Annotated heatmap of the confusion matrix.
# `center` expects a numeric value to centre the colormap on, so center=True
# was read as 1; the counts are shown in the cells with annot/fmt instead.
sns.heatmap(cm, annot=True, fmt='d')
# confusion_matrix puts the true labels on rows and the predictions on columns.
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()