# Import Data

In [None]:
# Importing the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Importing the dataset
# Reads the train and test CSV files from ../input/titanic (presumably the Kaggle
# competition input layout - confirm when running elsewhere).
# NOTE(review): test_df is loaded here but not referenced again in the visible cells.

train_df = pd.read_csv('../input/titanic/train.csv')
test_df = pd.read_csv('../input/titanic/test.csv')


# EDA

In [None]:
# Preview the first five rows of the training data
train_df.head()

In [None]:
# Count the missing values in every column of the training data
train_df.isnull().sum()

In [None]:
# Print the distinct values of Embarked, Cabin and Ticket, one array per column
for column_name in ('Embarked', 'Cabin', 'Ticket'):
    print(train_df[column_name].unique())

**Data Processing**

1. One-Hot encoding
2. Mean encoding

In [None]:
# Drop the columns that this notebook does not use as features
unused_columns = ['Name', 'Ticket', 'PassengerId']
train_df = train_df.drop(columns=unused_columns)
train_df.head()

*one-hot encoding*

In [None]:
# One-hot encode Sex: get_dummies yields one indicator column per distinct value
# (later cells read the resulting 'female' and 'male' columns).
Sex_encoding = pd.get_dummies(train_df['Sex'])

In [None]:
# Replace the raw Sex column with its indicator columns, appended on the right
train_df = pd.concat([train_df.drop(columns=['Sex']), Sex_encoding], axis=1)
train_df

*Mean encoding*

In [None]:
# Mean (target) encoding of Embarked: each port is replaced by the mean of
# Survived among the rows that share that port.
# NOTE(review): the means are computed on the same rows they are applied to,
# which leaks the target into the feature - consider out-of-fold encoding.

target = 'Survived'
Embarked_mean = train_df.groupby('Embarked')[target].mean()

train_df = (
    train_df
    .assign(Embarked_mean=train_df['Embarked'].map(Embarked_mean))
    .drop(columns=['Embarked'])
)
train_df.head()

In [None]:
# Mean (target) encoding of Cabin, using the `target` name set in the Embarked cell.
# Rows whose Cabin is missing have no group and therefore map to NaN.

Cabin_mean = train_df.groupby('Cabin')[target].mean()

train_df = (
    train_df
    .assign(Cabin_mean=train_df['Cabin'].map(Cabin_mean))
    .drop(columns=['Cabin'])
)

train_df.head()

**Mean imputation of null values**

In [None]:
# Cabin_mean is dropped outright (per the original author's note, more than half
# of its values are null); the gaps in Age and Embarked_mean are filled with the
# respective column mean.
train_df = train_df.drop(columns=['Cabin_mean'])
train_df['Age_not_null'] = train_df['Age'].fillna(train_df['Age'].mean())
train_df['Embarked_not_null'] = train_df['Embarked_mean'].fillna(train_df['Embarked_mean'].mean())
train_df = train_df.drop(columns=['Age', 'Embarked_mean'])

# The null count is the cell's final expression so that it is actually displayed
# (mid-cell its result was silently discarded) and confirms the imputation.
train_df.isnull().sum()

In [None]:
# Inspect the training frame after encoding and imputation
train_df

In [None]:
# Rank the features by importance with an extra-trees ensemble and plot the ranking.
plt.rcParams['figure.figsize'] = 15, 6
sns.set_style("darkgrid")

# Feature matrix (everything except the target) and the target column
x = train_df.drop(['Survived'], axis=1)
y = train_df['Survived']

from sklearn.ensemble import ExtraTreesClassifier

# The ensemble is randomized; a fixed seed keeps the ranking identical across runs.
model = ExtraTreesClassifier(random_state=0)
model.fit(x, y)
print(model.feature_importances_)

# Label each importance with its column name and draw the 12 largest as horizontal bars
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
feat_importances.nlargest(12).plot(kind='barh')
plt.show()

# Visualization

*Finding outliers*

In [None]:
# Boxplot of Fare

sns.boxplot(x = train_df.Fare, color = 'teal')
plt.show()


# Author's observation: the plot shows a lot of outliers.
# To list the high-fare rows: train_df[train_df['Fare']>=100]

In [None]:
# Boxplot of the imputed, mean-encoded Embarked feature
sns.boxplot(x=train_df['Embarked_not_null'], color='teal')
plt.show()

In [None]:
# Boxplot of Age after mean imputation
sns.boxplot(x=train_df['Age_not_null'], color='teal')
plt.show()

In [None]:
# Distribution of Age (after mean imputation)

import plotly.graph_objects as go

age_histogram = go.Histogram(
    x=train_df['Age_not_null'],
    xbins={'start': 0, 'end': 95, 'size': 2},  # bins of width 2 covering 0 to 95
    marker_color='#e8ab60',
    opacity=1,
)
fig = go.Figure(data=[age_histogram])

# Titles, a small gap between bars, no grid lines, dark theme
fig.update_layout(
    title_text='AGE DISTRIBUTION',
    xaxis_title_text='AGE',
    yaxis_title_text='COUNT',
    bargap=0.05,
    xaxis={'showgrid': False},
    yaxis={'showgrid': False},
    template='plotly_dark',
)

fig.show()

In [None]:
# Split the age histogram by Survived, with a violin marginal above it.

import plotly.express as px
fig = px.histogram(
    train_df,
    x="Age_not_null",
    color="Survived",
    marginal="violin",
    hover_data=train_df.columns,
    title="Distribution of AGE Vs Survived",
    # labels is keyed by column name; the plotted column is Age_not_null, so the
    # former "age" key never matched and the axis kept the raw column name.
    labels={"Age_not_null": "AGE"},
    template="plotly_dark",
    # Keys must equal the values found in the color column. Survived is assumed to
    # hold the integers 0/1 as read from the CSV, so string keys "0"/"1" would not
    # match and the chosen colors would be ignored.
    color_discrete_map={0: "RebeccaPurple", 1: "MediumPurple"},
)
fig.show()

In [None]:
# Distribution of Fare. Relies on `go` (plotly.graph_objects) imported by the
# age-distribution cell.
fare_histogram = go.Histogram(
    x=train_df['Fare'],
    xbins={'start': 0, 'end': 582, 'size': 15},  # bins of width 15 covering 0 to 582
    marker_color='#FE6F5E',
    opacity=1,
)
fig = go.Figure(data=[fare_histogram])

# Titles, a small gap between bars, no grid lines, dark theme
fig.update_layout(
    title_text='FARE DISTRIBUTION',
    xaxis_title_text='FARE',
    yaxis_title_text='COUNT',
    bargap=0.05,
    xaxis={'showgrid': False},
    yaxis={'showgrid': False},
    template='plotly_dark',
)

fig.show()

In [None]:
# Split the fare histogram by Survived, with a violin marginal above it.

import plotly.express as px
fig = px.histogram(
    train_df,
    x="Fare",
    color="Survived",
    marginal="violin",
    hover_data=train_df.columns,
    title="Distribution of FARE Vs Survived",
    # labels is keyed by column name and is case-sensitive: the column is "Fare",
    # so the former "fare" key never matched.
    labels={"Fare": "FARE"},
    template="plotly_dark",
    # Keys must equal the values found in the color column. Survived is assumed to
    # hold the integers 0/1 as read from the CSV, so string keys "0"/"1" would not
    # match and the chosen colors would be ignored.
    color_discrete_map={0: "RebeccaPurple", 1: "MediumPurple"},
)
fig.show()

# Machine Learning Modeling

In [None]:
# Cast the two one-hot indicator columns to int64, then list every column's dtype
indicator_dtypes = {'female': 'int64', 'male': 'int64'}
train_df = train_df.astype(indicator_dtypes)
train_df.dtypes

In [None]:
from sklearn.model_selection import train_test_split

# Data segmentation: feature matrix and target.
# The split consumes X as defined here. It previously read a lowercase `x` left
# over from the feature-importance cell, so it only worked through hidden kernel
# state and broke once any other cell rebound `x`.
X = train_df.drop('Survived', axis=1)
y = train_df['Survived']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Feature Scaling
# The scaler is fitted on the training split only and then applied unchanged to
# the test split. Both names are overwritten with the scaled arrays, so this cell
# should be run exactly once after the split.

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

*LOGISTIC REGRESSION*

In [None]:
# Fit a logistic regression classifier (library defaults) on the scaled training split

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(x_train, y_train)

In [None]:
# Predict the held-out test split with the classifier fitted in the previous cell
y_pred = classifier.predict(x_test)

In [None]:
# Confusion matrix and accuracy for the current predictions.
# mylist is (re)created here; every later model appends its accuracy to it.

from sklearn.metrics import confusion_matrix, accuracy_score
mylist = []
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
print(cm, ac, sep='\n')
mylist.append(ac)

In [None]:
# Sweep n_neighbors over 3..9 and plot the test accuracy obtained for each value

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
neighbor_counts = list(range(3, 10))
list1 = []
for neighbors in neighbor_counts:
    classifier = KNeighborsClassifier(n_neighbors=neighbors, metric='minkowski').fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    list1.append(accuracy_score(y_test, y_pred))
plt.plot(neighbor_counts, list1)
plt.show()

*K NEAREST NEIGHBOR*

In [None]:
# Train the K Nearest Neighbor classifier on the training set
# (n_neighbors=6 was presumably picked from the sweep plot above - confirm)

classifier = KNeighborsClassifier(n_neighbors=6)
classifier.fit(x_train, y_train)

In [None]:
# Predict the test set with the fitted KNN classifier and print the predicted labels

y_pred = classifier.predict(x_test)
print(y_pred)

In [None]:
# Confusion matrix and accuracy of the KNN predictions; the accuracy is also
# recorded in mylist for the final comparison chart.
from sklearn.metrics import confusion_matrix, accuracy_score
ac = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(cm, ac, sep='\n')
mylist.append(ac)

*SUPPORT VECTOR CLASSIFIER*

In [None]:
# Sweep the SVC parameter C over 0.5..1.0 and plot the test accuracy for each value
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
c_values = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
list1 = []
for c in c_values:
    classifier = SVC(C=c, random_state=0, kernel='rbf').fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    list1.append(accuracy_score(y_test, y_pred))
plt.plot(c_values, list1)
plt.show()

In [None]:
# Fit the RBF-kernel SVC with C=0.6 (presumably picked from the sweep plot above - confirm)
classifier = SVC(C = 0.6, random_state=0, kernel = 'rbf')
classifier.fit(x_train, y_train)

In [None]:
# Predict the test set with the fitted SVC and print the predicted labels
y_pred = classifier.predict(x_test)
print(y_pred)

In [None]:
# Confusion matrix and accuracy of the SVC predictions; the accuracy is also
# recorded in mylist for the final comparison chart.
from sklearn.metrics import confusion_matrix, accuracy_score
ac = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
mylist.append(ac)
print(cm, ac, sep='\n')

*DECISION TREE CLASSIFIER*

In [None]:
# Sweep max_leaf_nodes over 2..9 and plot the test accuracy for each value
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
leaf_limits = list(range(2, 10))
list1 = []
for leaves in leaf_limits:
    classifier = DecisionTreeClassifier(
        max_leaf_nodes=leaves, random_state=0, criterion='entropy'
    ).fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    list1.append(accuracy_score(y_test, y_pred))
plt.plot(leaf_limits, list1)
plt.show()

In [None]:
# Fit the entropy-criterion decision tree with max_leaf_nodes=3
# (presumably picked from the sweep plot above - confirm)
classifier = DecisionTreeClassifier(max_leaf_nodes = 3, random_state=0, criterion='entropy')
classifier.fit(x_train, y_train)

In [None]:
# Predict the test set with the fitted decision tree and print the predicted labels
y_pred = classifier.predict(x_test)
print(y_pred)

In [None]:
# Confusion matrix and accuracy of the decision tree predictions; the accuracy is
# also recorded in mylist for the final comparison chart.
from sklearn.metrics import confusion_matrix, accuracy_score
ac = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
mylist.append(ac)
print(cm, ac, sep='\n')

*RANDOM FOREST CLASSIFICATION*

In [None]:
# Sweep n_estimators over 10..29 and plot the test accuracy for each value
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
estimator_counts = list(range(10, 30))
list1 = []
for estimators in estimator_counts:
    classifier = RandomForestClassifier(
        n_estimators=estimators, random_state=0, criterion='entropy'
    ).fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    list1.append(accuracy_score(y_test, y_pred))
plt.plot(estimator_counts, list1)
plt.show()

In [None]:
# Fit the entropy-criterion random forest with 11 trees
# (presumably picked from the sweep plot above - confirm)
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 11, criterion='entropy', random_state=0)
classifier.fit(x_train,y_train)

In [None]:
# Predict the test set with the fitted random forest and print the predicted labels
y_pred = classifier.predict(x_test)
print(y_pred)

In [None]:
# Confusion matrix and accuracy of the random forest predictions; the accuracy is
# also recorded in mylist for the final comparison chart.
from sklearn.metrics import confusion_matrix, accuracy_score
ac = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(cm, ac, sep='\n')
mylist.append(ac)

In [None]:
# Sweep XGBoost n_estimators over 10..29 and plot the test accuracy for each value
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
estimator_counts = list(range(10, 30, 1))
list1 = []
for estimators in estimator_counts:
    classifier = XGBClassifier(n_estimators=estimators, max_depth=12, subsample=0.7)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    list1.append(accuracy_score(y_test, y_pred))
plt.plot(estimator_counts, list1)
plt.show()

In [None]:
# Fit XGBoost with 10 boosting rounds (presumably picked from the sweep plot above - confirm)
from xgboost import XGBClassifier
classifier = XGBClassifier(n_estimators = 10, max_depth=12, subsample=0.7)
classifier.fit(x_train,y_train)

In [None]:
# Predict the test set with the fitted XGBoost model and print the predicted labels
y_pred = classifier.predict(x_test)
print(y_pred)

In [None]:
# Confusion matrix and accuracy of the XGBoost predictions; the accuracy is also
# recorded in mylist for the final comparison chart.
from sklearn.metrics import confusion_matrix, accuracy_score
ac = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(cm, ac, sep='\n')
mylist.append(ac)

In [None]:
# Fit a CatBoost classifier on the scaled training split.
# verbose=0 silences the per-iteration training log that otherwise floods the cell
# output; random_seed pins the run explicitly so results are repeatable.
from catboost import CatBoostClassifier
classifier = CatBoostClassifier(verbose=0, random_seed=0)
classifier.fit(x_train, y_train)

In [None]:
# Predict the test set with the fitted CatBoost model and print the predicted labels
y_pred = classifier.predict(x_test)
print(y_pred)

In [None]:
# Confusion matrix and accuracy of the CatBoost predictions; the accuracy is also
# recorded in mylist for the final comparison chart.
from sklearn.metrics import confusion_matrix, accuracy_score
ac = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(cm, ac, sep='\n')
mylist.append(ac)

# Deep Learning Modeling

### ANN

In [None]:
import tensorflow as tf

# Seed NumPy and TensorFlow. Seeding NumPy alone does not make training
# repeatable: Keras weight initialisation and batch shuffling draw from
# TensorFlow's own random generator.
np.random.seed(0)
tf.random.set_seed(0)

# Initialising the ANN as an empty sequential stack; layers are added in the next cell

ann = tf.keras.models.Sequential()

In [None]:
# Four fully connected layers of 7 ReLU units each, followed by a single sigmoid
# output unit for the binary target.

for layer_index in range(4):
    ann.add(tf.keras.layers.Dense(units=7, activation='relu'))
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Adam optimizer with binary cross-entropy loss; accuracy is reported while training
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the network on the scaled training split: mini-batches of 32 rows, 100 epochs
ann.fit(x_train, y_train, batch_size = 32, epochs = 100)

In [None]:
# Threshold the sigmoid outputs at 0.5. The single-unit output layer yields one
# column per row, so the result is flattened to a 1-D array of 0/1 integers to
# match the form of the other models' predictions and of y_test.
y_pred = (ann.predict(x_test) > 0.5).astype(int).ravel()

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Evaluate the ANN predictions; the accuracy is also recorded in mylist for the
# final comparison chart.
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)

# confusion matrix, followed by a blank separator line
print("Confusion Matrix", cm, "", sep="\n")

# accuracy
print("Accuracy", ac, sep="\n")
mylist.append(ac)

# Model evaluation

In [None]:
# Display names for the accuracy chart. The order must match the order in which
# the accuracies are appended to mylist: logistic regression, KNN, SVC, decision
# tree, random forest, XGBoost, CatBoost and finally the ANN. ANN used to sit
# before XGBOOST here, which mislabeled the last three bars.
mylist2 = ["Logistic Regression", "KNearestNeighbours","SupportVector","DecisionTree","RandomForest", "XGBOOST","CATBOOST","ANN"]

In [None]:
# Bar chart comparing the test accuracy of every model, one bar per entry of
# mylist2 (labels) / mylist (accuracies).

# Re-running a metrics cell appends a duplicate accuracy; fail early with a clear
# message instead of plotting misaligned bars.
assert len(mylist) == len(mylist2), (
    f"{len(mylist)} accuracies recorded for {len(mylist2)} model labels; "
    "restart the kernel and run all cells in order"
)

plt.rcParams['figure.figsize'] = 15, 6
sns.set_style("darkgrid")
ax = sns.barplot(x=mylist2, y=mylist, palette="rocket", saturation=1.5)
plt.xlabel("Classifier Models", fontsize=20)
plt.ylabel("% of Accuracy", fontsize=20)
plt.title("Accuracy of different Classifier Models", fontsize=20)
plt.xticks(fontsize=12, horizontalalignment='center', rotation=8)
plt.yticks(fontsize=13)

# Write each bar's accuracy as a percentage just above the bar. Dedicated names
# are used for the bar geometry so the notebook-level x / y (feature matrix and
# target) are not overwritten by this loop.
for p in ax.patches:
    bar_width, bar_height = p.get_width(), p.get_height()
    bar_x, bar_y = p.get_xy()
    ax.annotate(f'{bar_height:.2%}', (bar_x + bar_width/2, bar_y + bar_height*1.02), ha='center', fontsize='x-large')
plt.show()

**I gave you the basics of machine learning and deep learning. You should try things like generating derived variables, engineering features, and tuning hyperparameters to improve performance. Good luck!** 👍💥👍 