# Task-1 TITANIC SURVIVAL PREDICTION

## Importing the libraries

In [57]:
!pip install catboost



In [58]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [59]:
dataset = pd.read_csv('Titanic-Dataset.csv')

In [60]:
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [61]:
dataset = dataset.drop(columns=['PassengerId','Name','Ticket','Cabin'])

In [62]:
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [63]:
X = dataset.iloc[:, 1:].values
y = dataset.iloc[:, 0].values

In [64]:
print(X)

[[3 'male' 22.0 ... 0 7.25 'S']
 [1 'female' 38.0 ... 0 71.2833 'C']
 [3 'female' 26.0 ... 0 7.925 'S']
 ...
 [3 'female' nan ... 2 23.45 'S']
 [1 'male' 26.0 ... 0 30.0 'C']
 [3 'male' 32.0 ... 0 7.75 'Q']]


# Taking care of missing data

In [65]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
age_data =X[:, 2].reshape(-1, 1)
imputer.fit(age_data)
X[:, 2] = imputer.transform(age_data).flatten()

In [66]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
embarked_data =X[:, -1].reshape(-1, 1)
imputer.fit(embarked_data)
X[:, -1] = imputer.transform(embarked_data).flatten()

In [67]:
print(X[61,:])

[1 'female' 38.0 0 0 80.0 'S']


## Encoding Categorical Data

In [68]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X[:, 1] = le.fit_transform(X[:, 1])

In [69]:
print(X[:5,:])

[[3 1 22.0 1 0 7.25 'S']
 [1 0 38.0 1 0 71.2833 'C']
 [3 0 26.0 0 0 7.925 'S']
 [1 0 35.0 1 0 53.1 'S']
 [3 1 35.0 0 0 8.05 'S']]


In [70]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [-1])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [71]:
print(X[:10,:])

[[0.0 0.0 1.0 3 1 22.0 1 0 7.25]
 [1.0 0.0 0.0 1 0 38.0 1 0 71.2833]
 [0.0 0.0 1.0 3 0 26.0 0 0 7.925]
 [0.0 0.0 1.0 1 0 35.0 1 0 53.1]
 [0.0 0.0 1.0 3 1 35.0 0 0 8.05]
 [0.0 1.0 0.0 3 1 28.0 0 0 8.4583]
 [0.0 0.0 1.0 1 1 54.0 0 0 51.8625]
 [0.0 0.0 1.0 3 1 2.0 3 1 21.075]
 [0.0 0.0 1.0 3 0 27.0 0 2 11.1333]
 [1.0 0.0 0.0 2 0 14.0 1 0 30.0708]]


## Splitting the dataset into the Training set and Test set

In [72]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Training CatBoost on the Training set

In [77]:
from catboost import CatBoostClassifier
classifier = CatBoostClassifier()
classifier.fit(X_train, y_train)

Learning rate set to 0.008911
0:	learn: 0.6871003	total: 49.7ms	remaining: 49.7s
1:	learn: 0.6837883	total: 52.1ms	remaining: 26s
2:	learn: 0.6788225	total: 56.8ms	remaining: 18.9s
3:	learn: 0.6737650	total: 63.2ms	remaining: 15.7s
4:	learn: 0.6688102	total: 67.6ms	remaining: 13.5s
5:	learn: 0.6641584	total: 69.2ms	remaining: 11.5s
6:	learn: 0.6599288	total: 70.3ms	remaining: 9.97s
7:	learn: 0.6547570	total: 74.4ms	remaining: 9.22s
8:	learn: 0.6501302	total: 79.3ms	remaining: 8.73s
9:	learn: 0.6453269	total: 82.8ms	remaining: 8.2s
10:	learn: 0.6403461	total: 86.3ms	remaining: 7.76s
11:	learn: 0.6353747	total: 87.7ms	remaining: 7.22s
12:	learn: 0.6309128	total: 89.1ms	remaining: 6.76s
13:	learn: 0.6271044	total: 95.3ms	remaining: 6.71s
14:	learn: 0.6232022	total: 96.4ms	remaining: 6.33s
15:	learn: 0.6186356	total: 97.9ms	remaining: 6.02s
16:	learn: 0.6144964	total: 99.2ms	remaining: 5.74s
17:	learn: 0.6102960	total: 101ms	remaining: 5.5s
18:	learn: 0.6065982	total: 102ms	remaining: 5.28

<catboost.core.CatBoostClassifier at 0x7f6eb32ef340>

## Making the Confusion Matrix

In [78]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[104   6]
 [ 23  46]]


0.8379888268156425

## Applying k-Fold Cross Validation

In [79]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
6:	learn: 0.6605790	total: 9.25ms	remaining: 1.31s
7:	learn: 0.6561064	total: 10.5ms	remaining: 1.3s
8:	learn: 0.6509304	total: 11.8ms	remaining: 1.3s
9:	learn: 0.6461857	total: 13.3ms	remaining: 1.31s
10:	learn: 0.6411295	total: 14.5ms	remaining: 1.3s
11:	learn: 0.6363719	total: 16.4ms	remaining: 1.35s
12:	learn: 0.6322359	total: 18.2ms	remaining: 1.38s
13:	learn: 0.6274169	total: 19.5ms	remaining: 1.37s
14:	learn: 0.6235144	total: 20.6ms	remaining: 1.35s
15:	learn: 0.6201032	total: 21.3ms	remaining: 1.31s
16:	learn: 0.6160007	total: 22.6ms	remaining: 1.3s
17:	learn: 0.6115781	total: 23.8ms	remaining: 1.3s
18:	learn: 0.6078487	total: 25ms	remaining: 1.29s
19:	learn: 0.6035678	total: 26.3ms	remaining: 1.29s
20:	learn: 0.5999498	total: 27.5ms	remaining: 1.28s
21:	learn: 0.5964621	total: 28.9ms	remaining: 1.28s
22:	learn: 0.5931218	total: 30.2ms	remaining: 1.28s
23:	learn: 0.5898008	total: 31.2ms	remaining: 1.27s
24:	learn: