<h1>Import Libraries</h1>

In [3]:
import numpy as np

#data processing
import pandas as pd

#data visualization
import seaborn as sns
# render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline

from matplotlib import pyplot as plt
from matplotlib import style

# Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC , LinearSVC

<h1>Set Data Path</h1>

In [4]:
# Location of the Titanic CSV, relative to the directory the notebook is run from.
Data_path = './Data/titanic.csv'

In [5]:
# Read the whole CSV into a DataFrame using pandas' default parsing options.
df = pd.read_csv(Data_path)

In [6]:
# Discard rows that exactly repeat an earlier row (all columns compared),
# rebinding df to the de-duplicated frame rather than mutating it in place.
df = df.drop_duplicates()

In [7]:
# Preview the first five rows to sanity-check the parsed columns.
df.head() 

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [8]:
# (rows, columns) of the frame after de-duplication.
df.shape

(1310, 14)

<h1>Data Set for Training</h1>

In [9]:
# Randomly sample 80% of the rows as the training set; the fixed random_state
# makes the same rows come out on every run.
train_df = df.sample(frac=0.8,random_state=42)


<h1>Data Set for Testing</h1>

In [10]:
# The test set is every row whose index label was NOT drawn into train_df.
# This must run before train_df's index is reset: afterwards train_df.index no
# longer holds df's original labels and the wrong rows would be dropped.
test_df = df.drop(train_df.index)

<h1>Reset Indexing & Use them</h1>

In [11]:
# Renumber both splits from 0. Because drop=True is not passed, the previous
# index labels are kept as a new leading 'index' column, which is why these
# frames have 15 columns versus 14 in df.
# NOTE(review): if 'index' is not meant to be used later (e.g. as a model
# feature), consider reset_index(drop=True) — confirm against the cells that
# consume these frames. Re-running this cell on the already-reset frames adds
# yet another column ('level_0').
train_df = train_df.reset_index()
test_df = test_df.reset_index()

In [12]:
# (rows, columns) of the training split; the column count includes the extra
# 'index' column produced by reset_index().
train_df.shape

(1048, 15)

In [13]:
# (rows, columns) of the test split; the column count includes the extra
# 'index' column produced by reset_index().
test_df.shape

(262, 15)

In [15]:
# Column dtypes and non-null counts: a first look at where values are missing.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048 entries, 0 to 1047
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   index      1048 non-null   int64  
 1   pclass     1047 non-null   float64
 2   survived   1047 non-null   float64
 3   name       1047 non-null   object 
 4   sex        1047 non-null   object 
 5   age        839 non-null    float64
 6   sibsp      1047 non-null   float64
 7   parch      1047 non-null   float64
 8   ticket     1047 non-null   object 
 9   fare       1046 non-null   float64
 10  cabin      241 non-null    object 
 11  embarked   1045 non-null   object 
 12  boat       383 non-null    object 
 13  body       96 non-null     float64
 14  home.dest  602 non-null    object 
dtypes: float64(7), int64(1), object(7)
memory usage: 122.9+ KB


In [16]:
# Summary statistics for the numeric columns; each column's count covers only
# its non-null values.
train_df.describe()

Unnamed: 0,index,pclass,survived,age,sibsp,parch,fare,body
count,1048.0,1047.0,1047.0,839.0,1047.0,1047.0,1046.0,96.0
mean,648.078244,2.282713,0.375358,30.004172,0.488061,0.362942,34.303509,156.96875
std,377.423892,0.839009,0.484447,14.424104,1.065539,0.799188,54.240092,97.279895
min,0.0,1.0,0.0,0.4167,0.0,0.0,0.0,1.0
25%,322.75,1.5,0.0,21.0,0.0,0.0,7.8958,69.75
50%,646.5,3.0,0.0,28.0,0.0,0.0,14.4542,148.5
75%,970.25,3.0,1.0,39.0,1.0,0.0,31.3875,255.25
max,1309.0,3.0,1.0,76.0,8.0,9.0,512.3292,328.0


In [17]:
# Peek at the first training rows; the 'index' column holds each row's
# original label in df, showing that the sample is in random order.
train_df.head()

Unnamed: 0,index,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,701,3.0,0.0,"Calic, Mr. Petar",male,17.0,0.0,0.0,315086,8.6625,,S,,,
1,994,3.0,0.0,"Mardirosian, Mr. Sarkis",male,,0.0,0.0,2655,7.2292,F E46,C,,,
2,350,2.0,1.0,"Brown, Miss. Edith Eileen",female,15.0,0.0,2.0,29750,39.0,,S,14.0,,"Cape Town, South Africa / Seattle, WA"
3,986,3.0,0.0,"Maenpaa, Mr. Matti Alexanteri",male,22.0,0.0,0.0,STON/O 2. 3101275,7.125,,S,,,
4,409,2.0,0.0,"Fox, Mr. Stanley Hubert",male,36.0,0.0,0.0,229236,13.0,,S,,236.0,"Rochester, NY"


<h1>Missing Data</h1>

In [25]:
# Build a per-column summary of missing values in the training split:
# absolute count and share of rows, both ordered from most to least missing.
null_mask = train_df.isnull()          # boolean frame, True where a value is missing
null_counts = null_mask.sum()          # missing values per column

total = null_counts.sort_values(ascending=False)

# null_mask.count() is the number of rows for every column (the mask itself
# never contains NaN), so this is the percentage of rows that are missing.
percent_1 = null_counts / null_mask.count() * 100
percent_2 = percent_1.round(2).sort_values(ascending=False)

# concat aligns the two Series on column name, so the differing sort orders
# cannot mismatch counts and percentages.
missing_data = pd.concat(
    [total, percent_2],
    axis=1,
    keys=['Total', '% Percentage'],
)
missing_data

Unnamed: 0,Total,% Percentage
body,952,90.84
cabin,807,77.0
boat,665,63.45
home.dest,446,42.56
age,209,19.94
embarked,3,0.29
fare,2,0.19
pclass,1,0.1
survived,1,0.1
name,1,0.1


In [26]:
# List the column names as a NumPy array.
train_df.columns.values

array(['index', 'pclass', 'survived', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body',
       'home.dest'], dtype=object)

In [None]:
# Label strings for the two outcome groups (presumably used as plot legend
# labels — the code that consumes them is not part of the lines shown here).
survived = 'survived'
not_survived = 'not survived'

# plt.subplots (plural) creates the Figure and a 1x2 array of Axes in one call
# and returns them as a (fig, axes) tuple. The singular plt.subplot returns a
# single Axes, cannot be unpacked this way, and does not accept
# nrows/ncols/figsize keywords.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# Split the training rows by the value of the 'sex' column.
women = train_df[train_df['sex'] == 'female']
men = train_df[train_df['sex'] == 'male']
