## Importing the library

In [1]:
import pandas as pd

## Analysis of dataset

In [2]:
# Read the Titanic passenger records from the CSV file into the DataFrame `df`
df = pd.read_csv("titanic_data_set.csv")
# Preview the first five rows to check that the file parsed as expected
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Report the dataset dimensions; df.shape is a (rows, columns) tuple.
# The single-argument print(...) form prints the same text under Python 2
# and also runs under Python 3, where the print statement is a syntax error.
print("Rows and Columns : " + str(df.shape))

Rows and Columns : (891, 12)


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# A count below the row total for a column indicates missing values in it.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
# Summarise only the object-dtype (string) columns: 'O' selects the object dtype,
# giving count / unique / top / freq instead of numeric statistics.
df.describe(include = ['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Graham, Mr. George Edward",male,CA. 2343,C23 C25 C27,S
freq,1,577,7,4,644


In [6]:
# Count missing values per column: isnull() flags each missing cell as True,
# and sum() adds up those flags column by column.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## Deriving relationship between features and survival
Here, we analyze the relationship between individual features and survival, looking at how different values of each feature correspond to different chances of survival.

In [7]:
# Keep only the rows whose Survived flag is 1.
survived = df[df["Survived"] == 1]
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
print("Number of passengers who survived : " + str(len(survived)))

Number of passengers who survived : 342


In [8]:
# Keep only the rows whose Survived flag is 0.
not_survived = df[df["Survived"] == 0]
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
print("Number of passengers who did not survive : " + str(len(not_survived)))

Number of passengers who did not survive : 549


In [9]:
# Survivors and non-survivors as a percentage of all passengers.
# float(...) keeps the division from truncating to an integer under Python 2.
s = float(len(survived)) / len(df) * 100.0
ns = float(len(not_survived)) / len(df) * 100.0

# `%` binds tighter than `+`, so the number is formatted first and the literal
# percent sign is appended afterwards. Each print(...) takes a single argument,
# so the output is identical on Python 2 and the cell also runs on Python 3.
print("Percentage of passengers who survived : %.2f" % s + "%")
print("Percentage of passengers who did not survive : %.2f" % ns + "%")
print("Total number of passengers : %d" % len(df))

Percentage of passengers who survived : 38.38%
Percentage of passengers who did not survive : 61.62%
Total number of passengers : 891


#### Pclass vs Survival
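Higher-class passengers had a better chance of survival: based on the counts below, about 63% of 1st-class passengers survived, compared with about 47% of 2nd-class and about 24% of 3rd-class passengers.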

In [10]:
# Number of passengers in each ticket class
df["Pclass"].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [11]:
# Survived / not-survived counts within each ticket class
df.groupby('Pclass')["Survived"].value_counts()

Pclass  Survived
1       1           136
        0            80
2       0            97
        1            87
3       0           372
        1           119
Name: Survived, dtype: int64

#### Sex vs Survival
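Females had a much better chance of survival: based on the counts below, about 74% of female passengers survived, compared with about 19% of male passengers.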

In [12]:
# Number of male and female passengers
df["Sex"].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [13]:
# Survived / not-survived counts for each sex
df.groupby('Sex')["Survived"].value_counts()

Sex     Survived
female  1           233
        0            81
male    0           468
        1           109
Name: Survived, dtype: int64

#### Analysing females and males of different classes who survived

In [14]:
# Survived / not-survived counts for every (ticket class, sex) combination
group_keys = ["Pclass", "Sex"]
df.groupby(group_keys)["Survived"].value_counts()

Pclass  Sex     Survived
1       female  1            91
                0             3
        male    0            77
                1            45
2       female  1            70
                0             6
        male    0            91
                1            17
3       female  0            72
                1            72
        male    0           300
                1            47
Name: Survived, dtype: int64

In [15]:
# Survival counts keyed by (Pclass, Sex, Survived).
i = df.groupby(["Pclass","Sex"]).Survived.value_counts()

# Walk the (index tuple, count) pairs. zip(i.index, i) yields the same pairs as
# Series.iteritems() but does not depend on that method, which newer pandas
# releases no longer provide.
for key, values in zip(i.index, i):
    # One pre-formatted string keeps the "key value" output identical to the
    # Python 2 statement `print key, values`, and also runs on Python 3.
    print("{0} {1}".format(key, values))

(1L, 'female', 1L) 91
(1L, 'female', 0L) 3
(1L, 'male', 0L) 77
(1L, 'male', 1L) 45
(2L, 'female', 1L) 70
(2L, 'female', 0L) 6
(2L, 'male', 0L) 91
(2L, 'male', 1L) 17
(3L, 'female', 0L) 72
(3L, 'female', 1L) 72
(3L, 'male', 0L) 300
(3L, 'male', 1L) 47


In [16]:
# Fraction (0-1) of first-class females who survived: survivors / (survivors + non-survivors).
# `i` is indexed by (Pclass, Sex, Survived). float() prevents Python 2 integer
# division without the precision loss of a float32 cast.
perc_of_1class_Survived = float(i[1, 'female', 1]) / (i[1, 'female', 1] + i[1, 'female', 0])
# Scaled to a percentage only for display; print(...) works on Python 2 and 3.
print("Percentage of females of class 1 who survived : %.2f" % (perc_of_1class_Survived * 100))

Percentage of females of class 1 who survived : 96.81


In [17]:
# Fraction (0-1) of first-class males who survived: survivors / (survivors + non-survivors).
# `i` is indexed by (Pclass, Sex, Survived). float() prevents Python 2 integer
# division without the precision loss of a float32 cast.
perc_of_1class_Survived = float(i[1, 'male', 1]) / (i[1, 'male', 1] + i[1, 'male', 0])
# Scaled to a percentage only for display; print(...) works on Python 2 and 3.
print("Percentage of males of class 1 who survived : %.2f" % (perc_of_1class_Survived * 100))

Percentage of males of class 1 who survived : 36.89


In [18]:
# Fraction (0-1) of second-class females who survived: survivors / (survivors + non-survivors).
# `i` is indexed by (Pclass, Sex, Survived). float() prevents Python 2 integer
# division without the precision loss of a float32 cast.
perc_of_2class_Survived = float(i[2, 'female', 1]) / (i[2, 'female', 1] + i[2, 'female', 0])
# Scaled to a percentage only for display; print(...) works on Python 2 and 3.
print("Percentage of females of class 2 who survived : %.2f" % (perc_of_2class_Survived * 100))

Percentage of females of class 2 who survived : 92.11


In [19]:
# Fraction (0-1) of second-class males who survived: survivors / (survivors + non-survivors).
# `i` is indexed by (Pclass, Sex, Survived). float() prevents Python 2 integer
# division without the precision loss of a float32 cast.
perc_of_2class_Survived = float(i[2, 'male', 1]) / (i[2, 'male', 1] + i[2, 'male', 0])
# Scaled to a percentage only for display; print(...) works on Python 2 and 3.
print("Percentage of males of class 2 who survived : %.2f" % (perc_of_2class_Survived * 100))

Percentage of males of class 2 who survived : 15.74


In [20]:
# Fraction (0-1) of third-class females who survived: survivors / (survivors + non-survivors).
# `i` is indexed by (Pclass, Sex, Survived). float() prevents Python 2 integer
# division without the precision loss of a float32 cast.
perc_of_3class_Survived = float(i[3, 'female', 1]) / (i[3, 'female', 1] + i[3, 'female', 0])
# Scaled to a percentage only for display; print(...) works on Python 2 and 3.
print("Percentage of females of class 3 who survived : %.2f" % (perc_of_3class_Survived * 100))

Percentage of females of class 3 who survived : 50.00


In [21]:
# Fraction (0-1) of third-class males who survived: survivors / (survivors + non-survivors).
# `i` is indexed by (Pclass, Sex, Survived). float() prevents Python 2 integer
# division without the precision loss of a float32 cast.
perc_of_3class_Survived = float(i[3, 'male', 1]) / (i[3, 'male', 1] + i[3, 'male', 0])
# Scaled to a percentage only for display; print(...) works on Python 2 and 3.
print("Percentage of males of class 3 who survived : %.2f" % (perc_of_3class_Survived * 100))

Percentage of males of class 3 who survived : 13.54


## Visualising dataset

In [22]:
# Plotting setup for the visualisation section.
# NOTE(review): `mt` and `se` are non-standard aliases (`plt` and `sns` are the
# usual ones); they are presumably used by later cells, so confirm before renaming.
import matplotlib.pyplot as mt
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
import seaborn as se
# Apply seaborn's default plot styling to subsequent figures.
se.set()