In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## **Reading data using pandas**
Use the pandas `read_csv` function to read the CSV file into a pandas DataFrame.

In [3]:
# Location of the Titanic training CSV (uploaded to the Colab sample_data folder).
DATA_PATH = '/content/sample_data/train (1).csv'

# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
# NOTE(review): the CSV is read from /content/sample_data, not from the
# mounted drive — confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Use the `shape` attribute to find the number of rows and columns in the dataset

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(891, 12)

# **Handling NULL values**
The dataset may contain many rows and columns in which some values are missing, and we cannot leave these missing values as they are. In such cases we have two options:
        1. Either drop the entire row or column.
        2. Fill the missing values with some appropriate value.

In [None]:
# Number of missing entries in each column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Separating out the columns which have more than 35% of their values missing in the dataset

In [None]:
# Count missing values once per column, then keep only the columns whose
# count exceeds 35% of the total number of rows.
missing_per_column = df.isnull().sum()
row_cutoff = 35 / 100 * df.shape[0]
drop_col = missing_per_column[missing_per_column > row_cutoff]
drop_col

Cabin    687
dtype: int64

Drop the `Cabin` column from the dataset.

In [None]:
# Labels of the columns selected for removal (the index of drop_col).
drop_col.index

Index(['Cabin'], dtype='object')

In [None]:
# Remove the sparse columns selected in drop_col. errors='ignore' keeps the
# cell re-runnable: without it, a second run raises KeyError because the
# columns are already gone.
df = df.drop(columns=drop_col.index, errors='ignore')
df.isnull().sum()


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64

Using the `fillna` function, the missing values in the Age column can be filled with the column's mean value

In [None]:
# Fill missing numeric values with their column mean.
# numeric_only=True is required on pandas >= 2.0: the frame still holds string
# columns (Name, Sex, Ticket, Embarked) and a plain df.mean() raises TypeError.
df = df.fillna(df.mean(numeric_only=True))
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64

Because the Embarked column contains string values, we can't take its mean. So we look at the details of that column separately from the others.

In [None]:
# Summary of the Embarked column: non-null count, distinct values, most frequent value and its frequency.
df['Embarked'].describe()

count     889
unique      3
top         S
freq      644
Name: Embarked, dtype: object

 Then we can fill the NULL values of column **"Embarked"** with the most frequent value in the column

In [None]:
# Fill the missing ports with 'S', the most frequent value reported by describe().
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: that chained form warns on pandas 2.x and has no effect on df under
# Copy-on-Write.
df['Embarked'] = df['Embarked'].fillna('S')

In [None]:
# Re-check the per-column missing-value counts after the fills.
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

Now all the ***null*** values have been filled

Next we can compute the correlation between the columns

In [None]:
# Pairwise correlation of the numeric columns only.
# numeric_only=True is required on pandas >= 2.0, where corr() no longer drops
# string columns silently and raises ValueError instead.
df.corr(numeric_only=True)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.033207,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.069809,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.331339,0.083081,0.018443,-0.5495
Age,0.033207,-0.069809,-0.331339,1.0,-0.232625,-0.179191,0.091566
SibSp,-0.057527,-0.035322,0.083081,-0.232625,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.179191,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.091566,0.159651,0.216225,1.0


**SibSp**: Number of siblings/spouses aboard

**Parch**: Number of parents/children aboard

So we can make a new column, `Familysize`, by combining these two columns


In [None]:
# Combine siblings/spouses and parents/children into one Familysize column,
# then drop the two source columns it replaces.
# NOTE: this cell consumes SibSp and Parch, so it can only run once per fresh load.
df = (
    df
    .assign(Familysize=df['SibSp'] + df['Parch'])
    .drop(columns=['SibSp', 'Parch'])
)
# numeric_only=True: corr() raises on the remaining string columns in pandas >= 2.0.
df.corr(numeric_only=True)

Unnamed: 0,PassengerId,Survived,Pclass,Age,Fare,Familysize
PassengerId,1.0,-0.005007,-0.035144,0.033207,0.012658,-0.040143
Survived,-0.005007,1.0,-0.338481,-0.069809,0.257307,0.016639
Pclass,-0.035144,-0.338481,1.0,-0.331339,-0.5495,0.065997
Age,0.033207,-0.069809,-0.331339,1.0,0.091566,-0.248512
Fare,0.012658,0.257307,-0.5495,0.091566,1.0,0.217138
Familysize,-0.040143,0.016639,0.065997,-0.248512,0.217138,1.0


**Family size on the ship does not have much correlation with the survival rate**

Let's check whether travelling alone affects the survival rate

In [None]:
# Alone is 0 when the passenger has relatives aboard (Familysize > 0), else 1.
# Vectorised comparison instead of a per-row Python loop with chained lookups.
df['Alone'] = (~df['Familysize'].gt(0)).astype(int)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Embarked,Familysize,Alone
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,A/5 21171,7.25,S,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,PC 17599,71.2833,C,1,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,STON/O2. 3101282,7.925,S,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,113803,53.1,S,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,373450,8.05,S,0,1


In [None]:
# Mean of Survived (i.e. survival rate) for each value of the Alone flag.
df.groupby('Alone')['Survived'].mean()

Alone
0    0.505650
1    0.303538
Name: Survived, dtype: float64

**If the person is alone, he/she has less chance of surviving**

The reason might be that a person travelling with family is more likely to belong to a richer class and may have been prioritized over others.

In [None]:
# Correlation matrix restricted to the Alone flag and the ticket fare.
df[['Alone', 'Fare']].corr()

Unnamed: 0,Alone,Fare
Alone,1.0,-0.271832
Fare,-0.271832,1.0


So we can see that passengers who were not alone tended to pay higher fares (Alone and Fare are negatively correlated)

In [None]:
# Encode Sex as an integer flag: 0 for 'male', 1 for every other value (female).
# The dtype guard keeps the cell safe to re-run: once the column is numeric,
# comparing it against 'male' again would silently turn every row into 1.
if not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = df['Sex'].ne('male').astype(int)
df.groupby(['Sex'])['Survived'].mean()

Sex
0    0.188908
1    0.742038
Name: Survived, dtype: float64

It shows that female passengers had a higher chance of surviving than male ones.

It suggests that women were prioritized over men.

In [None]:
# Mean of Survived (i.e. survival rate) for each port of embarkation.
df.groupby('Embarked')['Survived'].mean()

Embarked
C    0.553571
Q    0.389610
S    0.336957
Name: Survived, dtype: float64

# **Conclusion**



*   Female passengers were prioritized over men.
*   Passengers in a higher class (richer passengers) had a higher survival rate than others. A class hierarchy might have been followed while saving the passengers.
*   Passengers who were travelling with their family had a higher survival rate.
*   Passengers who boarded the ship at Cherbourg survived in greater proportion than the others.



