<a href="https://www.kaggle.com/code/rupanshirana/titanic-eda?scriptVersionId=116528575" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

## Reading the data set of titanic-data/train

In [None]:
# Load the Titanic training set. The path uses single separators throughout:
# a leading "//" is implementation-defined on POSIX systems, so it is avoided.
df = pd.read_csv("/kaggle/input/titanic-data/train.csv")
df

## checking the shape of data

In [None]:
df.shape

### how does the data look?

In [None]:
# Only the last expression of a cell is rendered, so the head is shown explicitly.
display(df.head())  ## viewing top 5 rows
# A fixed seed makes the random sample identical on every run.
df.sample(5, random_state=42)  ## 5 random rows, so the data is not judged by its first rows only

### checking the column types

In [None]:
df.dtypes

## Preprocessing
- Survived and Pclass are currently stored as numeric types, but they are really categorical variables.
- For the analysis we will therefore change their data type to object.

#### checking the columns names

In [None]:
df.columns

## basic information

In [None]:
df.info()
## from this we know, age, cabin and embarked have missing values

#### checking the missing values

In [None]:
df.isnull().sum()

### checking missing %

In [None]:
# Report, for every column that has missing values, the percentage of rows missing.
n_rows = df.shape[0]
for col_name, n_missing in df.isnull().sum().items():
    if n_missing > 0:
        print(col_name, "----", n_missing / n_rows * 100)

In [None]:
df["Cabin"].unique()
## only the leading letter is needed for the analysis; the numbers that follow it make the raw values messy

#### nan value default type is float, so fill the missing values of cabin first

In [None]:
# Fill missing Cabin values with the most frequent cabin. Assigning the result
# back avoids an in-place fillna on a column selection, which may act on a
# temporary copy and leave df unchanged.
df["Cabin"] = df["Cabin"].fillna(df["Cabin"].mode()[0])

In [None]:
# Keep only the first character of each Cabin value. The vectorised .str
# accessor is used so that any remaining missing value stays NaN instead of
# raising an error.
df["Cabin_name"] = df["Cabin"].str[0]
df["Cabin_name"]

In [None]:
df["Cabin_name"].unique()

### filling the missing values of embarked and age

In [None]:
# Fill missing Embarked values with the most frequent port; assign the result
# back rather than calling fillna in place on a column selection.
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])

### Since Age is numeric, we can fill it with either the mean or the median, but we first have to check for outliers; only then can we make that decision

In [None]:
import seaborn as sns

In [None]:
# Box plot of Age to look for outliers. The column is passed by keyword because
# recent seaborn versions accept only `data` as a positional argument.
sns.boxplot(x="Age", data=df);

In [None]:
df["Age"].skew()

In [None]:
# Fill missing ages with the median; assign the result back rather than
# calling fillna in place on a column selection.
df["Age"] = df["Age"].fillna(df["Age"].median())

In [None]:
df.isnull().sum()

### From this data we can see that PassengerId and Ticket are irrelevant, and Cabin has been replaced by Cabin_name, so we drop these three columns

In [None]:
# Drop the columns that are not used in the analysis. errors="ignore" lets the
# cell be re-run without raising once the columns are already gone.
df = df.drop(columns=["PassengerId", "Ticket", "Cabin"], errors="ignore")

In [None]:
df.shape

In [None]:
df.duplicated().sum()  ## no rows is duplicated

In [None]:
# Print the distinct values found in every column.
for col_name in df.columns:
    distinct_values = df[col_name].unique()
    print(col_name, "---------------", distinct_values)

## checking statistical summary of data

In [None]:
df.describe().T  ## for numeric data type

In [None]:
df.describe(include="object").T  ## for object data type

### checking the correlation

In [None]:
# Correlate the numeric columns only: the frame still holds text columns
# (e.g. Name, Sex), which recent pandas versions refuse to correlate.
df.select_dtypes(include="number").corr()

- from this we know,there is no strong correlation between the columns
- relation between pclass and fare is of moderate that is -0.549500, it clearly depicts the fare of first passenger class is high

In [None]:
# Heatmap of the correlations between the numeric columns (text columns are
# excluded because they cannot be correlated).
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True);

## EDA

### checking the column types again

In [None]:
df.dtypes

In [None]:
# Store Survived and Pclass as object dtype so they are treated as categories.
categorical_cols = ["Survived", "Pclass"]
df = df.astype({col_name: "object" for col_name in categorical_cols})

In [None]:
df.dtypes

In [None]:
df.info()

## first fetch categorical object

In [None]:
df.select_dtypes(include="object").columns

### taking columns one by one (univariate analysis)

### Survived

In [None]:
df["Survived"].unique()

In [None]:
df["Survived"].value_counts()

## OR

In [None]:
import seaborn as sns

In [None]:
# Number of passengers per Survived value (column passed by keyword, as
# recent seaborn versions require).
sns.countplot(x="Survived", data=df);

- insights: more people died 

In [None]:
import matplotlib.pyplot as plt

### how many percent survived

In [None]:
# Share of passengers per Survived value. Labels are looked up from the
# value_counts() index so each wedge always carries the label of its own value.
survived_names = {0: "not_survived", 1: "survived"}
survived_counts = df["Survived"].value_counts()
survived_labels = [survived_names[value] for value in survived_counts.index]
plt.pie(survived_counts, labels=survived_labels, autopct="%0.0f%%")
plt.legend();

In [None]:
df["Survived"].value_counts()/len(df)*100  ## using pandas

## PClass

In [None]:
df["Pclass"].unique()

In [None]:
df["Pclass"].value_counts()

In [None]:
# Number of passengers per passenger class (column passed by keyword, as
# recent seaborn versions require).
sns.countplot(x="Pclass", data=df);

- insights: people travelled more in 3rd class and min in 2nd class

In [None]:
# Share of passengers per class. value_counts() orders classes by frequency,
# not by class number, so the labels are taken from its index instead of a
# hand-written list; this keeps every label on its own wedge.
pclass_counts = df["Pclass"].value_counts()
plt.pie(pclass_counts, labels=pclass_counts.index.astype(str), autopct="%0.0f%%")
plt.legend();

In [None]:
df["Pclass"].value_counts()/len(df)*100

- insights: 55% of people travelled in 3rd class, 24% travelled in 1st class and 21% travelled in 2nd class

## Sex

In [None]:
df["Sex"].unique()

In [None]:
df["Sex"].value_counts()

In [None]:
# Number of passengers per sex (column passed by keyword, as recent seaborn
# versions require).
sns.countplot(x="Sex", data=df);

In [None]:
# Share of passengers per sex. Short labels are looked up from the
# value_counts() index so they cannot drift out of step with the wedge sizes.
sex_names = {"male": "m", "female": "f"}
sex_counts = df["Sex"].value_counts()
sex_labels = [sex_names[value] for value in sex_counts.index]
plt.pie(sex_counts, labels=sex_labels, autopct="%0.0f%%")
plt.legend();

In [None]:
df["Sex"].value_counts()/len(df)*100

- insights: about 65% of the passengers were male and about 35% were female

## Embarked

In [None]:
df["Embarked"].unique()

In [None]:
df["Embarked"].value_counts()

In [None]:
# Number of passengers per port of embarkation (column passed by keyword, as
# recent seaborn versions require).
sns.countplot(x="Embarked", data=df);

In [None]:
# Share of passengers per port. Labels come from the value_counts() index so
# each wedge is labelled with the port it actually represents.
embarked_counts = df["Embarked"].value_counts()
plt.pie(embarked_counts, labels=embarked_counts.index, autopct="%0.0f%%")
plt.legend();

In [None]:
df["Embarked"].value_counts()/len(df)*100

- insights: about 72.5% of people embarked at S, 18.9% at C and 8.6% at Q

## Cabin_name

In [None]:
df["Cabin_name"].unique()

In [None]:
df["Cabin_name"].value_counts()

In [None]:
# Number of passengers per cabin letter (column passed by keyword, as recent
# seaborn versions require).
sns.countplot(x="Cabin_name", data=df);

In [None]:
# Share of passengers per cabin letter. The wedge sizes follow the
# value_counts() order (most frequent first), so the labels must come from the
# same index; a separately written list of letters would mislabel the wedges.
plt.figure(figsize=(10,12))
cabin_counts = df["Cabin_name"].value_counts()
plt.pie(cabin_counts, labels=cabin_counts.index, autopct="%0.0f%%")
plt.legend();

In [None]:
df["Cabin_name"].value_counts()/len(df)*100

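- insights: 82% of rows fall in cabin group B, but this is mostly an artifact of the imputation: the majority of Cabin values (roughly three quarters) were missing and were filled with the mode, which is a B cabin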

## Name

In [None]:
df["Name"].nunique()

### fetching salutations of names (feature splitting)

In [None]:
df["Name"].unique()

### feature splitting part of feature construction under feature engineering

In [None]:
# The salutation is the text between the comma and the first full stop of the
# name. Splitting leaves the space that follows the comma, so it is stripped to
# give clean category values such as "Mr" rather than " Mr".
after_comma = df["Name"].str.split(",", expand=True)[1]
df["salutation"] = after_comma.str.split(".", expand=True)[0].str.strip()
df["salutation"]


In [None]:
df["salutation"].unique()

In [None]:
# Share of passengers per salutation. The wedge sizes follow the
# value_counts() order (most frequent first), so the labels are taken from the
# same index; a list written in order of first appearance would put the wrong
# salutation on most wedges.
plt.figure(figsize=(8,10))
salutation_counts = df["salutation"].value_counts()
plt.pie(salutation_counts, labels=salutation_counts.index, autopct="%0.0f%%", labeldistance=2.1)
plt.legend();

- Can not see properly, so we will use countplot to plot 

In [None]:
# Number of passengers per salutation (column passed by keyword, as recent
# seaborn versions require); tick labels are rotated so they do not overlap.
sns.countplot(x="salutation", data=df);
plt.xticks(rotation=90);

- The 'Mr' salutation was used most; 'Jonkheer' is among the least used (several salutations appear only once)

## Taking numeric data

In [None]:
numeric=["int","float"]
df.select_dtypes(include=numeric).columns

## sibsp, Parch

- for family count, we will add sibsp and parch columns, again feature engineering

In [None]:
# Family count: add the Parch and SibSp columns row by row.
family_size = df["Parch"].add(df["SibSp"])
df["family_members"] = family_size
df.head()


In [None]:
# Number of passengers per family size (column passed by keyword, as recent
# seaborn versions require).
sns.countplot(x="family_members", data=df);

- insights: most passengers travelled alone (family_members = 0)

## Age

In [None]:
df["Age"].agg(["min","max","mean"])

- average age of passengers : 29

In [None]:
# Density histogram of Age with a KDE curve on top. histplot is used because
# distplot is deprecated and has been removed from current seaborn releases.
sns.histplot(data=df, x="Age", kde=True, stat="density");

### OR

In [None]:
# Histogram of Age only (counts, no KDE curve). histplot is used because
# distplot is deprecated and has been removed from current seaborn releases.
sns.histplot(data=df, x="Age");

In [None]:
# Box plot of Age (column passed by keyword, as recent seaborn versions require).
sns.boxplot(x="Age", data=df);

In [None]:
df["Age"].skew()

In [None]:
df["Age"].unique()

- Most travellers were between 20 and 40 years old

- how many people are of different age ? 
- binning :column transform from numeric to discrete (discretization)

In [None]:
import numpy as np

In [None]:
# Bin Age into four labelled, right-closed intervals:
# (0, 14], (14, 25], (25, 60] and (60, inf].
age_edges = [0, 14, 25, 60, np.inf]
age_labels = ["children", "youth", "adults", "senior_citizens"]
df["age_category"] = pd.cut(df["Age"], bins=age_edges, labels=age_labels)
df.head()

In [None]:
# Number of passengers per age category (column passed by keyword, as recent
# seaborn versions require).
sns.countplot(x="age_category", data=df);

### of which age people travelled more?
## adults

## Fare

In [None]:
df["Fare"].agg(["min","max","mean"])

- average fare of passengers : 32

In [None]:
# Density histogram of Fare with a KDE curve on top. histplot is used because
# distplot is deprecated and has been removed from current seaborn releases.
sns.histplot(data=df, x="Fare", kde=True, stat="density");

In [None]:
# Histogram of Fare only (counts, no KDE curve). histplot is used because
# distplot is deprecated and has been removed from current seaborn releases.
sns.histplot(data=df, x="Fare");

In [None]:
# Box plot of Fare (column passed by keyword, as recent seaborn versions require).
sns.boxplot(x="Fare", data=df);

In [None]:
df["Fare"].skew()

In [None]:
df["Fare"].nunique()

- Most travellers paid a fare between 0 and 50

### bivariate analysis of PClass and Age

In [None]:
import pandas as pd

In [None]:
df.groupby("Pclass")["Age"].mean()

### barplot

### through visualization

In [None]:
# Mean Age per passenger class. x and y are passed by keyword: recent seaborn
# versions reject more than one positional argument.
sns.barplot(x="Pclass", y="Age", data=df);

## find out the avg age of male and female

In [None]:
df.groupby("Sex")["Age"].mean()

In [None]:
# Mean Age per sex. x and y are passed by keyword: recent seaborn versions
# reject more than one positional argument.
sns.barplot(x="Sex", y="Age", data=df);

## find out the avg age of the passengers who survived or not survived

In [None]:
df.groupby("Survived")["Age"].mean()

In [None]:
# Mean Age of passengers who survived vs. did not survive. x and y are passed
# by keyword: recent seaborn versions reject more than one positional argument.
sns.barplot(x="Survived", y="Age", data=df);

## of which age male/female survived or not   ## multivariate analysis

In [None]:
df.groupby(["Survived","Sex"])["Age"].mean()

In [None]:
# Age distribution per sex, split by survival. x and y are passed by keyword:
# recent seaborn versions reject more than one positional argument.
sns.boxplot(x="Sex", y="Age", hue="Survived", data=df);

### comparison of passenger age with respect to survived or not survived

In [None]:
# Overlay the age distributions of survivors and non-survivors. Each group is
# drawn as its own density histogram with a KDE curve; histplot is used because
# distplot is deprecated and has been removed from current seaborn releases.
age_surv = df.loc[df["Survived"] == 1, "Age"]
age_nsurv = df.loc[df["Survived"] == 0, "Age"]
sns.histplot(age_surv, kde=True, stat="density", label="survived")
sns.histplot(age_nsurv, kde=True, stat="density", label="not_survived")
plt.legend();

- Children had a higher probability of surviving
- Young adults died in larger numbers

In [None]:
df.groupby("Embarked")["Age"].mean()

- through visualization

In [None]:
# Age against port of embarkation. x and y are passed by keyword: recent
# seaborn versions reject more than one positional argument.
sns.relplot(x="Age", y="Embarked", data=df);

- Insights:
- 80 aged person travelled in S
- More people travelled in S
- Less people travelled in Q compared to S and C

In [None]:
# Mean Age per port of embarkation. x and y are passed by keyword: recent
# seaborn versions reject more than one positional argument.
sns.barplot(x="Embarked", y="Age", data=df);

In [None]:
df.groupby(["Embarked","Sex"])["Age"].mean()

In [None]:
# Age against port of embarkation, coloured by sex. All variables are passed
# by keyword: recent seaborn versions reject more than one positional argument.
sns.relplot(x="Age", y="Embarked", hue="Sex", data=df);

- Insights: max aged and min aged person were male

In [None]:
# Age against port of embarkation, coloured by survival. All variables are
# passed by keyword: recent seaborn versions reject more than one positional
# argument.
sns.relplot(x="Age", y="Embarked", hue="Survived", data=df);

- Max and min aged person both survived 

In [None]:
# Age distribution per sex, split by port of embarkation. x and y are passed
# by keyword: recent seaborn versions reject more than one positional argument.
sns.boxplot(x="Sex", y="Age", hue="Embarked", data=df);

- comparison of avg passenger age with respect to embarked and sex

In [None]:
df.groupby(["Embarked","Sex"])["Age"].mean()

##  Age and Fare - both numeric, plot scatter

In [None]:
# Scatter plot of Fare against Age, taking both columns from df by name.
sns.scatterplot(data=df, x="Age", y="Fare")

- children fare is constant
- young people fare was more compared to others

- 1. find out the average fare of each pclass of passengers
- 2. find out the average fare spent by male and female of each class
- 3. of which age groups(age_category) people died most
- 4. of which salutations people died most
- 5. who died most males or females(survived and sex)
- 6. of which embarked passengers died more
- 7. find out the min, max and average fare spent by survived  or not survived passengers
- 8. how many males or females survived of each class
- 9. how fare is related to family members (both numeric)

In [None]:
# 1 find out the average fare of each pclass of passengers
df.groupby("Pclass")["Fare"].mean()

In [None]:
# Mean Fare per passenger class. x and y are passed by keyword: recent seaborn
# versions reject more than one positional argument.
sns.barplot(x="Pclass", y="Fare", data=df);

In [None]:
# 2 find out the average fare spent by male and female of each class
df.groupby(["Pclass","Sex"])["Fare"].mean()

In [None]:
# Mean Fare per sex, split by passenger class. All variables are passed by
# keyword: recent seaborn versions reject more than one positional argument.
sns.barplot(x="Sex", y="Fare", hue="Pclass", data=df);

In [None]:
# 3 of which age groups(age_category) people died most
df.groupby("Survived")["age_category"].value_counts()

In [None]:
# Passenger counts per Survived value, split by age category (columns passed
# by keyword, as recent seaborn versions require).
sns.countplot(x="Survived", hue="age_category", data=df);

- Most of the passengers who died were adults

In [None]:
# 4 of which salutations people died most
df.groupby("Survived")["salutation"].value_counts()

In [None]:
# Passenger counts per Survived value, split by salutation (columns passed by
# keyword, as recent seaborn versions require).
sns.countplot(x="Survived", hue="salutation", data=df);

- Men(Mr) died most

In [None]:
# 5 who died most males or females(survived and sex)
df.groupby(["Sex"])["Survived"].value_counts()

In [None]:
# Passenger counts per Survived value, split by sex (columns passed by
# keyword, as recent seaborn versions require).
sns.countplot(x="Survived", hue="Sex", data=df);

- Male survived less

In [None]:
# 6 of which embarked passengers died more
df.groupby(["Survived"])["Embarked"].value_counts()

In [None]:
# Passenger counts per Survived value, split by port of embarkation (columns
# passed by keyword, as recent seaborn versions require).
sns.countplot(x="Survived", hue="Embarked", data=df);

- S port people died most

In [None]:
# 7 find out the min, max and average fare spent by survived or not survived passengers
df.groupby(["Survived"])["Fare"].agg(["min","max","mean"])

In [None]:
# Mean Fare of passengers who survived vs. did not survive. x and y are passed
# by keyword: recent seaborn versions reject more than one positional argument.
sns.barplot(x="Survived", y="Fare", data=df);

In [None]:
# Minimum Fare of passengers who survived vs. did not survive. x and y are
# passed by keyword: recent seaborn versions reject more than one positional
# argument.
sns.barplot(x="Survived", y="Fare", data=df, estimator=min);

In [None]:
# Maximum Fare of passengers who survived vs. did not survive. x and y are
# passed by keyword: recent seaborn versions reject more than one positional
# argument.
sns.barplot(x="Survived", y="Fare", data=df, estimator=max);

In [None]:
# 8 how many males or females survived of each class
df.groupby(["Sex","Pclass"])["Survived"].value_counts()

In [None]:
# Survival rate (mean of Survived) per sex, split by passenger class.
# Survived was converted to object dtype earlier, so it is cast back to int
# here to make it a numeric value that can be averaged. All variables are
# passed by keyword: recent seaborn versions reject more than one positional
# argument.
sns.barplot(x="Sex", y="Survived", hue="Pclass", data=df.astype({"Survived": int}));

In [None]:
# 9 how fare is related to family members (both numeric)
sns.scatterplot(x="family_members",y="Fare",data=df);

In [None]:
df[["family_members","Fare"]].corr()

In [None]:
sns.heatmap(df[["family_members","Fare"]].corr(),annot=True)