In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

### Import the Data
Let's load our **Titanic data** from the .csv file into a pandas DataFrame and look at the available columns:

- ***Survived***: Outcome of survival (0 = No; 1 = Yes)
- ***Pclass***: Socio-economic class (1 = Upper class; 2 = Middle class; 3 = Lower class)
- ***Name***: Name of passenger
- ***Sex***: Sex of the passenger
- ***Age***: Age of the passenger (Some entries contain NaN)
- ***SibSp***: Number of siblings and spouses of the passenger aboard
- ***Parch***: Number of parents and children of the passenger aboard
- ***Ticket***: Ticket number of the passenger
- ***Fare***: Fare paid by the passenger
- ***Cabin***: Cabin number of the passenger (Some entries contain NaN)
- ***Embarked***: Port of embarkation of the passenger (C = Cherbourg; Q = Queenstown; S = Southampton)

In [None]:
# Read the passenger records from the CSV (the path is relative to the working directory).
df = pd.read_csv("titanic_dataset.csv")
# Bare expression: the notebook renders the frame as this cell's output.
df

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Per-column dtype and non-null count: a first look at where values are missing.
df.info()

## Distribution plots

In [None]:
# Distinct values that occur in the Parch column.
df['Parch'].unique()

In [None]:
# Histogram of Parch with a kernel-density curve drawn on top.
fig, ax = plt.subplots()
sns.histplot(data=df, x="Parch", kde=True, ax=ax)
plt.show()

**As we can see, most passengers have neither parents nor children aboard.**

In [None]:
# Kernel density estimate of passenger age.
# `sns.distplot` is deprecated; `sns.kdeplot` is the documented replacement
# for the `hist=False` case and matches the `histplot` call used earlier.
sns.kdeplot(data=df, x="Age")
plt.show()

**As we can see, most passengers are between 20 and 40 years old.**

In [None]:
# Histogram of passenger age with a KDE curve on a density scale.
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` is the
# documented replacement, and `cut=3` keeps the KDE tails that distplot drew.
plt.figure(figsize=(8, 8))
sns.histplot(data=df, x="Age", kde=True, stat="density", kde_kws={"cut": 3})
plt.show()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Heatmap of the boolean missing-value mask (one cell per row/column pair);
# row tick labels and the colour bar are switched off.
missing_mask = df.isnull()
ax = sns.heatmap(missing_mask, cmap='tab20c_r', cbar=False, yticklabels=False)
ax.set_title('Missing Data')
plt.show()

In [None]:
# Median of the Age column.
df['Age'].median()

In [None]:
# Mean of the Age column, for comparison with the median.
df['Age'].mean()

In [None]:
# Box plot of passenger age.
# The Series is passed explicitly as `x`: a positional argument is read as `x`
# by older seaborn releases and as `data` by newer ones, which changes the
# orientation of the plot. `plt.show()` matches the other plotting cells.
sns.boxplot(x=df['Age'])
plt.show()

In [None]:
# Age distribution per passenger class, one box per Pclass value.
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=df, x='Pclass', y='Age', palette='GnBu_d', ax=ax)
ax.set_title('Age by Passenger Class')
plt.show()

In [None]:
# Imputation function
def impute_age(cols, class_ages=None, default_age=24):
    """Return the age for one passenger, filling a missing age by class.

    Parameters
    ----------
    cols : sequence-like
        Two leading values in the order (Age, Pclass), e.g. one row of
        ``df[['Age', 'Pclass']]`` as passed by ``DataFrame.apply(axis=1)``.
    class_ages : dict, optional
        Mapping of Pclass value to the age used when Age is missing.
        Defaults to ``{1: 37, 2: 29}``.
    default_age : number, optional
        Age used when Age is missing and Pclass matches no key of
        ``class_ages``. Defaults to 24.

    Returns
    -------
    The original age when it is present, otherwise the fill value for the
    passenger's class.
    """
    if class_ages is None:
        class_ages = {1: 37, 2: 29}

    # Read the values by position without `cols[0]`: integer keys on a
    # label-indexed Series are deprecated in pandas 2.x.
    values = list(cols)
    age, pclass = values[0], values[1]

    if not pd.isnull(age):
        return age

    # Compare with `==` (not a dict lookup) so float-coded classes such as
    # 1.0 still match the integer keys.
    for klass, fill_value in class_ages.items():
        if pclass == klass:
            return fill_value
    return default_age


In [None]:
# Fill missing ages: each row of the (Age, Pclass) sub-frame is passed to
# impute_age (axis=1 means row-wise) and the result overwrites the Age column.
df['Age']=df[['Age','Pclass']].apply(impute_age, axis =1)

In [None]:
# Count of ages still missing after imputation.
df['Age'].isnull().sum()

In [None]:
# Redraw the missing-value mask to see what is still missing after the Age imputation.
sns.heatmap(df.isnull())

In [None]:
# Remove the Cabin feature.
# Reassigning instead of using inplace=True, with errors='ignore', lets this
# cell be re-run without raising KeyError once the column is already gone.
df = df.drop(columns='Cabin', errors='ignore')

In [None]:
# Missing-value mask again, now without the Cabin column.
sns.heatmap(df.isnull())

In [None]:
# Re-check dtypes and non-null counts after dropping Cabin.
df.info()

In [None]:
# Drop every row that still contains a missing value in any column.
df = df.dropna()

In [None]:
# Final check: non-null counts after removing rows with missing values.
df.info()

## **Relational plots**

In [None]:
# Fare against Age as lines, one panel per Pclass; Sex sets both colour and line style.
age_fare_lines = sns.relplot(
    data=df,
    kind="line",
    x="Age",
    y="Fare",
    hue="Sex",
    style="Sex",
    col="Pclass",
)
plt.show()

In [None]:
# Same grid without `kind`, so relplot draws a scatter plot instead of lines.
age_fare_points = sns.relplot(
    data=df,
    x="Age",
    y="Fare",
    hue="Sex",
    style="Sex",
    col="Pclass",
)
plt.show()

## **Scatterplot**

In [None]:
# Fare against Age for every passenger, coloured by Sex.
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=df, x="Age", y="Fare", hue="Sex", ax=ax)
plt.show()

## **lineplot**

In [None]:
# Fare against Age as one line per Sex (Sex sets both colour and line style).
fig, ax = plt.subplots(figsize=(8, 8))
sns.lineplot(data=df, x="Age", y="Fare", hue="Sex", style="Sex", ax=ax)
plt.show()

## **Categorical Plots**

### **barplot**

In [None]:
# Survived by Sex, split by Pclass; barplot aggregates Survived within each group.
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(data=df, x="Sex", y="Survived", hue="Pclass", ax=ax)
plt.show()

**As we can see, a higher proportion of women survived than men (the bar height is the mean of `Survived`, i.e. the survival rate).**

### **stripplot**

In [None]:
# One point per passenger: Age on the y-axis, grouped and coloured by Sex.
fig, ax = plt.subplots(figsize=(8, 8))
sns.stripplot(data=df, x="Sex", y="Age", hue='Sex', ax=ax)
plt.show()

### **swarmplot**

In [None]:
# Swarm plot of Age, grouped and coloured by Sex.
fig, ax = plt.subplots(figsize=(8, 8))
sns.swarmplot(data=df, x="Sex", y="Age", hue='Sex', ax=ax)
plt.show()

**We can say that most passengers are approximately between 18 and 40 years old.**

### **violinplot**

Violin plots combine a box plot with a kernel density estimate, so they make it easier to analyze and understand the distribution of the data.

In [None]:
# Age distribution for each Survived value, with one violin per Sex.
age_violin_ax = sns.violinplot(data=df, x="Survived", y="Age", hue='Sex')
plt.show()

### **Countplot**

In [None]:
# Number of passengers for each Survived value.
survived_ax = sns.countplot(data=df, x="Survived", palette="Blues")
plt.show()

In [None]:
# Number of passengers per class, split by Sex.
pclass_ax = sns.countplot(data=df, x="Pclass", hue="Sex")
plt.show()

In [None]:
# Number of passengers per Sex; passing `y` puts the categories on the y-axis.
sex_ax = sns.countplot(data=df, y="Sex")
plt.show()

## **Multi-plot grids**

### **Facet grids**

In [None]:
# Build a grid with one row per Pclass and one column per Survived value,
# then draw an Age histogram in every panel.
g = sns.FacetGrid(data=df, row='Pclass', col='Survived')
g.map(plt.hist, 'Age').add_legend()
plt.show()

In [None]:
# Pairwise relationships between the columns of df.
pair_grid = sns.pairplot(df)
plt.show()

## **Matrix plots**

### **heatmap**

In [None]:
# Pairwise correlation of the numeric columns.
# numeric_only=True is required because df still holds string columns: without
# it, pandas >= 2.0 raises ValueError when it tries to convert them to float.
df.corr(numeric_only=True)

In [None]:
# Keep only the float64 / int64 columns so the correlation cells work on numeric data.
numeric_df = df.select_dtypes(include=['float64', 'int64'])
# Bare expression: render the numeric sub-frame as the cell output.
numeric_df

In [None]:
# Confirm the dtype of every column that was kept.
numeric_df.dtypes

In [None]:
# Pairwise correlation matrix of the numeric columns.
numeric_df.corr()

In [None]:
# Correlation matrix as an annotated heatmap, values shown to two decimals.
corr_matrix = numeric_df.corr()
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="YlGnBu")
plt.show()