In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

## Correlation analysis
Correlation analysis is a statistical technique used to examine the strength and direction of the relationship between two or more variables. It involves analysing the degree to which changes in one variable are associated with changes in another variable.

### Pandas .corr method
Calculates the correlation coefficients between all pairs of features in a DataFrame, and returns a new DataFrame where the number of rows and the number of columns both match the number of columns in the input DataFrame.

In [5]:
# Read the cleaned, encoded Titanic dataset saved by the previous Code-Along.
titanic = pd.read_json("../Data/titanic_encoded.json")

# Pairwise correlations between all columns. Taking the absolute value keeps
# only the strength of each relationship and discards its direction (sign).
signed_correlations = titanic.corr()
correlation_matrix = signed_correlations.abs()
correlation_matrix

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,adult_male,deck,embark_town,alive,alone,who_child,who_man,who_woman
survived,1.0,0.338481,0.543351,0.077221,0.035322,0.081629,0.257307,0.108669,0.338481,0.55708,0.294804,0.163517,1.0,0.203367,0.136107,0.55708,0.506562
pclass,0.338481,1.0,0.1319,0.369226,0.083081,0.018443,0.5495,0.043835,1.0,0.094035,0.743251,0.157112,0.338481,0.135207,0.12192,0.094035,0.177049
sex,0.543351,0.1319,1.0,0.093254,0.114631,0.245489,0.182333,0.118593,0.1319,0.908578,0.118282,0.104057,0.543351,0.303646,0.111141,0.908578,0.896214
age,0.077221,0.369226,0.093254,1.0,0.308247,0.189119,0.096067,0.012186,0.369226,0.280328,0.267987,0.025252,0.077221,0.19827,0.582875,0.280328,0.105081
sibsp,0.035322,0.083081,0.114631,0.308247,1.0,0.414838,0.159651,0.060606,0.083081,0.253586,0.041333,0.066654,0.035322,0.584471,0.352437,0.253586,0.047071
parch,0.081629,0.018443,0.245489,0.189119,0.414838,1.0,0.216225,0.07932,0.018443,0.349943,0.031308,0.038322,0.081629,0.583398,0.351481,0.349943,0.150167
fare,0.257307,0.5495,0.182333,0.096067,0.159651,0.216225,1.0,0.063462,0.5495,0.182024,0.525994,0.221226,0.257307,0.271832,0.003753,0.182024,0.191243
embarked,0.108669,0.043835,0.118593,0.012186,0.060606,0.07932,0.063462,1.0,0.043835,0.110351,0.042195,0.760766,0.108669,0.018867,0.012545,0.110351,0.125425
class,0.338481,1.0,0.1319,0.369226,0.083081,0.018443,0.5495,0.043835,1.0,0.094035,0.743251,0.157112,0.338481,0.135207,0.12192,0.094035,0.177049
adult_male,0.55708,0.094035,0.908578,0.280328,0.253586,0.349943,0.182024,0.110351,0.094035,1.0,0.098553,0.088725,0.55708,0.404744,0.394747,1.0,0.814281


In [8]:
# Correlation threshold: any pair of columns whose absolute correlation
# coefficient is strictly greater than this value is treated as highly correlated.
threshold = 0.85

In [9]:
# Find which columns are superfluous because they are highly correlated with
# another column.
# Keep only the strict upper triangle of the matrix (k=1 excludes the diagonal),
# so each pair of columns appears once and no column is compared with itself.
# Every other cell becomes NaN.
upper = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))

# Styling does not modify the DataFrame; it returns a Styler object. That Styler
# must be the last expression of the cell for the bold formatting to be rendered
# (displaying the plain `upper` frame shows no styling). NaN cells compare as
# False against the threshold and are therefore left unstyled.
upper.style.map(lambda x: "font-weight: bold" if x > threshold else "")

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,adult_male,deck,embark_town,alive,alone,who_child,who_man,who_woman
survived,,0.338481,0.543351,0.077221,0.035322,0.081629,0.257307,0.108669,0.338481,0.55708,0.294804,0.163517,1.0,0.203367,0.136107,0.55708,0.506562
pclass,,,0.1319,0.369226,0.083081,0.018443,0.5495,0.043835,1.0,0.094035,0.743251,0.157112,0.338481,0.135207,0.12192,0.094035,0.177049
sex,,,,0.093254,0.114631,0.245489,0.182333,0.118593,0.1319,0.908578,0.118282,0.104057,0.543351,0.303646,0.111141,0.908578,0.896214
age,,,,,0.308247,0.189119,0.096067,0.012186,0.369226,0.280328,0.267987,0.025252,0.077221,0.19827,0.582875,0.280328,0.105081
sibsp,,,,,,0.414838,0.159651,0.060606,0.083081,0.253586,0.041333,0.066654,0.035322,0.584471,0.352437,0.253586,0.047071
parch,,,,,,,0.216225,0.07932,0.018443,0.349943,0.031308,0.038322,0.081629,0.583398,0.351481,0.349943,0.150167
fare,,,,,,,,0.063462,0.5495,0.182024,0.525994,0.221226,0.257307,0.271832,0.003753,0.182024,0.191243
embarked,,,,,,,,,0.043835,0.110351,0.042195,0.760766,0.108669,0.018867,0.012545,0.110351,0.125425
class,,,,,,,,,,0.094035,0.743251,0.157112,0.338481,0.135207,0.12192,0.094035,0.177049
adult_male,,,,,,,,,,,0.098553,0.088725,0.55708,0.404744,0.394747,1.0,0.814281


In [14]:
# A column is redundant when it is highly correlated with at least one column
# that appears before it, so of every highly correlated pair the earlier column
# is kept. For the column at `position`, only the first `position` entries of
# its row (the part below the diagonal) are inspected; for the first column
# that slice is empty and nothing is flagged.
columns_to_drop = [
    column_name
    for position, column_name in enumerate(correlation_matrix.columns)
    if (correlation_matrix.iloc[position, :position] > threshold).any()
]
columns_to_drop

survived: pclass: sex: age: sibsp: parch: fare: embarked: class: adult_male: deck: embark_town: alive: alone: who_child: who_man: who_woman: 

['class', 'adult_male', 'alive', 'who_man', 'who_woman']

In [15]:
# `drop` returns a new DataFrame and leaves `titanic` untouched. Bind the result
# to its own name so the reduced data can be reused by later cells, then display it.
titanic_reduced = titanic.drop(columns=columns_to_drop)
titanic_reduced

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck,embark_town,alone,who_child
0,0,3,1,22.0,1,0,7.2500,0.0,7,2,0,0
1,1,1,0,38.0,1,0,71.2833,1.0,2,0,0,0
2,1,3,0,26.0,0,0,7.9250,0.0,7,2,1,0
3,1,1,0,35.0,1,0,53.1000,0.0,2,2,0,0
4,0,3,1,35.0,0,0,8.0500,0.0,7,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,0.0,7,2,1,0
887,1,1,0,19.0,0,0,30.0000,0.0,1,2,1,0
888,0,3,0,,1,2,23.4500,0.0,7,2,0,0
889,1,1,1,26.0,0,0,30.0000,1.0,2,0,1,0
