[jupyter notebook](correlation.ipynb)

# Correlation or association between the columns of a dataframe 

In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global generator so the synthetic data is reproducible.
np.random.seed( 15 )

# Every draw below consumes the same seeded stream, so the columns must be
# generated in the order X1, X2, X3, X4 to reproduce the data exactly.
nRows = 100

# Two numeric columns: integers sampled from 0..49.
numericDraws = { col: np.random.choice( 50, nRows ) for col in ( 'X1', 'X2' ) }

# Two categorical columns: labels sampled from three levels each.
labelDraws = { col: np.random.choice( levels, nRows )
               for col, levels in ( ( 'X3', [ 'a3' , 'b3' ,  'c3' ] ),
                                    ( 'X4', [ 'a4' , 'b4' ,  'c4' ] ) ) }

df = pd.DataFrame( { **numericDraws, **labelDraws } )

# Limit how many rows pandas renders when a frame is displayed.
pd.set_option( 'display.max_rows', 10 )
# Render the frame with IPython's rich display (available without an import in
# notebook kernels); display.max_rows = 10 truncates the view to head and tail rows.
display( df)

Unnamed: 0,X1,X2,X3,X4
0,8,18,c3,b4
1,12,39,b3,b4
2,5,28,a3,b4
3,0,26,b3,b4
4,28,18,a3,b4
...,...,...,...,...
95,20,40,a3,b4
96,46,43,c3,b4
97,38,10,b3,c4
98,35,6,b3,c4


### Correlation between two continuous columns (X1 and X2)

In [2]:
# Both coefficients are computed on the same pair of numeric columns.
numericPair = df[ [ 'X1', 'X2' ] ]

# Pearson measures linear association; Spearman is its rank-based counterpart.
pearsonCorr = numericPair.corr( method = 'pearson' )
spearmanCorr = numericPair.corr( method = 'spearman' )

# Print each correlation matrix under its banner; the empty string adds the
# blank line that separates the two reports.
print( '***** Pearson correlation *****\n', pearsonCorr, '', sep = '\n' )
print( '***** Spearman correlation *****\n', spearmanCorr, sep = '\n' )


***** Pearson correlation *****

          X1        X2
X1  1.000000 -0.141702
X2 -0.141702  1.000000

***** Spearman correlation *****

          X1        X2
X1  1.000000 -0.139973
X2 -0.139973  1.000000


### Association between two categorical columns (X3 and X4)

In [3]:
from scipy.stats import chi2_contingency

# Cross-tabulate the two categorical columns (rows: X3 levels, columns: X4 levels).
cTab = pd.crosstab( df['X3'], df['X4'] )
print( '***** Contingency table of columns X3 and X4 *****\n' )
print( cTab )

# Chi-square test of independence; H0: no association between X3 and X4.
# NOTE: scipy applies Yates' continuity correction by default when dof == 1
# (2x2 tables); it is not triggered for the 3x3 table built here.
chi2, pVal, dof, e = chi2_contingency( cTab )
print( '\n***** Chi-square test *****\n')
print( 'p-value = ', pVal )

# Reject H0 only when the p-value falls below the significance level.
sigLev = 0.05
verdict = ( 'There is association between X3 and X4'
            if pVal < sigLev
            else 'There is no association between X3 and X4' )
print( verdict )

print( '\n***** Cramer\'s V *****\n')
r, k = cTab.shape      # Number of rows and columns
n = cTab.sum().sum()   # Total number of observations

# Cramer's V = sqrt( (chi2 / n) / min(r - 1, k - 1) ): effect size of the association.
phi2 = chi2 / n
minDim = min( r - 1, k - 1 )
V = np.sqrt( phi2 / minDim )

print( 'Cramer\'s V = ', V )

***** Contingency table of columns X3 and X4 *****

X4  a4  b4  c4
X3            
a3   8  14  10
b3   8  14  12
c3  11  13  10

***** Chi-square test *****

p-value =  0.928165368015445
There is no association between X3 and X4

***** Cramer's V *****

Cramer's V =  0.06613157422490289


### Association between a continuous and a categorical column (X1 and X3)

In [4]:
# Kruskal-Wallis H-test between columns X1 and X3

from scipy.stats import kruskal

# Split the X1 values into one sample per X3 category.
grouped = df.groupby( by = 'X3' )[ 'X1' ]
values = [ list( sample ) for _, sample in grouped ]

# Kruskal-Wallis H-test: rank-based comparison of the X1 samples across the
# X3 groups; H0 is that X1 does not depend on the X3 group.
stat, pVal = kruskal( *values )

print( '\n***** Kruskal-Wallis H-test *****\n')
print( 'p-value = ', pVal )

# Reject H0 only when the p-value falls below the significance level.
sigLev = 0.05
verdict = ( 'There is correlation between X1 and X3'
            if pVal < sigLev
            else 'There is no correlation between X1 and X3' )
print( verdict )


***** Kruskal-Wallis H-test *****

p-value =  0.6648145720335333
There is no correlation between X1 and X3
