# Analyse Correlations in the Dataset
## **A Notebook for Finding Correlations Across Demographic Categories and Income/Wage**

In [1]:
import pandas as pd
import seaborn as sns

In [2]:
correlational_df = pd.read_csv('preprocessed_data/correlational_analysis_data.csv')
correlational_df.drop(columns='Unnamed: 0', inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305121 entries, 0 to 305120
Data columns (total 20 columns):
DIVISION    305121 non-null object
AGEP        305121 non-null object
CIT         305121 non-null object
COW         180138 non-null object
DEAR        305121 non-null object
DEYE        305121 non-null object
FER         66094 non-null object
JWMNP       131524 non-null object
MAR         305121 non-null object
SCHL        305121 non-null object
SEX         305121 non-null object
WAGP        305121 non-null float64
WKHP        305121 non-null float64
WKW         305121 non-null object
DIS         305121 non-null object
INDP        180138 non-null object
OC          299166 non-null object
PINCP       305121 non-null float64
RAC1P       305121 non-null object
WAOB        305121 non-null object
dtypes: float64(3), object(17)
memory usage: 46.6+ MB


In [10]:
regression_df = pd.read_csv('preprocessed_data/regression_data.csv')
regression_df.drop(columns='Unnamed: 0', inplace=True)

0         52
1         49
2         60
3         37
4         54
          ..
305116    19
305117    89
305118    58
305119    26
305120    68
Name: AGEP, Length: 305121, dtype: int64

In [7]:
correlation_matrix = regression_df.corr()
# sns.heatmap(correlation_matrix, xticklabels=correlation_matrix.columns, yticklabels=correlation_matrix.columns, cmap='RdBu_r',annot=True,linewidth=0)
#Correlation with output variable
cor_target = abs(correlation_matrix["PINCP"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.25]
relevant_features

AGEP                           0.259407
JWMNP                          0.288945
WAGP                           0.860196
WKHP                           0.442525
PINCP                          1.000000
COW[N/A]                       0.327048
MAR[Married]                   0.284411
MAR[Never Married/Under 15]    0.303162
SCHL[No HS Diploma]            0.280376
WKW[50 to 52 Weeks]            0.402492
WKW[No Work/Under 16 Years]    0.360737
INDP[N/A]                      0.327048
OC[Has Children]               0.269165
OC[No Children]                0.279869
Name: PINCP, dtype: float64

In [21]:
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

def oneway_ANOVA(regression_df):
    AGEP = regression_df['AGEP']
    JWMNP = regression_df['JWMNP']
    PINCP = regression_df['PINCP']
    results = stats.f_oneway(AGEP,JWMNP,PINCP)
    fstatistic = results[0]
    pvalue = results[1]
    df1 = 3
    df2 = (len(AGEP) - 1) + (len(JWMNP) - 1) + (len(PINCP) - 1)
    levene_results = stats.levene(AGEP,JWMNP,PINCP)
    shapiro_AGEP = stats.shapiro(AGEP)
    shapiro_JWMNP = stats.shapiro(JWMNP)
    shapiro_PINCP = stats.shapiro(PINCP)
    mc = MultiComparison([regression_df['PINCP'],regression_df['JWMNP'],regression_df['AGEP']], [AGEP,JWMNP,PINCP])
    tukey_result = mc.tukeyhsd()
    return df1,df2,format(pvalue, '.5f'),format(fstatistic,'.2f'),levene_results,shapiro_AGEP,shapiro_JWMNP,shapiro_PINCP,print(tukey_result)

In [22]:
oneway_ANOVA(regression_df)

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 3 and the array at index 1 has size 915363

U might wanna try out an 18-way ANOVA if u want, that could save time
Then you only have to go do 1-way ANOVAS on the catégories that were significant from that 18-way test