# Import libraries

In [1]:
# python 3.7.4
import os, errno
import numpy as np # 1.17.2
import pandas as pd # 0.25.1

# Import data and categorize features

In [7]:
# Minimum absolute correlation coefficient a feature needs to be reported
THRES = 0.3
# Number of top-ranked features to keep per output
TOP = 10
# Output (target) columns
OUTPUTS = ['TA4', 'TA5', 'KNO6', 'PEERW', 'PEERC', 'TASKM', 'TASKG']

# Create the directories for the output files. exist_ok=True keeps the cell
# re-runnable without hand-checking errno for "already exists".
for _out_dir in ('results', 'results_group'):
    os.makedirs(_out_dir, exist_ok=True)

# Load data and make a copy so the raw frame stays available as `df`
df = pd.read_csv('survey.csv')
df_copy = df.copy()

# Factorize the categorical (non-numeric) features only.
# Numeric columns are left as they are: replacing their values with
# order-of-appearance codes would discard the scale that the Pearson
# correlations computed later depend on.
for _col in df_copy.select_dtypes(exclude=['number']).columns:
    _codes = pd.factorize(df_copy[_col])[0]
    # factorize marks missing values with -1; turn that sentinel back into NaN
    # so a missing answer is not treated as a category of its own.
    df_copy[_col] = pd.Series(_codes, index=df_copy.index).where(_codes >= 0)

# Drop column ID
df_copy = df_copy.drop(columns=['ID'])

# Correlation coefficient between each feature and all outputs

In [3]:
# Calculate the correlation coefficient between each feature and each output.
# The pairwise Pearson matrix does not depend on which output is being examined,
# so it is computed once instead of once per output.
corr_matrix = df_copy.corr(method='pearson', min_periods=1)

for label in OUTPUTS:
    # Remove the output's correlation with itself. errors='ignore' keeps this
    # working even if the label is absent from the index.
    corr = corr_matrix[label].drop(labels=[label], errors='ignore')
    # Drop features whose absolute correlation coefficient is below THRES
    # (NaN coefficients fail the comparison and are dropped as well)
    corr = corr[corr.abs() >= THRES]
    # Keep the TOP features ranked by absolute correlation, strongest first.
    # Ranking on the absolute value keeps strong negative correlations, which a
    # signed "largest" selection would discard first.
    strongest = corr.abs().sort_values(ascending=False).index[:TOP]
    corr_sorted = corr.reindex(strongest)
    # Save to csv file
    pd.DataFrame(corr_sorted).to_csv(path_or_buf='./results/'+label+'.csv')

# Correlation coefficient between each feature group and all outputs (non-linear regression model)

In [6]:
# Feature-group name prefixes. non_linear_reg selects a group's columns by
# matching the prefix followed by one or more digits (e.g. 'GP' -> GP1, GP2, ...).
GROUPS = ['GP', 'TA', 'AGE', 'GEN', 'RACE', 'ETH', 'EDU', 'TIE', 'KNO', 'BFX', 'WEIP', 'PEER', 'TASK']

def non_linear_reg(group, output, threshold=THRES):
    """Correlate a feature group's linear and second-order terms with one output.

    Selects the columns of ``df_copy`` whose name contains ``group`` followed by
    digits, removes any column listed in ``OUTPUTS``, and adds one product column
    ``'A*B'`` for every unordered pair of the remaining columns (including each
    column with itself, i.e. the squared term). The Pearson correlation between
    every resulting column and ``output`` is then printed and written to
    ``./results_group/<group>_<output>_corr.csv``.

    Despite the name, no regression model is fitted here; only correlation
    coefficients are computed.

    Parameters
    ----------
    group : str
        Column-name prefix identifying the feature group; used as a regex fragment.
    output : str
        Name of the output column in ``df_copy`` to correlate against.
    threshold : float, optional
        Currently unused. Kept so existing calls that pass it keep working.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with one column per feature / product term plus the
        output column itself (whose self-correlation is included, as in the CSV).
    """
    # Every column of df_copy except the last one is a candidate feature.
    # NOTE(review): the last column is excluded without a stated reason —
    # presumably it is not a feature; confirm against survey.csv.
    candidates = df_copy.iloc[:, :-1]

    # Select features from the same group and drop any column that is an output
    features = candidates.filter(regex=group+'[0-9]+')
    features = features.drop(columns=[col for col in OUTPUTS if col in features.columns])

    # Build the x1^2 and x1*x2 terms: each column paired with itself and with
    # every column to its right, so each unordered pair appears exactly once.
    names = list(features.columns)
    products = {}
    for pos, left in enumerate(names):
        for right in names[pos:]:
            products[str(left)+'*' +str(right)] = features[left] * features[right]

    # Attach all product columns in one go (avoids growing the frame column by
    # column), then append the output column.
    terms = pd.concat([features, pd.DataFrame(products, index=features.index)], axis=1)
    terms = terms.join(df_copy[output])

    # Correlation of every column (the output itself included) with the output
    target = terms[output]
    coefficients = [column.corr(target) for _, column in terms.items()]

    # Build the single-row correlation dataframe
    corr_df = pd.DataFrame([coefficients], columns=list(terms.columns))
    print("The correlation dataframe between group " + group + ' and output ' + output)
    print(corr_df)

    corr_df.to_csv('./results_group/'+group+'_'+output+'_corr.csv')
    return corr_df

In [8]:
# Run the group-level correlation analysis for every (group, output) pair,
# visiting all outputs of one group before moving on to the next group.
for group, out in [(g, o) for g in GROUPS for o in OUTPUTS]:
    non_linear_reg(group, out)

The correlation dataframe between group GP and output TA4
        GP1       GP2       GP3       GP4       GP5       GP6       GP7  \
0  0.415516  0.415516  0.164448  0.495126  0.348865  0.297683 -0.052382   

        GP8   GP1*GP1   GP1*GP2  ...   GP5*GP6   GP5*GP7   GP5*GP8   GP6*GP6  \
0  0.091287  0.451511  0.451511  ...  0.304692  0.189948  0.299943  0.409091   

    GP6*GP7   GP6*GP8   GP7*GP7   GP7*GP8   GP8*GP8  TA4  
0  0.241475  0.071495 -0.096825  0.012783  0.091287  1.0  

[1 rows x 45 columns]
The correlation dataframe between group GP and output TA5
        GP1       GP2      GP3       GP4       GP5       GP6      GP7  \
0  0.084921  0.084921  0.10008  0.070261 -0.016024  0.179277  0.62695   

        GP8   GP1*GP1   GP1*GP2  ...   GP5*GP6   GP5*GP7   GP5*GP8   GP6*GP6  \
0 -0.111111  0.108226  0.108226  ...  0.161324  0.191059 -0.190476  0.148605   

    GP6*GP7   GP6*GP8   GP7*GP7   GP7*GP8   GP8*GP8  TA5  
0  0.390735 -0.032855  0.608898  0.054455 -0.111111  1.0  

[1 r

The correlation dataframe between group AGE and output TASKM
      AGE1      AGE2      AGE3  AGE1*AGE1  AGE1*AGE2  AGE1*AGE3  AGE2*AGE2  \
0  0.33142 -0.107622  0.191293   0.171756   0.044765   0.159737  -0.108777   

   AGE2*AGE3  AGE3*AGE3  TASKM  
0   0.079157   0.125989    1.0  
The correlation dataframe between group AGE and output TASKG
       AGE1      AGE2      AGE3  AGE1*AGE1  AGE1*AGE2  AGE1*AGE3  AGE2*AGE2  \
0  0.242756  0.077582  0.038723    0.27951   0.257474   0.150879  -0.008659   

   AGE2*AGE3  AGE3*AGE3  TASKG  
0   0.098852   -0.03375    1.0  
The correlation dataframe between group GEN and output TA4
       GEN1      GEN2      GEN3  GEN1*GEN1  GEN1*GEN2  GEN1*GEN3  GEN2*GEN2  \
0 -0.242241 -0.283311  0.058202  -0.242241  -0.255699  -0.063558  -0.257922   

   GEN2*GEN3  GEN3*GEN3  TA4  
0  -0.057378   0.073699  1.0  
The correlation dataframe between group GEN and output TA5
       GEN1      GEN2      GEN3  GEN1*GEN1  GEN1*GEN2  GEN1*GEN3  GEN2*GEN2  \
0  0.272166 

The correlation dataframe between group EDU and output PEERC
      EDU4      EDU1      EDU2      EDU3  EDU4*EDU4  EDU4*EDU1  EDU4*EDU2  \
0  0.49162  0.397298  0.340769  0.320774   0.552315   0.590671   0.381561   

   EDU4*EDU3  EDU1*EDU1  EDU1*EDU2  EDU1*EDU3  EDU2*EDU2  EDU2*EDU3  \
0   0.400839   0.320454   0.473641   0.453177   0.309378   0.205445   

   EDU3*EDU3  PEERC  
0   0.261672    1.0  
The correlation dataframe between group EDU and output TASKM
       EDU4     EDU1    EDU2      EDU3  EDU4*EDU4  EDU4*EDU1  EDU4*EDU2  \
0  0.089642  0.09616  0.2473  0.431871   0.093593   0.005598   0.117443   

   EDU4*EDU3  EDU1*EDU1  EDU1*EDU2  EDU1*EDU3  EDU2*EDU2  EDU2*EDU3  \
0    0.22275    0.00832   0.064063   0.188344   0.235681   0.375877   

   EDU3*EDU3  TASKM  
0   0.398222    1.0  
The correlation dataframe between group EDU and output TASKG
       EDU4      EDU1      EDU2      EDU3  EDU4*EDU4  EDU4*EDU1  EDU4*EDU2  \
0  0.077876 -0.215506 -0.052567  0.486036   0.046814  -0.08

The correlation dataframe between group BFX and output TA5
       BFX1      BFX2      BFX3      BFX4      BFX5      BFX6     BFX7  \
0 -0.046269 -0.263447  0.194444  0.053682  0.063101 -0.107811  0.25963   

       BFX8      BFX9     BFX10  ...  BFX12*BFX13  BFX12*BFX14  BFX12*BFX15  \
0  0.187831 -0.019212  0.122632  ...    -0.021491    -0.056125    -0.112538   

   BFX13*BFX13  BFX13*BFX14  BFX13*BFX15  BFX14*BFX14  BFX14*BFX15  \
0     0.050142    -0.040206    -0.093886    -0.093538    -0.230055   

   BFX15*BFX15  TA5  
0    -0.137555  1.0  

[1 rows x 136 columns]
The correlation dataframe between group BFX and output KNO6
       BFX1      BFX2      BFX3      BFX4      BFX5      BFX6      BFX7  \
0  0.380319  0.304585  0.263863  0.183212  0.229414  0.461362 -0.015452   

       BFX8      BFX9     BFX10  ...  BFX12*BFX13  BFX12*BFX14  BFX12*BFX15  \
0  0.348141  0.236402 -0.095567  ...    -0.047614     0.199019     0.120116   

   BFX13*BFX13  BFX13*BFX14  BFX13*BFX15  BFX14*BFX14 

     PEER1     PEER2     PEER3     PEER4     PEER5     PEER6  PEER1*PEER1  \
0 -0.25631  0.117609 -0.015074  0.016723  0.100017  0.282478    -0.281684   

   PEER1*PEER2  PEER1*PEER3  PEER1*PEER4  ...  PEER3*PEER4  PEER3*PEER5  \
0    -0.130558    -0.265694    -0.197242  ...    -0.149467    -0.068656   

   PEER3*PEER6  PEER4*PEER4  PEER4*PEER5  PEER4*PEER6  PEER5*PEER5  \
0     0.018519    -0.014036     -0.08055     0.022111    -0.008712   

   PEER5*PEER6  PEER6*PEER6  TA5  
0     0.127294     0.248581  1.0  

[1 rows x 28 columns]
The correlation dataframe between group PEER and output KNO6
      PEER1     PEER2     PEER3     PEER4     PEER5     PEER6  PEER1*PEER1  \
0  0.253969  0.091837 -0.049766  0.305428  0.396036 -0.050147     0.240185   

   PEER1*PEER2  PEER1*PEER3  PEER1*PEER4  ...  PEER3*PEER4  PEER3*PEER5  \
0     0.447004    -0.107725     0.205641  ...     0.240523     0.146999   

   PEER3*PEER6  PEER4*PEER4  PEER4*PEER5  PEER4*PEER6  PEER5*PEER5  \
0    -0.180199     0.