In [1]:
import pandas as pd
from collections import defaultdict
import os
from nexus.utils import io_utils
# Point the nexus package at the test configuration.
# NOTE(review): nexus.utils is imported before this variable is set; confirm
# that the configuration is read lazily, otherwise move this above the import.
os.environ["CONFIG_FILE_PATH"] = "config_test.yaml"
# Relative paths used later (CSV, JSON and DB files) are resolved from the
# project root. NOTE(review): this absolute path is machine-specific.
os.chdir("/Users/yuegong/nexus_correlation_discovery")

In [2]:
# Load the correlations discovered for this data source.
datasource_name = 'data_commons_no_unionable'
all_correlations = pd.read_csv(f'{datasource_name}_correlations.csv')

# Every correlation row names two variables; each variable is identified by
# the triple (table id, aggregated table, aggregated attribute).
endpoint_columns = (
    ('table_id1', 'agg_table1', 'agg_attr1'),
    ('table_id2', 'agg_table2', 'agg_attr2'),
)

# Map each variable to the index labels of the rows it takes part in.
count_map = defaultdict(list)
for row_label, record in all_correlations.iterrows():
    for columns in endpoint_columns:
        variable_key = tuple(record[column] for column in columns)
        count_map[variable_key].append(row_label)

# Rank variables by how many correlations they appear in, most-connected
# first; the sort is stable, so ties keep their first-seen order.
sorted_vars = [
    var
    for var, row_labels in sorted(
        count_map.items(), key=lambda item: len(item[1]), reverse=True
    )
]
print(len(sorted_vars))

181


In [3]:
# Print the TOP_N most-connected variables as
# (table id, aggregated attribute, description) followed by the number of
# correlations the variable takes part in. The description is None when the
# lookup file has no entry for the attribute.
TOP_N = 20
var_desc = io_utils.load_json('resource/data_commons/variable_lookup.json')
for var in sorted_vars[:TOP_N]:
    table_id, agg_attr = var[0], var[2]
    # The lookup is keyed by the attribute name without its first four
    # characters -- presumably the 'avg_' aggregation prefix; TODO confirm
    # that no other aggregation prefix of a different length is in use.
    description = var_desc.get(agg_attr[4:])
    print((table_id, agg_attr, description), len(count_map[var]))

('00021_Social_Chicago', 'avg_ViolentRate1000', 'Violent rate, denom =  Tot_Population_ACS by ACS year span') 62
('00021_Social_Chicago', 'avg_CrimeRate1000', 'Crime rate, denom =  Tot_Population_ACS by ACS year span') 58
('00071_Health_Chicago', 'avg_TEETHLOST_CrudePrev', None) 55
('00091_Indices_Chicago', 'avg_CRCI_sd', 'FEMA Community Resilience Challenges Index (CRCI) - Std.Dev.') 55
('00041_Behavioral_psychological_Chicago', 'avg_SLEEP', 'Sleeping less than 7 hours among adults aged >=18 years') 55
('00021_Social_Chicago', 'avg_Violent', 'Count of all violent crime') 55
('00061_Housing_Chicago', 'avg_hardship', '(no definition) Hardship index') 54
('00071_Health_Chicago', 'avg_STROKE_CrudePrev', None) 54
('00061_Housing_Chicago', 'avg_BldgViolRate1000', 'Building violation rate per 1,000 houses. Denominator is "Tot_Housing_Units_ACS"') 54
('00031_Environment_pollution_Chicago', 'avg_nn_q3_pm2_5', 'Average PM 2.5 estimates during summer (Jun-Aug) from 2014-2018, calculated using a 

In [4]:
# Prepare the inputs for controlling on the single most-connected variable.
from nexus.utils.data_model import Variable
from nexus.data_search.search_corr import Correlation

threshold = 10

# A ranked variable is (table id, aggregated table, aggregated attribute);
# the control variable is built from the last two components.
variable = sorted_vars[0]
agg_table, agg_attr = variable[1], variable[2]
control_var = Variable(agg_table, agg_attr, var_name=agg_attr)

# Pull the rows this variable takes part in and turn each one into a
# Correlation object.
cur_corrs = all_correlations.loc[count_map[variable]]
correlations = [
    Correlation.from_csv(record) for _label, record in cur_corrs.iterrows()
]

In [5]:
from nexus.nexus_api import API

# Open the Nexus API on the database file that belongs to this data source.
datasource_name = 'data_commons_no_unionable'
data_sources = [datasource_name]
conn_str = f'data/{datasource_name}.db'
nexus_api = API(conn_str, data_sources=data_sources)

# Control for the selected variable across its correlations and report how
# many rows come back.
print(control_var.attr_name)
res = nexus_api.control_variables_for_correlaions(
    [control_var],
    correlations,
)
print(len(res))



avg_ViolentRate1000
1511


In [18]:
# Partial-correlation study.
#
# For every variable that takes part in more than MIN_CORRELATION_COUNT
# correlations, control for it and compare each correlation between its
# neighbours before and after controlling. One summary line is printed per
# candidate and the per-pair details are written to
# '<datasource_name>_control.json'.
MIN_CORRELATION_COUNT = 50
SIGNIFICANCE_LEVEL = 0.05   # p values above this are counted as not significant
STRONG_CORRELATION = 0.5    # |coefficient| at or above this counts as "beyond"


def _endpoints(row):
    """Return the two (agg_table, agg_attr) variables named by a correlation row."""
    return (
        (row['agg_table1'], row['agg_attr1']),
        (row['agg_table2'], row['agg_attr2']),
    )


def _pair_key(var1, var2):
    """Return an order-independent string key for a pair of variables."""
    return str(tuple(sorted([var1, var2])))


# NOTE(review): count_map holds index labels of the frame loaded earlier in the
# notebook; this re-read is assumed to produce the same labels.
all_correlations = pd.read_csv(f'{datasource_name}_correlations.csv')
candidates = [x for x in sorted_vars if len(count_map[x]) > MIN_CORRELATION_COUNT]

# Walk the full correlation table once. `baseline` caches what the candidate
# loop needs so the table is not re-scanned with iterrows() per candidate.
changes = {}
baseline = []  # (var1, var2, pair key, original coefficient)
for row_label, row in all_correlations.iterrows():
    var1, var2 = _endpoints(row)
    key = _pair_key(var1, var2)
    coef = row['correlation coefficient']
    baseline.append((var1, var2, key, coef))
    changes[key] = {"correlation coef": coef, "partial correlation": {}}

for candidate in candidates:
    control_var = Variable(candidate[1], candidate[2], var_name=candidate[2])
    control_id = (control_var.tbl_id, control_var.var_name)
    candidate_key = str(candidate)

    # Correlations involving the candidate, and the set of variables on the
    # other side of them (the candidate's neighbours).
    cur_corrs = all_correlations.loc[count_map[candidate]]
    correlations = []
    all_vars = set()
    for row_label, row in cur_corrs.iterrows():
        correlations.append(Correlation.from_csv(row))
        all_vars.update(v for v in _endpoints(row) if v != control_id)

    res = nexus_api.control_variables_for_correlaions([control_var], correlations)

    # comparison[key] is [before] or [before, after, p value]. Only the first
    # value of each kind is kept, so the positions below are always valid even
    # if a pair shows up more than once in either table.
    comparison = defaultdict(list)
    for var1, var2, key, coef in baseline:
        if var1 in all_vars and var2 in all_vars and key not in comparison:
            comparison[key].append(coef)

    for row_label, row in res.iterrows():
        key = _pair_key(*_endpoints(row))
        partial_coef = row['correlation coefficient']
        p_value = row['p value']
        if key in comparison and len(comparison[key]) == 1:
            comparison[key].extend([partial_coef, p_value])
        if key not in changes:
            continue
        # Entries are keyed by str(candidate) so the result is JSON-serialisable.
        changes[key]["partial correlation"][candidate_key] = {
            "correlation coef": partial_coef,
            "p value": p_value,
            "delta": changes[key]["correlation coef"] - partial_coef,
        }

    decrease_cnt = 0
    increase_cnt = 0
    no_value = 0
    not_significant = 0
    beyond_cnt = 0
    for v in comparison.values():
        if len(v) == 1:
            # No partial correlation was returned for this pair.
            no_value += 1
            continue
        before_control, after_control, p_value = v
        if p_value > SIGNIFICANCE_LEVEL:
            # Counted, but deliberately still included in the tallies below.
            not_significant += 1
        if abs(before_control) > abs(after_control):
            decrease_cnt += 1
        else:
            increase_cnt += 1
        if abs(after_control) >= STRONG_CORRELATION:
            beyond_cnt += 1
    print(candidate, len(comparison), no_value, not_significant, decrease_cnt, increase_cnt, beyond_cnt)

io_utils.dump_json(f"{datasource_name}_control.json", changes)

('00021_Social_Chicago', '00021_Social_Chicago_GEOID10_3', 'avg_ViolentRate1000') 726 1 20 725 0 291
('00021_Social_Chicago', '00021_Social_Chicago_GEOID10_3', 'avg_CrimeRate1000') 705 0 2 705 0 320
('00071_Health_Chicago', '00071_Health_Chicago_GEOID10_3', 'avg_TEETHLOST_CrudePrev') 536 0 36 536 0 128
('00091_Indices_Chicago', '00091_Indices_Chicago_GEOID10_3', 'avg_CRCI_sd') 544 0 8 544 0 87
('00041_Behavioral_psychological_Chicago', '00041_Behavioral_psychological_Chicago_GEOID10_3', 'avg_SLEEP') 441 0 26 440 1 77
('00021_Social_Chicago', '00021_Social_Chicago_GEOID10_3', 'avg_Violent') 650 0 1 650 0 327
('00061_Housing_Chicago', '00061_Housing_Chicago_GEOID10_3', 'avg_hardship') 589 3 51 586 0 106
('00071_Health_Chicago', '00071_Health_Chicago_GEOID10_3', 'avg_STROKE_CrudePrev') 523 0 5 523 0 136
('00061_Housing_Chicago', '00061_Housing_Chicago_GEOID10_3', 'avg_BldgViolRate1000') 601 0 16 601 0 144
('00031_Environment_pollution_Chicago', '00031_Environment_pollution_Chicago_GEOID10