In [1]:
from nexus.nexus_api import API
from nexus.utils.time_point import TEMPORAL_GRANU
from nexus.utils.coordinate import SPATIAL_GRANU
import os
import pandas as pd
import duckdb
from IPython.display import display, HTML
pd.options.display.max_colwidth = 200
os.environ["CONFIG_FILE_PATH"] = "config_test.yaml" 
os.chdir(f"/Users/yuegong/nexus_correlation_discovery")

datasource_name = 'data_commons_no_unionable' # only 00011_acs1019_chicago.csv is kept
# datasource_name = 'data_commons' # this version has unionable tables

# Add the Data Source

In [2]:
API.add_data_source(data_source_name=datasource_name, 
                    data_path='/Users/yuegong/nexus_correlation_discovery/data/Data Commons/cureated deduplicate/')

# Load Data to a database

In [3]:
data_sources = [datasource_name]
conn_str = f'data/{datasource_name}.db'
temporal_granu_l = []
spatial_granu_l = [SPATIAL_GRANU.TRACT]
nexus_api = API(conn_str, data_sources=[datasource_name])
from nexus.data_ingestion.data_profiler import Profiler
profiler = Profiler(db_engine=nexus_api.db_engine, data_source=data_sources[0], mode='no_cross')
print("begin collecting agg stats")
profiler.collect_agg_tbl_col_stats(temporal_granu_l, spatial_granu_l)
print("begin profiling original data")
profiler.profile_original_data()
# API.ingest_data(conn_str=conn_str, engine='duckdb', data_sources=data_sources,
#                 temporal_granu_l=temporal_granu_l, spatial_granu_l=spatial_granu_l)

begin collecting agg stats


100%|██████████| 14/14 [00:00<00:00, 86.40it/s]


begin profiling original data


100%|██████████| 14/14 [00:01<00:00, 11.96it/s]


# Initiate Nexus API and Begin to Query Correlations 

In [4]:
conn_str = f'data/{datasource_name}.db'
nexus_api = API(conn_str, data_sources=[datasource_name])

# Find correlations from a single table

In [5]:
# test find correlations from a single table
dataset = '00021_Social_Chicago'
# asthma data only has spatial attribute, thus the temporal granularity is set to ALL.
temporal_granularity, spatial_granularity = TEMPORAL_GRANU.ALL, SPATIAL_GRANU.TRACT
overlap_threshold = 100
correlation_threshold = 0.5
# you can change correlation_type to 'spearman' or 'kendall'
correlations = nexus_api.find_correlations_from(dataset, temporal_granularity, spatial_granularity, 
                                      overlap_threshold, correlation_threshold, 
                                      correlation_type="pearson")
display(correlations.head())
correlations.to_csv(f"{dataset}_correlations.csv", index=False)

total number of correlations: 389


Unnamed: 0,table_id1,table_name1,agg_table1,agg_attr1,description1,original_attr1_missing_ratio,table_id2,table_name2,agg_table2,agg_attr2,description2,original_attr2_missing_ratio,correlation coefficient,p value,number of samples,spatio-temporal key type
0,00021_Social_Chicago,00021_Social_Chicago,00021_Social_Chicago_GEOID10_3,avg_SDOH_CL,,0.0,00061_Housing_Chicago,00061_Housing_Chicago,00061_Housing_Chicago_GEOID10_3,avg_OwnerOccPct,,0.0,-0.516,0.0,1362,spatial
1,00021_Social_Chicago,00021_Social_Chicago,00021_Social_Chicago_GEOID10_3,avg_SDOH_CL,,0.0,00061_Housing_Chicago,00061_Housing_Chicago,00061_Housing_Chicago_GEOID10_3,avg_mode_Neighborhood Code,,0.0,0.518,0.0,1362,spatial
2,00021_Social_Chicago,00021_Social_Chicago,00021_Social_Chicago_GEOID10_3,avg_SDOH_CL,,0.0,00061_Housing_Chicago,00061_Housing_Chicago,00061_Housing_Chicago_GEOID10_3,avg_mode_Township Code,,0.0,0.519,0.0,1362,spatial
3,00021_Social_Chicago,00021_Social_Chicago,00021_Social_Chicago_GEOID10_3,avg_SDOH_CL,,0.0,00061_Housing_Chicago,00061_Housing_Chicago,00061_Housing_Chicago_GEOID10_3,avg_pop_density_sqmi,,0.0,0.505,0.0,1362,spatial
4,00021_Social_Chicago,00021_Social_Chicago,00021_Social_Chicago_GEOID10_3,avg_SDOH_CL,,0.0,00061_Housing_Chicago,00061_Housing_Chicago,00061_Housing_Chicago_GEOID10_3,avg_avg_Tax Rate,,0.0,0.512,0.0,1362,spatial


# Find all correlations within the data collection

In [10]:
from nexus.data_search.commons import FIND_JOIN_METHOD

nexus_api.data_sources = [datasource_name]
temporal_granularity, spatial_granularity = None, SPATIAL_GRANU.TRACT
overlap_threshold = 100
correlation_threshold = 0.3
persist_path = 'tmp/test/'
all_correlations = nexus_api.find_all_correlations(temporal_granularity, spatial_granularity,
                                        overlap_threshold, correlation_threshold,
                                        persist_path=persist_path, correlation_type="pearson",
                                        find_join_method=FIND_JOIN_METHOD.JOIN_ALL)
all_correlations.to_csv(f'{datasource_name}_correlations_above_0.3.csv', index=False)

  0%|          | 0/13 [00:00<?, ?it/s]

skip because this table does not have enough keys


100%|██████████| 13/13 [00:03<00:00,  4.25it/s]


total number of correlations: 5705


In [7]:
all_correlations = pd.read_csv('data_commons_correlations.csv')
all_correlations['agg_attr1'] = all_correlations['agg_attr1'].str[4:]
all_correlations['agg_attr2'] = all_correlations['agg_attr2'].str[4:]
all_correlations.rename(columns={'agg_attr1': 'attribute1',
                   'agg_attr2': 'attribute2'},
          inplace=True)
all_correlations.to_csv(f'{datasource_name}_correlations_04_15.csv', index=False)

# Correlation Clustering Analysis

## Generate Clusters

In [8]:
from demo.cluster_utils import CorrCommunity

corr_community = CorrCommunity(all_correlations)
corr_community.get_correlation_communities()
clusters = corr_community.all_communities
print("number of clusters:", len(clusters))

KeyError: 'agg_attr1'

## Browse Clusters

In [None]:
cluster_id = 14
cluster = clusters[f"Cluster {cluster_id}"]
print("=============== Tables ===============")
for table in cluster.keys():
    print(table)
print("=============== Variables ===============")
for table, variables in cluster.items():
    print(f"{table}")
    for var in variables:
        print(f" - {var}")
print("=============== Correlations ===============")
res = corr_community.get_corr_in_cluster_i(
                cluster_id, show_corr_in_same_tbl=False
            )
display(f"Cluster {cluster_id} has {len(res)} correlations")
display(res)

00051_Business_economy_transportation_Chicago
00031_Environment_pollution_Chicago
00071_Health_Chicago
00051_Business_economy_transportation_Chicago
 - avg_EnergyCount
 - avg_ETOHLicCount
 - avg_BldgPermitsN
 - avg_MfgLicCount
 - avg_BusLicCount
00031_Environment_pollution_Chicago
 - avg_asbestosN
 - avg_HazLicCount
 - avg_CANCER
 - avg_RSEICount
 - avg_comp_water
 - avg_MfgCount
 - avg_HazOver80CountUs
 - avg_comp_air
 - avg_comp_asbestos
 - avg_HazardCount
 - avg_comp_recycling
00071_Health_Chicago
 - avg_RxCount


'Cluster 14 has 27 correlations'

Unnamed: 0,table_name1,agg_attr1,description1,table_name2,agg_attr2,description2,correlation coefficient,number of samples,spatio-temporal key type
2,00051_Business_economy_transportation_Chicago,avg_ETOHLicCount,"Business licenses: Alcohol, tavern",00031_Environment_pollution_Chicago,avg_HazardCount,Hazardous waste count,0.542,801,spatial
4,00051_Business_economy_transportation_Chicago,avg_ETOHLicCount,"Business licenses: Alcohol, tavern",00031_Environment_pollution_Chicago,avg_CANCER,Air toxics cancer risk,0.512,801,spatial
5,00051_Business_economy_transportation_Chicago,avg_ETOHLicCount,"Business licenses: Alcohol, tavern",00031_Environment_pollution_Chicago,avg_comp_asbestos,Chicago environmental complaints: asbestos,0.503,801,spatial
6,00051_Business_economy_transportation_Chicago,avg_BldgPermitsN,Building permits,00031_Environment_pollution_Chicago,avg_asbestosN,CDPH asbestos and demolition notifications: count by tract,0.801,801,spatial
7,00051_Business_economy_transportation_Chicago,avg_BldgPermitsN,Building permits,00031_Environment_pollution_Chicago,avg_comp_asbestos,Chicago environmental complaints: asbestos,0.76,801,spatial
8,00051_Business_economy_transportation_Chicago,avg_BldgPermitsN,Building permits,00031_Environment_pollution_Chicago,avg_HazardCount,Hazardous waste count,0.742,801,spatial
10,00051_Business_economy_transportation_Chicago,avg_BldgPermitsN,Building permits,00031_Environment_pollution_Chicago,avg_comp_air,Chicago environmental complaints: air pollution,0.575,801,spatial
14,00051_Business_economy_transportation_Chicago,avg_BldgPermitsN,Building permits,00071_Health_Chicago,avg_RxCount,,0.52,801,spatial
15,00051_Business_economy_transportation_Chicago,avg_BldgPermitsN,Building permits,00031_Environment_pollution_Chicago,avg_CANCER,Air toxics cancer risk,0.502,801,spatial
16,00051_Business_economy_transportation_Chicago,avg_EnergyCount,Energy benchmarking,00031_Environment_pollution_Chicago,avg_asbestosN,CDPH asbestos and demolition notifications: count by tract,0.604,801,spatial
