# Setup

In [None]:
# Setup cell: switch into the project checkout, import the pieces of Nexus used
# by this notebook, and set notebook-wide display options.
import os
# The project is expected to be checked out at ~/nexus_correlation_discovery.
home_dir=os.path.expanduser('~')
# The working directory is changed BEFORE the project-local imports below —
# presumably so that `utils`, `demo`, `corr_analysis` and `nexus_api` resolve
# from the project root (TODO confirm how the project is put on sys.path).
# Keep this statement ahead of those imports.
os.chdir(f"{home_dir}/nexus_correlation_discovery/")
from utils.time_point import TEMPORAL_GRANU
from utils.coordinate import SPATIAL_GRANU
from demo.demo_ui import show_df
from nexus_api import API
from utils.data_model import Variable
from sklearn import linear_model
import warnings
from corr_analysis.graph.graph_utils import filter_on_signals
from utils.io_utils import load_corrs_from_dir
from demo.cluster_utils import CorrCommunity
from demo.demo_ui import show_communities
import pickle

# Suppress all warnings for the rest of the session.
warnings. filterwarnings('ignore')

# Passed as `use_qgrid=` to every `show_df` / `show_communities` call in this notebook.
use_qgrid = False

# Create Nexus API

In [None]:
# Connection string for the database behind Nexus. It names a PostgreSQL
# database on localhost and a specific user, so adjust it to your own setup.
conn_str = "postgresql://yuegong@localhost/chicago_1m_zipcode"
# API object used by all the cells below.
nexus_api = API(conn_str)

## Browse Data Assets

You can use Nexus to browse the data catalog and individual datasets.

### Download output data products

Every dataframe displayed by Nexus is associated with a download button. After the button is clicked, the dataframe will be downloaded to the root directory of `nexus_correlation_discovery`. The name of the downloaded dataframe is the one you specified in the name field of `show_df`.

In [None]:
# show catalog
# Fetch the catalog through the API and display it; "catalog" is the name
# passed to `show_df` for this dataframe.
catalog = nexus_api.show_catalog()
show_df(catalog, name="catalog", use_qgrid=use_qgrid)

In [None]:
# show original dataset
# Id of the dataset to look up.
dataset_id = 'divg-mhqk'
# `show_raw_dataset` returns two values: the data to display and a link,
# which is printed below.
df, link=nexus_api.show_raw_dataset(id=dataset_id)
print(link)
# Display the dataset, using its id as the dataframe name.
show_df(df, name=dataset_id, use_qgrid=use_qgrid)

In [None]:
"""
Show aggregated dataset

For example, 4u6w-irs9_location_6 is an aggregated table,
which is created over original table 4u6w-irs9 by aggregating its spatial attribute `location` 
to the zipcode granularity (zipcode granularity is mapped to 6 in Nexus).
"""
# Name of the aggregated table: <dataset id>_<attribute>_<granularity code>.
agg_tbl_name = '4u6w-irs9_location_6'
# Fetch the aggregated table and display it under the same name.
df = nexus_api.show_agg_dataset(agg_tbl_name)
show_df(df, name=agg_tbl_name, use_qgrid=use_qgrid)

# Find correlations from an input table

## Inputs:
- `dataset`: dataset id
- `temporal_granularity`: temporal granularity
- `spatial_granularity`: spatial granularity
- `overlap_threshold`: overlap threshold for joinable detection
- `correlation_threshold`: correlation coefficient threshold
- `correlation_type`: correlation type: `pearson`, `spearman`, `kendall`
- `control_variables`: variables that you want to control for. When `control_variables` is specified, partial (conditional) correlations are calculated w.r.t. the control variables.

## Outputs:
A list of correlations; each correlation has the following attributes.

- `table_id`: table id, `table_name`: table name

- `agg_table`: the table name of the aggregated table. For example, 4u6w-irs9_location_6 is an aggregated table, which is created over original table 4u6w-irs9 by aggregating its spatial attribute `location` to the zipcode granularity (zipcode granularity is mapped to 6 in Nexus).

- `agg_attr`: the attribute after aggregation.

- `correlation coefficient` is the correlation coefficient.

- `p value` is the p value for the correlation

- `original_attribute_missing_ratio` is the fraction of missing values in the original attribute before any aggregation.

- `number of samples` is the number of rows used to calculate the correlation.

- `spatio-temporal key type` indicates whether this correlation was found by spatial alignment, temporal alignment, or both.


In this example, the input is the asthma dataset. We align the input with tables from Chicago open data and compute correlations. Tables from Chicago open data originally have the spatial granularity of geo-coordinate. We aggregate them to the zipcode level and apply the aggregate functions "avg" and "count". For example, if you see an attribute named "avg_basketball_courts", it means the original attribute is "basketball_courts" and the function "avg" is applied. The attribute after aggregation is named "avg_basketball_courts". In the displayed dataframe, you can sort on one dimension, filter rows using keywords, etc.

In [None]:
# Input table for the correlation search.
dataset = 'asthma'

# The asthma data carries only a spatial attribute, so the temporal
# granularity is ALL; the spatial granularity is the zipcode level.
temporal_granularity = TEMPORAL_GRANU.ALL
spatial_granularity = SPATIAL_GRANU.ZIPCODE

overlap_threshold = 5
correlation_threshold = 0.5

# correlation_type may also be 'spearman' or 'kendall'
df = nexus_api.find_correlations_from(
    dataset,
    temporal_granularity,
    spatial_granularity,
    overlap_threshold,
    correlation_threshold,
    correlation_type="pearson",
)
show_df(df, name='asthma_corrs', use_qgrid=use_qgrid)

## Control for variables

We got 227 correlations for the asthma dataset. After browsing several of them, we realize that "poverty" might be driving these correlations. Thus, we want to control for the income level of each zipcode when calculating correlations. To achieve that, users can specify the variables that they want to control for in the `control_variables` field.

In [None]:
# Same search as for the plain asthma correlations, but controlling for
# household income per zipcode.
dataset = 'asthma'
temporal_granularity = TEMPORAL_GRANU.ALL
spatial_granularity = SPATIAL_GRANU.ZIPCODE
overlap_threshold = 5
correlation_threshold = 0.5

# Variable to control for: (aggregated table name, aggregated attribute name).
control_variables = [
    Variable('chicago_income_by_zipcode_zipcode_6', 'avg_income_household_median'),
]

df_control = nexus_api.find_correlations_from(
    dataset,
    temporal_granularity,
    spatial_granularity,
    overlap_threshold,
    correlation_threshold,
    correlation_type="pearson",
    control_variables=control_variables,
)
show_df(df_control, name='asthma_corrs_control_income', use_qgrid=use_qgrid)

# Assemble a dataset from multiple variables

1. aligned, prov = get_joined_data_from_row(correlation): this API takes a correlation (one row of the correlation dataframe) as input and returns the merged dataset used to calculate this correlation.

2. df, prov = join_and_project(vars, constraints: [optional]): this API creates a dataset that merges all variables specified in `vars`. `constraints` is a mapping between a table name and the constraint on that table when performing the join operation. For example, {'tbl_A': 2} means spatio-temporal units with fewer than 2 samples are discarded.

Data assembly APIs return `prov`, which is the provenance information of the resulting dataset.


In [None]:
# Index label of the correlation (a row of `df`) whose underlying data we want to see.
row_idx = 0
# Retrieve the joined data behind that correlation together with its provenance.
# Use `row_idx` rather than a hard-coded 0 so that changing it actually takes effect.
aligned, prov = nexus_api.get_joined_data_from_row(df.loc[row_idx])
show_df(aligned, name="asthma_corrs_aligned", prov=prov, use_qgrid=use_qgrid)

In [None]:
# without constraint
# Each Variable is (aggregated table name, attribute name). The list is named
# `variables` rather than `vars` so the builtin `vars()` is not shadowed for
# the rest of the session.
variables = [Variable('divg-mhqk_location_6', 'count'), Variable('4u6w-irs9_location_6', 'avg_square_feet')]
# Merge the variables into one dataset; `prov` is its provenance information.
df, prov = nexus_api.join_and_project(variables)
show_df(df, name="divg-mhqk_4u6w-irs9", prov=prov, use_qgrid=use_qgrid)

In [None]:
# with constraint, units with number of samples smaller than 2 are dropped
# Each Variable is (aggregated table name, attribute name). The list is named
# `variables` rather than `vars` so the builtin `vars()` is not shadowed for
# the rest of the session.
variables = [Variable('divg-mhqk_location_6', 'count'), Variable('4u6w-irs9_location_6', 'avg_square_feet')]
# Per-table constraint, keyed by aggregated table name.
constraints = {'divg-mhqk_location_6': 2, '4u6w-irs9_location_6': 2}
# Merge the variables under the constraints; `prov` is the provenance information.
df, prov = nexus_api.join_and_project(variables, constraints)
show_df(df, name="divg-mhqk_4u6w-irs9_sample_greater_than_2", prov=prov, use_qgrid=use_qgrid)

# Regression Analysis

When you find multiple intriguing correlations and wish to conduct further regression analysis on variables of interest, you can begin by utilizing Nexus's `join_and_project` function to compile the necessary dataset. Subsequently, you may employ any data analysis library for regression analysis. In this instance, we will illustrate the process using `scikit-learn`.

In [None]:
# Variables of interest, each given as (aggregated table name, attribute name).
dependent_variable = Variable('asthma_Zip5_6', 'avg_enc_asthma')
independent_variables = [Variable('ijzp-q8t2_location_6', 'count'), Variable('n26f-ihde_pickup_centroid_location_6', 'avg_tip')]

# Assemble one dataset that contains the dependent and all independent variables.
data_to_analyze, provenance = nexus_api.join_and_project([dependent_variable] + independent_variables)

# apply any data analysis method
regression_model = linear_model.LinearRegression()  # OLS regression

# Fit on the dataset assembled above. Reading from `df` here would pick up
# whatever dataframe an earlier cell happened to leave behind.
x = data_to_analyze[[variable.attr_name for variable in independent_variables]]
y = data_to_analyze[dependent_variable.attr_name]
model = regression_model.fit(x, y)
# score() is evaluated on the training data itself, i.e. the in-sample R^2.
r_squared = model.score(x, y)

print("coefficients of each independent variables:", model.coef_)
print("r square score:", r_squared)

# Analyze Correlations

In this section, we will explain how to analyze output correlations in Nexus. We will use the correlations from chicago open data at the census tract and month granularity as an example.

In [None]:
# load correlations: corrs is a list of correlations; corr_map is map from correlated variables to their correlation coefficients
# NOTE(review): this is an absolute path under /home/cc, whereas the setup cell
# locates the project via the current user's home directory — adjust it to
# wherever the correlation files live on your machine.
corr_path = "/home/cc/nexus_correlation_discovery/evaluation/correlations2/chicago_1m_T_GRANU.MONTH_S_GRANU.TRACT/"
corrs, corr_map = load_corrs_from_dir(corr_path) 

## Use Nexus Variable Clusters

Nexus searches for an optimal set of signals that, when applied as filters, yield a correlation graph with the highest modularity score. The signals that we consider for chicago open data include:

- Missing value ratio in the aggregated column
- Missing value ratio in the original column
- Zero value ratio in the aggregated column
- Zero value ratio in the original column
- The absolute value of correlation coefficient
- Overlap: number of samples used to calculate the correlation

In chicago open data, the best set of thresholds for the above signals is [1.0, 1.0, 1.0, 0.8, 0.6, 70], which means we include correlations whose missing_ratio <= 1.0, missing_ratio_original <= 1.0, zero_ratio <= 1.0, zero_ratio_original <= 0.8, |r| >= 0.6, and |samples| >= 70.

You can play with different sets of thresholds as well!

In [None]:
# One threshold per signal, in this order: missing ratio (aggregated column),
# missing ratio (original column), zero ratio (aggregated column),
# zero ratio (original column), |correlation coefficient|, overlap (number of samples).
signal_thresholds = [1.0, 1.0, 1.0, 0.8, 0.6, 70]
corr_community = CorrCommunity(corrs, 'chicago')
# Build the correlation communities using these thresholds.
# NOTE(review): the return value is not used, so the result is presumably kept
# on `corr_community` itself — confirm against CorrCommunity.
corr_community.get_correlation_communities_chicago(signal_thresholds)

### Examine Correlation Communities

We implement a simple interface for you to explore our correlation communities. Each community is composed of a group of variables. By default, the display is set to only show the tables where these variables are found. To view the specific variables within a community, simply click the "Show Variables" button.

Clicking the "Show Correlations" button will reveal all the correlations within a community. Once displayed, you have the flexibility to apply any filters to the resulting dataframe.

FAQ:

Why do some communities display the exact same set of tables?

The reason is that while the tables might be the same, the variables within these communities differ. We construct the correlation graph based on variables, and then present it in a table-view for clarity.

In [None]:
# Open the community explorer for `corr_community`.
# NOTE(review): show_corr_in_same_tbl=False presumably leaves out correlations
# between variables of the same table — confirm against demo.demo_ui.
show_communities(corr_community, show_corr_in_same_tbl=False, use_qgrid=use_qgrid)

## Use Factor Analysis

Factor analysis aims to extract common factors from observed variables and represent existing variables using fewer factors. 

Factor analysis can take as input a correlation matrix. It derives factors that are essentially linear combinations of the observed variables. These factors are crafted to closely approximate the original correlation matrix when observed variables are projected onto them. 

In [None]:
# Correlations with values of 1 or -1 need to be removed to avoid a singular matrix.
corrs, corr_map = load_corrs_from_dir(corr_path, remove_perfect_corrs=True)
signals = [1.0, 1.0, 1.0, 0.8, 0.6, 70]  # we use the same signal thresholds as in the previous example
corrs_filtered = filter_on_signals(corrs, None, signals)

n_factors = 10  # set the number of factors to 10

# The following code fits a factor analysis model on the correlation matrix online.
# It takes 10 minutes to run; save_path indicates the path to save the factor analysis model (fa).
# fa, clusters = nexus_api.factor_analysis(corrs_filtered, corr_map, n_factors, save_path="chicago_open_data_factor_analysis.pkl")

# For the purpose of this demo, we load the factor analysis model from the file
# "chicago_open_data_factor_analysis.pkl" (a relative path, so it is resolved
# against the current working directory).
# NOTE: unpickling can execute arbitrary code; only load a file that you
# produced yourself or that comes from a source you trust.
# The `with` block closes the file handle once the model has been loaded.
with open("chicago_open_data_factor_analysis.pkl", "rb") as model_file:
    fa = pickle.load(model_file)

# Group variables into clusters based on the fitted factors, then display
# those clusters with the same community explorer used for Nexus clusters.
clusters, covered_vars = nexus_api.build_factor_clusters(fa, corrs_filtered, corr_map, n_factors, threshold=0.5)
corr_community = CorrCommunity(corrs_filtered, 'chicago', clusters)
show_communities(corr_community, show_corr_in_same_tbl=False, use_qgrid=use_qgrid)