In [1]:
import os
import dtale
# Switch the working directory to the repository checkout under the user's home
# directory; the relative database path used later in the notebook is resolved
# against it.
# NOTE(review): the project-local `demo` / `nexus` imports below presumably rely
# on this chdir having run first (cwd on the import path) — confirm before
# hoisting them above it.
home_dir=os.path.expanduser('~')
os.chdir(f"{home_dir}/nexus_correlation_discovery/")
from demo import nexus_demo
from nexus.utils.time_point import TEMPORAL_GRANU
from nexus.utils.coordinate import SPATIAL_GRANU
from nexus.nexus_api import API
from nexus.utils.data_model import Variable
from demo.cluster_utils import CorrCommunity
from demo.demo_ui import show_communities

## Create Nexus API

Nexus indexes [Chicago Open Data](https://data.cityofchicago.org/) offline and stores the data in `data/demo.db`.

In [2]:
# Relative path to the demo database; it is resolved against the working
# directory set by os.chdir() in the first cell, so run that cell first.
# (Plain string literal: there is nothing to interpolate, so no f-prefix.)
conn_str = 'data/demo.db'
# Handle used by every Nexus call in the rest of the notebook.
nexus_api = API(conn_str)

# Persona 1: Researcher exploring a hypothesis

A researcher from a medical school has a dataset of asthma attack incidents recorded in hospitals across various zip codes in Chicago.

| Zip5\*   | enc_asthma\*\* | encAsthmaExac\*\*\* | AttackPer\*\*\*\*  |
|--------|------------|---------------|-----------|
| 60604.0| 10.0       | 1.0           | 0.1       |
| 60605.0| 47.0       | 7.0           | 0.15      |
| 60606.0| 33.0       | 13.0          | 0.39      |
| 60607.0| 12.0       | 3.0           | 0.25      |
| ...| ...       | ...          | ...      |

\* zipcode

\*\* Count of asthma visits 2009-2019, denominator.

\*\*\* Count of visits for asthma attacks (a.k.a., exacerbations) 2009-2019, numerator.

\*\*\*\* Asthma attacks as a fraction of all asthma visits (`encAsthmaExac / enc_asthma`).

**The researcher wants to find variables correlated with asthma attacks from Chicago Open Data.**

<!-- He finds that [Chicago Open Data](https://data.cityofchicago.org/) has a wealth of datasets on diverse societal aspects such as education, business, and crime in Chicago. He believes there are some variables in Chicago Open Data that are useful for his research. Thus, he adds Chicago Open Data as a data source in Nexus. -->

## Browse Data Assets

In [3]:
# Fetch the catalog from the Nexus API and open it in a D-Tale viewer for browsing.
catalog = nexus_api.get_catalog()
dtale.show(catalog)



You can use Nexus to look at a dataset in the catalog given the dataset id.

In [7]:
# Id of the dataset to inspect.
# NOTE(review): presumably any id listed in the catalog can be used here — confirm.
dataset_id = 'ijzp-q8t2_location_6'
# NOTE: `df` is rebound by the join_and_project cell further down, so it only
# holds this dataset until that cell runs.
df = nexus_api.get_agg_dataset(dataset_id)
dtale.show(df)



## Find correlations from an input table

In [8]:
# Search parameters. They are kernel globals and are reused unchanged by the
# "Control for variables" cell below.
dataset = 'asthma'
# Only a spatial granularity is given; the temporal one is left as None.
temporal_granularity = None
spatial_granularity = SPATIAL_GRANU.ZIPCODE
overlap_threshold = 5
correlation_threshold = 0.5

# Look for Pearson correlations involving the input table and browse the result.
correlations = nexus_api.find_correlations_from(
    dataset,
    temporal_granularity,
    spatial_granularity,
    overlap_threshold,
    correlation_threshold,
    correlation_type="pearson",
)
dtale.show(correlations)

total number of correlations: 219




## Display the detailed profile of a correlation

In [None]:
# Which entry of `correlations` to profile.
# NOTE(review): presumably a row index of the table shown by the previous cell — confirm.
correlation_idx = 9
nexus_api.show_correlation_profile(correlations, correlation_idx)

## Control for variables

In [9]:
# Variable(s) to control for in the correlation search.
control_variables = [
    Variable('chicago_income_by_zipcode_zipcode_6', 'avg_income_household_median'),
]

# Same search as before (dataset, granularities and thresholds come from the
# previous search cell), now with the control variables passed in.
df_control = nexus_api.find_correlations_from(
    dataset,
    temporal_granularity,
    spatial_granularity,
    overlap_threshold,
    correlation_threshold,
    correlation_type="pearson",
    control_variables=control_variables,
)
dtale.show(df_control)

total number of correlations: 50




## Assemble a dataset from multiple variables

In [10]:
# Pick one row of the controlled-correlation table and assemble the joined data for it.
row_idx = 10
# `.loc` selects by index label, so row_idx must be a label present in
# df_control's index. `prov` is rebound by the join_and_project cell below.
aligned, prov = nexus_api.get_joined_data_from_row(df_control.loc[row_idx])
dtale.show(aligned)



Nexus also offers a `join_and_project` API that can assemble a dataset from any set of given variables.

In [11]:
# Variables to assemble into a single dataset, one Variable per line.
variables = [
    Variable('divg-mhqk_location_6', 'count'),
    Variable('4u6w-irs9_location_6', 'avg_square_feet'),
]

# Returns the joined data together with its provenance; both `df` and `prov`
# replace the values bound by earlier cells.
df, prov = nexus_api.join_and_project(variables)
dtale.show(df)



Nexus provides the data provenance information for all data assembly APIs.

In [None]:
# `prov` is assigned by both data-assembly cells above (get_joined_data_from_row
# and join_and_project); this prints whichever of them was executed last.
print(prov)

## Regression Analysis

When you find multiple intriguing correlations and wish to conduct further regression analysis on variables of interest, you can begin by utilizing Nexus's `join_and_project` function to compile the necessary dataset. Subsequently, you may employ any data analysis library for regression analysis. In this instance, we will illustrate the process using `scikit-learn`.

In [None]:
from sklearn import linear_model

# Outcome variable and predictors for the regression.
dependent_variable = Variable('asthma_Zip5_6', 'avg_enc_asthma')
independent_variables = [
    Variable('ijzp-q8t2_location_6', 'count'),
    Variable('n26f-ihde_pickup_centroid_location_6', 'avg_tip'),
]

# Assemble one table holding the outcome followed by the predictors.
data_to_analyze, provenance = nexus_api.join_and_project(
    [dependent_variable, *independent_variables]
)

# Columns are selected by each Variable's attr_name.
x = data_to_analyze[[variable.attr_name for variable in independent_variables]]
y = data_to_analyze[dependent_variable.attr_name]

# Apply any data analysis method; here, an OLS regression.
regression_model = linear_model.LinearRegression()
model = regression_model.fit(x, y)
r_squared = model.score(x, y)

print("coefficients of each independent variables:", model.coef_)
print("r square score:", r_squared)

# Persona 2: Data-Driven Hypothesis Generation

In [None]:
# Ask the demo helper for all correlations at monthly temporal and tract-level
# spatial granularity, then report how many were found.
chicago_correlations = nexus_demo.find_all_correlations(TEMPORAL_GRANU.MONTH, SPATIAL_GRANU.TRACT)
print(f"Nexus found {len(chicago_correlations)} correlations in total")

## Correlation Distillation Using Nexus Variable Clusters

In [None]:
# Group the correlations found above into communities (variable clusters).
variable_clusters = nexus_demo.get_correlation_communities(chicago_correlations)
# The length of `.comps` is what is reported as the number of clusters.
print(f"Nexus extracts {len(variable_clusters.comps)} variable clusters out of {len(chicago_correlations)} correlations")

### Examine Correlation Communities

In [None]:
# Render the variable clusters with the demo UI helper.
# NOTE(review): show_corr_in_same_tbl=False presumably hides correlations between
# variables that come from the same table — confirm against demo_ui.
show_communities(variable_clusters, show_corr_in_same_tbl=False)