# Setup

In [None]:
# Standard-library modules; `os` is used just below to locate and enter the
# project directory.
import sys
import os
# Change the working directory to the `nexus_correlation_discovery` checkout
# under the current user's home directory.
# NOTE(review): presumably this is what lets the project-local imports below
# (utils, demo, nexus_api, data_search) resolve — confirm.
home_dir=os.path.expanduser('~')
os.chdir(f"{home_dir}/nexus_correlation_discovery/")
from utils.time_point import T_GRANU
from utils.coordinate import S_GRANU
from demo.demo_ui import show_df
from nexus_api import API
from data_search.data_model import Var
from sklearn import linear_model

# Passed as `use_qgrid=` to the `show_df` calls in the cells below.
use_qgrid = True

# Create Nexus API

In [None]:
# Connection string handed to the Nexus API.
# The default value is tied to one developer's local PostgreSQL user and
# database, so it can be overridden through the NEXUS_CONN_STR environment
# variable; when that variable is unset the original value is used unchanged.
conn_str = os.environ.get(
    "NEXUS_CONN_STR",
    "postgresql://yuegong@localhost/chicago_1m_zipcode",
)
nexus_api = API(conn_str)

## Browse Data Assets

You can use Nexus to browse the data catalog as well as individual datasets.

### Download output data products

Every dataframe displayed by Nexus is associated with a download button. After the button is clicked, the dataframe will be downloaded to the root directory of `nexus_correlation_discovery`. The name of the downloaded dataframe is the one you specified in the name field of `show_df`.

In [None]:
# show catalog
# Fetch the catalog from the Nexus API and display it with `show_df`;
# "catalog" is the name passed to `show_df` for this dataframe.
catalog = nexus_api.show_catalog()
show_df(catalog, name="catalog", use_qgrid=use_qgrid)

In [None]:
# show original dataset
# `show_raw_dataset` returns two values: the dataset as a dataframe and a link.
# The link is printed, then the dataframe is displayed under the dataset id.
dataset_id = 'divg-mhqk'
df, link=nexus_api.show_raw_dataset(id=dataset_id)
print(link)
show_df(df, name=dataset_id, use_qgrid=use_qgrid)

In [None]:
"""
Show aggregated dataset

For example, 4u6w-irs9_location_6 is an aggregated table,
which is created over original table 4u6w-irs9 by aggregating its spatial attribute `location`
to the zipcode granularity (zipcode granularity is mapped to 6 in Nexus).
"""
# Look up the aggregated table by its full name and display it under that name.
agg_tbl_name = '4u6w-irs9_location_6'
df = nexus_api.show_agg_dataset(agg_tbl_name)
show_df(df, name=agg_tbl_name, use_qgrid=use_qgrid)

# Find correlations from an input table

## Inputs:
- `dataset`: dataset id
- `t_granu`: temporal granularity
- `s_granu`: spatial granularity
- `overlap_t`: overlap threshold for joinable detection
- `r_t`: correlation coefficient threshold
- `corr_type`: correlation type: `pearson`, `spearman`, `kendall`

## Outputs:
A list of correlations; each correlation has the following attributes.

- `tbl_id`: table id, `table_name`: table name
- `agg_tbl`: the table name of the aggregated table. For example, 4u6w-irs9_location_6 is an aggregated table, which is created over original table 4u6w-irs9 by aggregating its spatial attribute `location` to the zipcode granularity (zipcode granularity is mapped to 6 in Nexus).

- `r_val` is the correlation coefficient.

- `p_val` is the p value for the correlation

- `missing_ratio_o2` is the fraction of missing values in the original attribute before any aggregation.

- `samples` is the number of rows used to calculate the correlation.


In this example, the input is the asthma dataset. We align the input with tables from Chicago Open Data and compute correlations. Tables from Chicago Open Data originally have the spatial granularity of geo-coordinates. We aggregate them to the zipcode level and apply the aggregate functions "avg" and "count". For example, if you see an attribute named "avg_basketball_courts", it means the original attribute is "basketball_courts" and the function "avg" was applied to it. In the displayed dataframe, you can sort on one dimension, filter rows using keywords, etc.

In [None]:
# Input table and the granularities at which it is aligned with other tables:
# no temporal granularity, zipcode spatial granularity.
dataset = 'asthma'
t_granu = None
s_granu = S_GRANU.ZIPCODE

# Thresholds: overlap threshold for joinable detection and the correlation
# coefficient threshold.
overlap_t = 5
r_t = 0.5

# you can change corr_type to 'spearman' or 'kendall'
df = nexus_api.find_correlations_from(
    dataset,
    t_granu,
    s_granu,
    overlap_t,
    r_t,
    corr_type="pearson",
)
df_formatted = show_df(df, name='asthma_corrs', use_qgrid=use_qgrid)

# Assemble a dataset from multiple variables

1. `df, prov = get_aligned_data(correlation)`: this API takes a correlation as input and returns the merged dataset used to calculate that correlation.

2. `df, prov = assemble(vars, constraints: [optional])`: this API creates a dataset that merges all variables specified in `vars`. `constraints` is a mapping between a table name and the constraint on that table when performing the join operation. For example, `{'tbl_A': 2}` means spatio-temporal units with fewer than 2 samples are discarded.

The data assembly APIs also return `prov`, which is the provenance information of the resulting dataset.


In [None]:
# Index label of the correlation (a row of the correlations dataframe `df`)
# whose underlying merged dataset should be retrieved. Change `row_idx` to
# inspect a different correlation.
row_idx = 0
# Use `row_idx` rather than a hard-coded 0 so the setting above takes effect.
aligned, prov = nexus_api.get_aligned_data(df.loc[row_idx])
show_df(aligned, name="asthma_corrs_aligned", prov=prov, use_qgrid=use_qgrid)

In [None]:
# without constraint
# Variables to merge, each given as (aggregated table name, attribute name).
# Named `variables` rather than `vars` so the builtin `vars()` is not shadowed.
variables = [
    Var('divg-mhqk_location_6', 'count'),
    Var('4u6w-irs9_location_6', 'avg_square_feet'),
]
# `assemble` returns the merged dataframe and its provenance information.
df, prov = nexus_api.assemble(variables)
show_df(df, name="divg-mhqk_4u6w-irs9", prov=prov, use_qgrid=use_qgrid)

In [None]:
# with constraint, units with number of samples smaller than 2 are dropped
# Variables to merge, each given as (aggregated table name, attribute name).
# Named `variables` rather than `vars` so the builtin `vars()` is not shadowed.
variables = [
    Var('divg-mhqk_location_6', 'count'),
    Var('4u6w-irs9_location_6', 'avg_square_feet'),
]
# Per-table constraint applied when joining, keyed by aggregated table name.
constraints = {'divg-mhqk_location_6': 2, '4u6w-irs9_location_6': 2}
df, prov = nexus_api.assemble(variables, constraints)
show_df(df, name="divg-mhqk_4u6w-irs9_sample_greater_than_2", prov=prov, use_qgrid=use_qgrid)

# Regression Analysis

When you have found several interesting correlations and want to run a further regression analysis over the variables of interest, you can use the `regress` API of Nexus:

model, rsq, data = nexus_api.regress(dep_var, ind_vars, reg_model)

Input:

`dep_var`: the dependent variable in a regression analysis

`ind_vars`: independent variables (regressors)

`reg_model`: regression model

Return:

`model`: resulting model

`rsq`: r squared score

`data`: the dataset used for the regression (this dataset is assembled by merging all variables that are initially scattered across different tables)

In [None]:
# Dependent variable and regressors, each given as
# (aggregated table name, attribute name).
dep_var = Var('asthma_Zip5_6', 'avg_enc_asthma')
ind_vars = [Var('ijzp-q8t2_location_6', 'count'), Var('n26f-ihde_pickup_centroid_location_6', 'avg_tip')]
reg_model = linear_model.LinearRegression() # OLS regression
# `regress` returns the resulting model, the r squared score, and the dataset
# used for the regression (`merged` is not used further in this cell).
model, rsq, merged = nexus_api.regress(dep_var, ind_vars, reg_model)
print("coefficients of each independent variables:", model.coef_)
print("r square score:", rsq)