# Set up


## Imports

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import sys
import os
from pathlib import Path
import math

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
# Seaborn plot defaults: "talk" scaling context drawn on a white grid background.
sns.set_context("talk")
sns.set_style(style='whitegrid')

In [4]:
# Load the "autoreload" extension so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [5]:
import pandas as pd
from pandas import IndexSlice as idx
import numpy as np

from collections import OrderedDict

from munch import Munch
from tqdm import tqdm

In [6]:
# import crunchers.sklearn_helpers.exploration as expl
# import crunchers.statsmodels_helpers.lazy_stats as stats
# import crunchers.pandas_helpers.transformations as xform

In [9]:
import biorep_etl.data.load_recode as loading

## Constants

### Get info for converting/moving this file to `reports`

In [None]:
# %%javascript
# var kernel = IPython.notebook.kernel;
# var thename = window.document.getElementById("notebook_name").innerHTML;
# var command = "IPYNB_NAME = " + "'"+thename+"'";
# kernel.execute(command);

In [None]:
# rv = Munch()

# rv.rep_id = 'yyy-mm-dd'
# rv.desc = 'blank'
# rv.ipnb = Path('{IPYNB_NAME}.ipynb'.format(IPYNB_NAME=IPYNB_NAME))
# rv.html = '{ipnb}.html'.format(ipnb=rv.ipnb.stem)
# rv.files = '{ipnb}_files'.format(ipnb=rv.ipnb.stem)
# rv.reprt_dir = '../../reports/{IPYNB_NAME}/'.format(IPYNB_NAME=IPYNB_NAME)

### Other Constants

## Paths

In [11]:
# Relative paths to the raw data export and its companion labels export
# (both file names carry the same 2016-12-22_1633 export timestamp).
data_ = "../data/raw/SnapperGIBioreposito_DATA_2016-12-22_1633.csv"
labels_ = "../data/raw/SnapperGIBioreposito_DATA_LABELS_2016-12-22_1633.csv"

## Functions

## Loading

In [39]:
# Load both raw files through the project loader. The returned object exposes
# the loaded DataFrame as `data.data`, which the cells below work on.
data = loading.init_load_data_and_labels(data_=data_, labels_=labels_)

# Explore

In [40]:
def extract_lvl1(col):
    """Return the level-1 group name for a column name.

    Everything before the first ``'___'`` separator is the group; a name
    without the separator is returned unchanged.
    """
    if '___' not in col:
        return col
    prefix, _, _ = col.partition('___')
    return prefix

In [50]:
# Pair every column name with its level-1 group: [group, original_name].
# Reading the names through get_level_values(-1) keeps this cell re-runnable:
# on the flat column Index it is the Index itself, and once the two-level
# MultiIndex has been assigned further down it is the innermost level, which
# still holds the original column names.
premidx = [[extract_lvl1(col), col] for col in data.data.columns.get_level_values(-1)]

In [51]:
# Sanity check: transpose the first six [group, original] pairs into two parallel tuples.
list(zip(*premidx[:6]))

[('record_id',
  'samplenumber',
  'label_on_sample',
  'mrn',
  'biorepidnumber',
  'completed_by'),
 ('record_id',
  'samplenumber',
  'label_on_sample',
  'mrn',
  'biorepidnumber',
  'completed_by')]

In [52]:
# Transpose the pairs into two parallel sequences and use them as the levels of
# the column MultiIndex: 'a' holds the extract_lvl1 result, 'b' the original name.
midx = pd.MultiIndex.from_arrays(list(zip(*premidx)), names=['a','b'])

In [53]:
# Keep the flat, original column names so they can be restored later.
# get_level_values(-1) returns the Index itself while the columns are still
# flat, and the innermost (original-name) level if this cell is re-run after
# the MultiIndex has been assigned, so the MultiIndex is never captured here.
columns_orig = data.data.columns.get_level_values(-1)

In [54]:
# Replace the flat column names with the two-level index. This mutates the
# loaded frame in place, so any earlier cell that reads `data.data.columns`
# sees the MultiIndex if it is re-run after this one.
data.data.columns = midx
data.data.head()

a,record_id,samplenumber,label_on_sample,mrn,biorepidnumber,completed_by,date_crf_completed,sample_date,registryconsent,biorepconsent,...,othermed,date_labs,hct,wbc,plt,crp,esr,alb,notes,ibd_biorepository_sample_database_complete
b,record_id,samplenumber,label_on_sample,mrn,biorepidnumber,completed_by,date_crf_completed,sample_date,registryconsent,biorepconsent,...,othermed,date_labs,hct,wbc,plt,crp,esr,alb,notes,ibd_biorepository_sample_database_complete
0,1,5231301000.0,1004,2127951,1004.0,,,2012-12-25,0.0,1.0,...,Lo ovral,,37.5,8.32,288.0,0.47,13.0,4.3,,2
1,2,8051400000.0,433.2-MX,2035198,433.2,,,2014-04-08,0.0,1.0,...,,,,,,,,,,2
2,3,8161400000.0,433.2-NX,2035198,433.2,,,2014-07-11,0.0,1.0,...,,,,,,,,,,2
3,4,4241300000.0,1,1042672,1.0,,,2013-02-20,1.0,1.0,...,INH,,31.4,2.83,233.0,0.09,19.0,4.3,,2
4,5,1151300000.0,2,2237636,2.0,,,2012-03-10,1.0,1.0,...,Ursodeoxycholic acid,,40.2,6.42,310.0,0.29,13.0,4.7,,2


In [59]:
# Transposed view: one row per (a, b) column pair, one column per record.
data.data.T

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,2794,2795,2796,2797,2798,2799,2800,2801,2802,2803
a,b,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
record_id,record_id,1,2,3,4,5,6,7,8,9,10,...,2825,2826,2827,2828,2829,2830,2831,2832,2833,2834
samplenumber,samplenumber,5.2313e+09,8.0514e+09,8.1614e+09,4.2413e+09,1.1513e+09,7.0313e+09,1.11213e+10,1.11213e+10,9.1013e+09,4.0913e+09,...,1.21416e+10,1.21416e+10,1.21416e+10,1.21916e+10,1.21916e+10,1.21916e+10,8.0516e+09,8.0516e+09,8.2316e+09,1.13016e+10
label_on_sample,label_on_sample,1004,433.2-MX,433.2-NX,1,2,3,4,4,5,7,...,CHB746,CHB747,2153,1597,BCH1597,1899,1919,1919,1919,1748
mrn,mrn,2127951,2035198,2035198,1042672,2237636,4243292,4300961,4300961,4322619,4038229,...,4530344,5002658,4854538,4528601,4528601,4950076,4559381,4559381,4559381,2106899
biorepidnumber,biorepidnumber,1004,433.2,433.2,1,2,3,4,4,5,7,...,1995,1995.2,2153,1597,1597,1899,1919.1,1919,1919,1748
completed_by,completed_by,,,,,,,,,,,...,8,8,8,8,8,5,2,2,2,3
date_crf_completed,date_crf_completed,,,,,,,,,,,...,2016-01-02,2016-09-11,2016-08-02,2016-09-14,2016-10-07,2016-07-18,2016-07-24,2016-08-12,2016-08-21,2016-07-21
sample_date,sample_date,2012-12-25,2014-04-08,2014-07-11,2013-02-20,2012-03-10,2013-02-01,2013-08-13,2013-06-19,2013-05-18,2012-11-05,...,2016-01-01,2016-09-10,2016-08-01,2016-09-14,2016-10-07,2016-07-18,2016-03-09,2016-03-28,2016-04-24,2016-06-30
registryconsent,registryconsent,0,0,0,1,1,1,1,1,1,1,...,1,0,0,1,1,0,1,1,1,0
biorepconsent,biorepconsent,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [28]:
def make_midx(df):
    """Placeholder, not implemented yet: ignores ``df`` and returns ``None``.

    NOTE(review): presumably meant to wrap the column-MultiIndex construction
    that is currently done inline in the "Explore" cells -- confirm before
    filling it in.
    """
    # TODO: lvl1 = ...
    return None

    

# Convert to HTML report and move to reports folder

In [None]:
# rv.reprt_dir

In [None]:
# !jupyter nbconvert --to html_toc $rv.ipnb

In [None]:
# !mkdir -p $rv.reprt_dir
## !mv -f $rv.html $rv.files $rv.reprt_dir