# Set up


## Imports

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import sys
import os
from pathlib import Path
import math

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
# Plot appearance: "talk" scaling with the white-grid style.
# font_scale=1 and rc=None are seaborn's defaults, so they are left implicit.
sns.set_context("talk")
sns.set_style("whitegrid")

In [4]:
# Load the "autoreload" extension.
# Mode 2 re-imports all modules before each cell executes, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
import pandas as pd
from pandas import IndexSlice as idx
import numpy as np

from collections import OrderedDict

from munch import Munch
from tqdm import tqdm

In [6]:
# import crunchers.sklearn_helpers.exploration as expl
# import crunchers.statsmodels_helpers.lazy_stats as stats
# import crunchers.pandas_helpers.transformations as xform

In [7]:
import biorep_etl.data.load_recode as loading

## Constants

### Get info for converting/moving this file to `reports`

In [8]:
# %%javascript
# var kernel = IPython.notebook.kernel;
# var thename = window.document.getElementById("notebook_name").innerHTML;
# var command = "IPYNB_NAME = " + "'"+thename+"'";
# kernel.execute(command);

In [9]:
# rv = Munch()

# rv.rep_id = 'yyy-mm-dd'
# rv.desc = 'blank'
# rv.ipnb = Path('{IPYNB_NAME}.ipynb'.format(IPYNB_NAME=IPYNB_NAME))
# rv.html = '{ipnb}.html'.format(ipnb=rv.ipnb.stem)
# rv.files = '{ipnb}_files'.format(ipnb=rv.ipnb.stem)
# rv.reprt_dir = '../../reports/{IPYNB_NAME}/'.format(IPYNB_NAME=IPYNB_NAME)

### Other Constants

## Paths

In [10]:
# Paths are relative to the notebook's working directory.
# `data_`      : raw data export (CSV), passed to the loader below.
# `data_dict_` : matching data dictionary (CSV) describing each field.
data_ = "../data/raw/SnapperGIBioreposito_DATA_2016-12-22_1633.csv"
data_dict_ = "../data/raw/SnapperGIBiorepository_DataDictionary_2016-12-06.csv"

## Functions

## Loading

In [55]:
# Load the data dictionary; later cells index the result like a DataFrame
# (e.g. data_dict['Field Type']).
data_dict = loading.load_data_dict(data_dict_=data_dict_)

In [45]:
# Load the raw export together with its labels. Later cells use `data.data`
# (the table itself) and `data.field_map` (variable name -> field label).
data = loading.init_load_data_and_labels(data_=data_, data_dict_=data_dict_)

# Explore

In [46]:
# Independent copy of the loaded table for exploration; `data.data` itself
# is relabelled further down, and `raw` is unaffected by that.
raw = data.data.copy()

In [70]:
# Frequency of each `ibdyesno` code (missing values are not counted).
raw['ibdyesno'].value_counts()

1.0    1818
3.0     592
2.0     391
Name: ibdyesno, dtype: int64

In [73]:
# Number of non-null `record_id` values per (biorepidnumber, ibdyesno) pair.
# The column is selected before aggregating so that only `record_id` is
# counted, instead of counting every column and discarding the rest.
raw.groupby(by=['biorepidnumber', 'ibdyesno'])['record_id'].count()

biorepidnumber  ibdyesno
1.0             1.0          1
2.0             1.0          1
3.0             1.0          1
4.0             1.0          7
5.0             1.0          1
6.0             1.0          1
7.0             1.0          1
8.0             1.0          3
14.0            1.0          2
14.2            3.0          1
16.0            1.0          3
16.2            3.0          2
16.3            3.0          1
17.0            1.0          2
19.0            1.0          3
19.2            3.0          1
19.3            3.0          1
22.0            1.0          2
25.0            1.0          3
33.0            1.0          1
39.0            1.0          1
42.0            1.0         25
47.0            1.0          1
50.0            1.0          1
52.0            1.0          1
53.0            1.0          2
54.0            1.0          2
56.0            1.0          1
58.0            1.0          4
61.0            1.0          1
                            ..
2103.3        

In [54]:
# Look up the human-readable label recorded for the `familymember_ibd` field.
data.field_map['familymember_ibd']

'Which family member(s)?'

In [59]:
# Data-dictionary rows whose field type is 'checkbox'. In the export these
# fields appear as several `<field>___<code>` columns
# (e.g. familymember_ibd___1 ... familymember_ibd___6).
checkboxes = data_dict[data_dict['Field Type'] == 'checkbox']
checkboxes

Unnamed: 0,Variable / Field Name,Form Name,Section Header,Field Type,Field Label,"Choices, Calculations, OR Slider Labels",Field Note,Text Validation Type OR Show Slider Number,Text Validation Min,Text Validation Max,Identifier?,Branching Logic (Show field only if...),Required Field?,Custom Alignment,Question Number (surveys only),Matrix Group Name,Matrix Ranking?,Field Annotation
15,familymember_ibd,ibd_biorepository_sample_database,,checkbox,Which family member(s)?,"1, Mother | 2, Father | 3, Full brother | 4, F...",,,,,,[family_history] = '1',,,,,,
36,oral_perianal,ibd_biorepository_sample_database,,checkbox,Does the patient have a history of oral or per...,"1, Oral | 2, Perianal",,,,,,[neopics] = '1',,,,,,
42,blood_samples,ibd_biorepository_sample_database,,checkbox,Which blood samples were collected?,"5, DNA | 3, PAX gene | 2, PBMCs | 1, Serum | 6...",,,,,,[sampletype] = '1',,,,,,
45,stoolmedia,ibd_biorepository_sample_database,,checkbox,What kind(s) of media were the samples stored in?,"1, Freeze | 2, RNA later | 3, Ethanol | 4, DNA...",,,,,,[sampletype] = '3',,,,,,
47,bxlocation,ibd_biorepository_sample_database,,checkbox,Where were samples obtained during the endosco...,"1, Stomach Antrum inflamed | 11, Stomach Antru...",,,,,,[sampletype] = '4',,,,,,
48,surgicallocations,ibd_biorepository_sample_database,,checkbox,What tissue locations were obtained from surgi...,"1, Ileum inflamed | 9, Ileum uninflamed | 2, C...",location of surgical resection samples,,,,,[sampletype] = '6',,,,,,
49,where_stored,ibd_biorepository_sample_database,,checkbox,Were the samples utilized immediately?,"1, Stored in the biorepository | 2, Given to C...",,,,,,[sampletype] = '1' or [sampletype] = '4' or [s...,,,,,,
86,asa_type,ibd_biorepository_sample_database,,checkbox,Which aminosalicylates are currently being used?,"1, Mesalamine (Asacol, Canasa, Pentasa, Rowasa...",,,,,,[current_5asa] = '1',,,,,,
88,antibiotic_type,ibd_biorepository_sample_database,,checkbox,Which antibiotic(s)?,"1, Amoxicillin (Amoxil), Amoxicillin/Clavulana...",,,,,,[current_antibiotic] = '1',,,,,,
91,probiotic_type,ibd_biorepository_sample_database,,checkbox,Which probiotic(s)?,"1, VSL3 | 2, LGG (Culturelle) | 3, Florastor |...",,,,,,[current_probiotic] = '1',,,,,,


## MultiIndex

In [12]:
def extract_lvl1(col):
    """Return the group (level-1) name for a column label.

    Labels of the form ``<field>___<code>`` are reduced to ``<field>``, the
    text before the first ``'___'``. Labels without ``'___'`` are returned
    unchanged.
    """
    if '___' not in col:
        return col
    # Only the text before the first separator is needed.
    return col.split('___', 1)[0]

In [13]:
# One [group, column] pair per column of the loaded table.
premidx = [[extract_lvl1(name), name] for name in data.data.columns]

In [14]:
# Preview: turn the first six [group, column] pairs into two parallel tuples,
# the per-level layout that MultiIndex.from_arrays is given below.
list(zip(*premidx[:6]))

[('record_id',
  'samplenumber',
  'label_on_sample',
  'mrn',
  'biorepidnumber',
  'completed_by'),
 ('record_id',
  'samplenumber',
  'label_on_sample',
  'mrn',
  'biorepidnumber',
  'completed_by')]

In [15]:
# Two-level column index: level 'Groups' holds the shared prefix, level
# 'Columns' the original column name.
midx = pd.MultiIndex.from_arrays(
    [list(level) for level in zip(*premidx)],
    names=['Groups', 'Columns'],
)

In [20]:
# Keep the original flat column labels so they can be put back after the
# MultiIndex experiment (see the "family history" section).
columns_orig = data.data.columns

In [21]:
# NOTE: this relabels the columns of `data.data` itself (no copy is made).
# Until `columns_orig` is assigned back, flat lookups such as
# `data.data.familymembergidx` will not work on this frame.
data.data.columns = midx
data.data.head()

Groups,record_id,samplenumber,label_on_sample,mrn,biorepidnumber,completed_by,date_crf_completed,sample_date,registryconsent,biorepconsent,...,othermed,date_labs,hct,wbc,plt,crp,esr,alb,notes,ibd_biorepository_sample_database_complete
Columns,record_id,samplenumber,label_on_sample,mrn,biorepidnumber,completed_by,date_crf_completed,sample_date,registryconsent,biorepconsent,...,othermed,date_labs,hct,wbc,plt,crp,esr,alb,notes,ibd_biorepository_sample_database_complete
0,1,5231301000.0,1004,2127951,1004.0,,,2012-12-25,0.0,1.0,...,Lo ovral,,37.5,8.32,288.0,0.47,13.0,4.3,,2
1,2,8051400000.0,433.2-MX,2035198,433.2,,,2014-04-08,0.0,1.0,...,,,,,,,,,,2
2,3,8161400000.0,433.2-NX,2035198,433.2,,,2014-07-11,0.0,1.0,...,,,,,,,,,,2
3,4,4241300000.0,1,1042672,1.0,,,2013-02-20,1.0,1.0,...,INH,,31.4,2.83,233.0,0.09,19.0,4.3,,2
4,5,1151300000.0,2,2237636,2.0,,,2012-03-10,1.0,1.0,...,Ursodeoxycholic acid,,40.2,6.42,310.0,0.29,13.0,4.7,,2


In [22]:
# `df` is a second name for the same frame as `data.data`, not a copy.
df = data.data

In [23]:
# Build the (Groups, Columns) lookup table directly from the column
# MultiIndex. Only the labels are needed, so the frame is never transposed,
# and no column is added to a slice of another frame.
t_gc = pd.DataFrame(list(df.columns), columns=['Groups', 'Columns'])

# A column whose group equals its own name has no '___' sub-fields; all such
# columns are filed under one shared group label.
t_gc['Groups'] = t_gc['Groups'].mask(t_gc['Groups'] == t_gc['Columns'], 'top_lvl_group')
t_gc

Unnamed: 0,Groups,Columns
0,top_lvl_group,record_id
1,top_lvl_group,samplenumber
2,top_lvl_group,label_on_sample
3,top_lvl_group,mrn
4,top_lvl_group,biorepidnumber
5,top_lvl_group,completed_by
6,top_lvl_group,date_crf_completed
7,top_lvl_group,sample_date
8,top_lvl_group,registryconsent
9,top_lvl_group,biorepconsent


In [None]:
def label_common_group(df):
    """Unfinished stub: transposes ``df`` and returns ``None``.

    TODO: presumably meant to wrap the 'top_lvl_group' relabelling done in
    the cell above; confirm the intended return value before completing it.
    """
    transposed = df.T  # computed but not used yet

In [None]:
# Transposed view: one row per column of `data.data`.
# NOTE(review): this transposes the full table just for display;
# `data.data.head().T` would show the same layout from a small slice.
data.data.T

In [None]:
def make_midx(df):
    """Unfinished stub: ignores ``df`` and returns ``None``.

    TODO: presumably meant to build the (Groups, Columns) MultiIndex for
    ``df``; the original draft stopped at an empty ``lvl1 =`` assignment.
    """
    return None

    

## Family history

In [40]:
# Put the original flat column labels back on `data.data`, undoing the
# MultiIndex relabelling from the previous section.
data.data.columns = columns_orig

In [41]:
# Keep only the columns whose name contains 'family' (regex search).
fam_hist = data.data.filter(regex='family')
fam_hist.head()

Unnamed: 0,family_history,familymember_ibd___1,familymember_ibd___2,familymember_ibd___3,familymember_ibd___4,familymember_ibd___5,familymember_ibd___6,familymembertype,familymembergidx,family_otherdiagnosis
0,0.0,0,0,0,0,0,0,,,
1,,0,0,0,0,0,0,1.0,1.0,
2,,0,0,0,0,0,0,1.0,1.0,
3,1.0,0,0,0,0,1,0,,,
4,0.0,0,0,0,0,0,0,,,


In [44]:
# Rows with a non-null `familymembergidx`, shown alongside `ibdyesno`.
data.data.loc[
    data.data['familymembergidx'].notnull(),
    ['ibdyesno', 'familymembergidx'],
]

Unnamed: 0,ibdyesno,familymembergidx
1,3.0,1.0
2,3.0,1.0
103,3.0,1.0
147,3.0,1.0
148,3.0,1.0
165,3.0,1.0
167,3.0,1.0
207,3.0,1.0
208,3.0,1.0
253,3.0,1.0


# Convert to HTML report and move to reports folder

In [None]:
# rv.reprt_dir

In [None]:
# !jupyter nbconvert --to html_toc $rv.ipnb

In [None]:
# !mkdir -p $rv.reprt_dir
## !mv -f $rv.html $rv.files $rv.reprt_dir