# Tutorial: Generation R metadata labelling 
Hi, welcome and thank you for helping us created a searchable data dictionary for Generation R. Make sure you have Python (version > 3.9.1) installed and the following packages: 
 - `pandas`
 - `string`

Ready to go :) 

In [1]:
import pandas as pd 
import numpy as np
import string
from typing import Union

# pd.options.display.max_colwidth = 100
pd.options.display.max_rows = None

In [2]:
# Specify which columns to read in and in which order 
useful_cols = ['var_name', # = variable name 
               'var_label', # = variable label (in English and containing the GR-code-section-number when applicable) 
               'var_type', # = variable type (numeric, factor, character) 
               'orig_file', # = File name(s) # '' = link(s) to wiki 
               'tot_n', # total numebr of observations in the file
               'missing', 
               'levels', 
               'descriptives'] # missing values in the variable

q = pd.read_csv("Extracted/Questionnaires_clean.csv", encoding='mac_roman', # index_col='var_name'
                usecols=useful_cols)[useful_cols] # [1:]

add_column = ['timepoint',     # = (median age of measurement) 
              'subject',       # = (subject)
              'n_observed',    #  = Number of observations ('n_total' - 'n_missing')
              'data_source',   # = (GR questionnaire, blood, urine, DXA scans, brain imaging …) 
              'gr_section',    # = (if applicable, GR questionnaire section (A-Z)
              'gr_qnumber',    # = (if applicable, GR number of the question)
              'reporter',      # = (if applicable, only for questionnaires: child, father, mother) 
              'var_comp',      # = variable computed (is it a score or the original item, for scores we need documentation) 
              'questionnaire', # (if applicable, only for validated questionnaires, name and abbreviaitons) 
              'questionnaire_ref', # (if validated questionnaire, reference it)
              'constructs']    # what constructs does this variable / questionnaire measure

for p in range(2,13):
    q.insert(loc=p, column=add_column[p-2], value=np.nan)

q.rename(columns = {'tot_n':'n_total', 'missing':'n_missing'}, inplace = True)

q.n_observed = q.n_total-q.n_missing

q.loc[q['var_label'] == np.nan] = " "
q.shape

(14335, 19)

In [3]:
# Extracted descriptives are a little hard to read let's simplify 
for i in range(len(q)):
    key = q.levels[i][3:-2].split("\", \"")
    key = [k.strip() for k in key]
    des = q.descriptives[i][2:-1].split(", ")
    des = [n.strip("\"") for n in des]
    des = [int(n) if n.isdigit() else round(float(n),2) if n not in ['character','','NA'] else n for n in des]
    dic = dict(zip(key,des[:-1]))
    q['descriptives'][i] = dic
q = q.drop('levels', axis=1)

In [4]:
# Fix file name error (repeated I dont know why...)
q.loc[q.orig_file.str.contains('GR1003-EMBU I1-I23_22112016; GR1003-EMBU I1-I23_22112016; GR1003-EMBU I1-I23_22112016; GR1003-EMBU I1-I23_22112016; GR1003-EMBU I1-I23_22112016; GR1003-EMBU I1-I23_22112016'),
                               'orig_file'] = 'GR1003-EMBU I1-I23_22112016'

In [5]:
print(list(q.columns))

['var_name', 'var_label', 'timepoint', 'subject', 'n_observed', 'data_source', 'gr_section', 'gr_qnumber', 'reporter', 'var_comp', 'questionnaire', 'questionnaire_ref', 'constructs', 'var_type', 'orig_file', 'n_total', 'n_missing', 'descriptives']


In [6]:
GR_ids = {'GR1001':['12-20 w','mother'], # The health of you and your child
          'GR1002':['15-23 w','mother'], # De voeding van moeder
          'GR1003':['20-25 w','mother'], # Feelings and memories
          'GR1004':['20-25 w','father'], # Health, lifestyle, background and feelings (partner)
          'GR1005':['30 w',   'mother'], # Living conditions
 # Postnatal (actual median age in months until 6y then in years)
          'GR1018':['2 m',  'mother', 2.8], # The first two months - your child
          'GR1019':['2 m',  'mother', 2.8], # The first two months - mother
          'GR1024':['6 m',  'mother', 6.3], # The first six months - mother
          'GR1025':['6 m',  'mother', 6.3], # The first six months - your child 
          'GR1028':['1 y',  'mother', 12.0],# My first year in Generation R
          'GR1060':['1 y',  'mother', 12.9],# De voeding van mijn kind - rond de 1e verjaardag
          'GR1029':['1.5 y','mother', 18.2],# My 1 / 1.5 year old toddler
          'GR1032':['2 y',  'mother', 24.2],# My todler
          'GR1064':['2 y',  'mother', 24.9],# De voeding van mijn kind – rond de 2e verjaardag
          'GR1062':['2.5 y','mother', 30.7],# My toddler’s development
          'GR1065':['3 y',  'mother', 36.2],# My three-year-old child
          'GR1066':['3 y',  'father', 36.4],# My three-year-old child (partner)
          'GR1067':['4 y',  'mother', 48.3],# My 4-year old child
          'GR1075':['5 y',  'mother', 71.4],# The development of my 5/6-year old child – Part I
          'GR1076':['5 y',  'mother', 72.6],# The development of my 5/6-year old child – Part II
          'GR1079':['6 y',  'teacher',74.9],# Gedragsvragenlijst voor kinderen t/m 18 jaar
          'GR1080':['8 y',  'mother', 8.07],# Diet and behavior
          'GR1081':['9 y',  'mother', 9.67],# Development of my 9/10 year old child - Part 1
          'GR1082':['9 y',  'mother', 9.79],# Development of my 9/10 year old child - Part 2
          'GR1083':['9 y',  'father', 9.74],# Development of my 9/10-year old child - Partner
          'GR1084':['9 y',  'child',  9.78],# Mijn eerste vragenlijst
          'GR1093':['13 y', 'mother',13.49],# My teenager part 1
          'GR1094':['13 y', 'mother',13.58],# My teenager part 2
          'GR1095':['13 y', 'child', 13.52],# Mijn vragenlijst – Deel 1
          'GR1096':['13 y', 'child', 13.71],# Mijn vragenlijst – Deel 2
          'GR1097':['13 y', 'mother',13.54]}# ? about ENT specialist visit ?

# Not included:
# GR1078 ? Cognition
# GR1086 ??
# GR1031_0y?_De gezondheid van u en uw partner (partner) - included at birth?
# GR1073_3y?_Mijn kind op de basisschool                 - cohort South?
# GR1074_3y?_Mijn kind op de basisschool (partner)       - cohort South?
# GR1069_4y?_General information mother                  - used complete missing data
# GR1070_4y?_General information partner                 - used complete missing data
# GR1071, 72, 77: Child Health Care data 2005, 06, 07    - children living outside Rotterdam
# Unexpected symtoms: GR1027 (2m), GR1053 (6m), GR1058 (1y), GR1059 (2y), GR1061 (3y), GR1068 (4y)

# GR1075 and 76: we call it 5 y but it is actually 6, GR1081 to 84 call it 9 but actually 9.7 y

# Combined: GR1065 and 66, 75 and 76, 81  82 and 83, 93 and 96

In [7]:
# Construct the main assignment funtion. It takes only one obligatory argument: selected. 
def assign(selected: Union[str, list], # var_type, n_observed, orig_file, n_total, n_missing, descriptives
           based_on: str = 'orig_file', 
           case_sensy = False,
           sel_type = 'contains', # 'ends', 'starts', 'is'
           and_also: tuple = None, 
           verbose = True,
           print_labels = False,
           # assignment arguments 
           data_source: str = None, 
           timepoint: Union[str, list] = None, 
           reporter: Union[str, list] = None, 
           var_label: Union[str, list] = None, 
           subject: Union[str, list] = None, 
           gr_section: Union[str, list] = None, 
           gr_qnumber: Union[str, list] = None, 
           var_comp: Union[str, list] = None, 
           questionnaire: Union[str, list] = None, 
           questionnaire_ref: Union[str, list] = None, 
           constructs: Union[str, list] = None):
    
    # SELECTION --------------------------------------------------
    if based_on not in q.columns:
        print('There is no column called \"',based_on,'\"', sep='')
        return None
    
    selection = '|'.join(selected) if isinstance(selected, list) else selected # define the selection
    
    if sel_type == 'contains': # fix using regex i think
        sel = q[based_on].str.contains(selection, na=False, case=case_sensy) # perform the selection
    elif sel_type == 'starts':
        sel = q[based_on].str.startswith(selection, na=False) # perform the selection
    elif sel_type == 'ends':
        sel = q[based_on].str.endswith(selection, na=False) # perform the selection
    elif sel_type == 'is':
        sel = q[based_on] == selection # perform the selection
        
    if and_also: # additional constraints
        sel = (sel & q[and_also[0]].str.contains(and_also[1], na=False, case=case_sensy))
    
    data = q.loc[sel,]
    
    if len(data) < 1 :
        if verbose:
            print('Your selection (',selection,') resulted in 0 rows!',sep='')
        return None
    else:
        if verbose:
            print(len(data), 'rows selected.')
        if print_labels:
            print(list(q.loc[sel, 'var_label']))
    
    # ASSIGNMENT -------------------------------------------------
    if data_source:
        if data_source in GR_ids.keys():
            q.loc[sel, ['data_source', 'timepoint', 'reporter']] = data_source, GR_ids[data_source][0], GR_ids[data_source][1]
        else:
            q.loc[sel, 'data_source'] = data_source
    
    def match_length(arg, name):
        if isinstance(arg,list) and len(arg)!=len(data):
            print('The number of rows (',len(data),') and assigned values (',len(arg),') of ',name,' do not match',sep='')
            return False
        else:
            return True
    
    if timepoint:
        if not match_length(timepoint,'timepoint'):
            return None
        q.loc[sel, 'timepoint'] = timepoint        
    
    if reporter:
        if not match_length(reporter,'reporter'):
            return None
        q.loc[sel, 'reporter'] = reporter
    
    if gr_section:
        if not match_length(gr_section,'gr_section'):
            return None
        q.loc[sel, 'gr_section'] = gr_section
    
    if gr_qnumber:
        if not match_length(gr_qnumber,'gr_qnumber'):
            return None
        q.loc[sel, 'gr_qnumber'] = gr_qnumber
    
    if subject:
        if not match_length(subject,'subject'):
            return None
        q.loc[sel, 'subject'] = subject
    
    if var_label:
        if not match_length(var_label,'var_label'):
            return None
        q.loc[sel, 'var_label'] = var_label
        
    if var_comp:
        if not match_length(var_comp,'var_comp'):
            return None
        q.loc[sel, 'var_comp'] = var_comp
     
    if questionnaire:
        if not match_length(questionnaire,'questionnaire'):
            return None
        q.loc[sel, 'questionnaire'] = questionnaire
         
    if questionnaire_ref:
        if not match_length(questionnaire_ref,'questionnaire_ref'):
            return None
        q.loc[sel, 'questionnaire_ref'] = questionnaire_ref
         
    if constructs:
        if not match_length(constructs,'constructs'):
            return None
        q.loc[sel, 'constructs'] = constructs
    
    # Do not assign specific sources to ID variables 
    ids = ['IDM', 'idm', 'MOTHER', 'IDC']
   
    for id_var in ids:
        q.loc[q['var_name'] == id_var, 'var_comp'] = 'ID'
        q.loc[q['var_name'] == id_var, ['timepoint','data_source','gr_section','gr_qnumber','reporter',
                                        'questionnaire','questionnaire_ref', 'constructs']] = ' '
        sbj = 'child' if id_var =='IDC' else 'mother'
        q.loc[q['var_name'] == id_var, 'var_comp'] = sbj
        
    other = ['GENDER','FUPFASE2','ETHNM','INTAKE','intake'] # add age 
    for other_var in other:
        q.loc[q['var_name'] == other_var, ['timepoint','data_source','gr_section','gr_qnumber','reporter',
                                        'questionnaire','questionnaire_ref', 'constructs','var_comp']] = ' '
        sbj = 'child' if other_var in ['GENDER','FUPFASE2'] else 'mother'
        q.loc[q['var_name'] == other_var, 'var_comp'] = sbj
    
    show = q.loc[sel,] # .set_index('var_name')
    return(show)


In [8]:
def list_numbers(start, end, lvl1=None, lvl2=None):
    main = [str(i) for i in range(start,end+1)]
    if lvl1:
        sub1 = list(string.ascii_lowercase)[:lvl1]
        numb =[]
        for m in main:
            for s in sub1:
                numb.append(m+"."+s)
        
        if lvl2:
            sub2 = [str(i) for i in range(1,lvl2+1)]
            numb2 =[]
            for m in numb:
                for s in sub2:
                    numb2.append(m+"."+s)
            return(numb2)
        else:
            return(numb)
    else:
        return(main)

# def list_qnumbers(qr, section, dec=1):
#     o = q.loc[(q['data_source'] == qr)&(q['gr_section'] == section)]
#     main = [str(int(x)) for x in o.index.str[1:3]]
#     subn = [str(x) for x in o.index.str[6-dec:6]]
#     numb =[]
#     for (m, s) in zip(main, subn):
#         numb.append(m+"."+s)
#     print(numb)

# list_qnumbers('GR1002', 'C', 1)

# Tutorial
## Selection 
Choose a selection criteria using **`selected`**: this is the first argument of the `assign()` funtion. It can be a string (e.g., `'something'`) or a list of strings (e.g., `['something', 'something else']`).

You should use selected in combination with **`based_on`**, the second argument. This tells the function what column in the dataset you want to specify your selection on. Default is `'orig_file'`, which holds the names of the original .sav files. You can change this to any column in the dataset, for example use `based_on='var_name'` for selecting based on variable names, or `based_on='var_label'` if you want to use variable labels. At the moment the function only supports a *single value* of based_on, but do let me know if you need more flexibility. 

When a single selection criterium is not enough, for example if you want to select not only questionnare GR1001 but also section A (`data_source=='GR1001'` **and** `gr_section=='A'`), use the argument **`and_also`**. This is a tuple (_"a what?"_ - a kind of list with two elements in it) where the first element is the column name you want to additionally select on and the second element is the value(s) that you want it to take. Both the first and second elements should be a string. <ins>Trick</ins>: if you want to set the condition a bit more flexibly, for example select section `A` **or** `B` you can put a `|` in between the values, like this: `and_also=('gr_section','A|B')`. You can do this as many times as you want. 

By default, the selection is <ins>**not**</ins> **case sensitive**, but you can change that by setting the argument **`case_sensy = True`**.

Additionally, by default the function will search for any row _containing_ the strings you entered in `selected`. You may want to select rows that **start / end with** or correspond exactly to the string you entered, and you can do that by using the argument **`sel_type`**. This can take the following values: `'contains'` (default), `'ends'`, `'starts'` and `'is'`. <ins>Note</ins>: besides the default, contains, all other options are _case sensitive_.

If you want just check a selection criterium, without assigning any values, simply run `assign()` without any other arguments. For example:

In [9]:
# assign('GR1001', 'var_label') #,  sel_type = 'ends')#  # 'ends', 'starts', 'is') # and_also =('var_label','age'),

### <ins>Tip time</ins>: regular expressions (`regex` syntax)
If nose around the assign function you may notice how it uses regular espression to make the search more flexible. These are small symbols that can are used to establish rules or regularities. In other words you not only can search for something _containing_ 'feed' but also for example everything **ending** with **beginning** 'feed'. It gets better, you can say select something _ending with any number_ or _any capital letter followed by two numbers_ and so on... pretty cool yes. 

Here are some useful basic commands that you can leverage:
   - '^_' means "begins with". For example '^Bre' will get you everything starting with 'Bre'. 
   - '_\\$' means "ends with". For example '\\$' will give everything ending with '23'.

In [10]:
# assign('05$', 'var_label')

As usual, if soemthing is not working quite right or you need more flexibility in selection, let me know. but for now let's move on to assigning some values.

## Assignment
You can change the value of any column you like for the rows you have selected. Use the following arguments (one for each column that needs manual assigniments): 

**Note**: most of these three columns are <ins>already assigned</ins> automatically but please correct them if you spot errors or missing values.
* **`data_source`**: this specifies the *source* of the variable. <br> For Questionnaire data this will be either a GR-id or a "interview". If you assign a GR-id that is included in the `GR_ids` dictionary defined above, the function will automatically assign also `timepoint` and `reporter`. If the data source is new, however, please specify those manually. <br> For variables that are _scores_ combining multiple data sources, I tipically use the format `'GR1001-03'`.

    For measurements and other data, `data_source` can take values such as e.g., blood, urine, DXA scan, brain MRI ... (<font color='red'>*specify standard*</font>). 


* **`timepoint`**: when whas the value measured in child age. This is expressed in *weeks gestation* (`'w'`) for prenatal variables, in *months* (`'m'`) during the first year of life and in *years* (`'y'`) for all subsequent measures. There is a space in between the number and the time unit, see for instance the values specified in `GR_ids`. <font color='red'>*Note that this is not always reflecting the median age of the measurement.*</font>


* **`reporter`**: the person that completed the questionnaire / interview. It can take the following values: `'mother'`,`'father'`,`'child'`,`'teacher'`, `'mother & father'` (for combined scores). If not applicable, please set this value to `' '`.

> **Tip**: in python arguments are *positional* so you don't have to specify their name if you input them in the correct order. So for example you can simply specify the data source as the fifth argument. But of course spelling it out makes it clearer.

In [11]:
# assign('BREAST', 'var_label', timepoint=['1 y']*70+['i']*7)

These are the values that, more commonly will need to be assigned: 

* **`gr_section`**: this indicates the section on the questionnaire (`A` to `K`). It is a single capital letter. I would suggest starting from this column when you work on completing your table, so you can use them in the selection and assignment of other metadata. 


* **`gr_qnumber`**: indicates the number of the question / item. Note that this is also a _string_. This is sometimes indicated by the variable name or label, but I find it best to always check the questionnaire PDFs to make sure the number assigned corresponds. <br> Some questions have a more complex, nested structure with two or more levels. We encode this as follows: the first level is number and that is normally explicitly indicated on the PDF. For the second level we will use letters. Additional levels get numbers again. Levels are separated by a dot. <br> For example: 

    You don't need to type this yourself every time, you can use the `list_numbers()` function instead. This takes two obligatory arguments, `start` and `end` which you can use to indicate the range of numbers. So for instance `list_numbers(1,5)` will output `['1', '2', '3', '4', '5']`. You can then add sublevels using the arguments `lvl1` and `lvl2`. For example, `list_numbers(2, 3, lvl1=2)` will give `['2.a', '2.b', '3.a', '3.b']` and `list_numbers(2, 3, lvl1=1, lvl2=2)` will give `['2.a.1', '2.a.2', '3.a.1', '3.a.2']`.


* **`var_label`**: variable description. This is an important part of the dictionary and a terribly messy one too. Some labels are not there (~15%), some are in Dutch, some are just not understandable (e.g., they use acromyms that are not spelled out or they are just copies of variable names). PLEASE HELP US FIX THIS. 
    - For **items**, the label would normally correspond to the (complete) question that was asked, in English, as you can read it in the PDFs. If this is part of a nested question please specify the full question so that it is understandable on its own. 
    - For **scores**, please use the best description you can find, including both the full name of a score or measure and its acromym (when applicable) and measurement unit (when applicable). Example of the format: `'Body mass index (BMI), kg/m2'`. 
    Again, you don't need to do this one by one, you can use lists. For instance you can set the argument `print_labels = True` in the assign function to have a list of the variable labels in your selection. You can copy paste this, edit and assign it. 


* **`subject`**: who is the information about? It can take the following values: `'mother'`,`'father'`,`'child'`, `'family'`. If not applicable, please set this value to `' '`. Normally this is value can be assigned by section (look at the PDFs to quickly understand who the section is about). If you are unsure, feel free to ask me. 


* **`var_comp`**: this indicates if the variable is an `'item'` (i.e., a single question that was directly answered) or a `'score'` (i.e., a combination of items, for example a subscale total score). Other possible values include `'ID'` (i.e., for example 'IDC') or '`metadata'` (for example, child age).


* **`questionnaire`** and **`questionnaire_ref`**: some groups of items correspond to validates questionnaires / interviews. You can find most of these in the document 'Questionnaires Generation R with refs 2021', the 'GenR measumeremnt overview' and 'datataxonomy' spreadsheets or in the 'referencesquestionnairesgenr' PDFs. Please specify `questionnaire` as the full name of the instrument and its acronym at the end between brackets. `questionnaire_ref` should be a link to the instrument reference or manual. Preferably, this is the DOI (preceded by 'https://doi.org/'). When this is not available you can use another link to the reference, or to a Generation R paper that describes the instrument. 


* **`constructs`**: what is this variable measuring? To help make the search more flexible, we also want to tag varaiables with the concepts they are supposet to tap into. For example maternal smoking variables could carry the tags 'smoking','tabacco','cigarettes'. Please include 1-3 terms that you believe apply, thi can be also general e.g. 'psychopatology'. Separate terms with ';'.


* **other stuff**: `n_observed`,`var_type`,`orig_file`,`n_total`, `n_missing`, `desctiptives`.

In [12]:
# Assign questionnaire ID based on file names -------------

# common parameters
b = 'orig_file' # this section will be all based on original file names
v = False # set verbose argument to false 

# First take advantage of the GR_ids dictionary. 
for quest in GR_ids.keys():
    assign(quest, based_on='orig_file', data_source = quest, verbose=False);

# Other files do not contain the GR-id in the file names but when the variable name 
# contains the GR-id or the format letter (section) + 7 digits, the last two digits
# are used to attribute GR-id.
for n in range(1,100):
    i = str(n).rjust(2, '0') # 01,02,03...99
    assign(['^[A-Za-z]\d{5}'+i, 'GR10'+i], based_on='var_name',
           data_source = 'GR10'+i, verbose=False)

# Attention: some exeptions to this rule (error in variable names)
assign('g0104101', based_on='var_name', data_source = 'GR1004', verbose=False) 
assign(['i1500402','i1600402'], based_on='var_name', data_source = 'GR1001', verbose=False) 
# Note GR1001 H11-1 varaible label is wrong (should be GR1003)

# PREGNANCY ----------------------------------------------------------
assign(['FISHEPICCLASSIFICATION','MATERNALNUTRITION','MOTHERDIETARYPATTERNS','MOTHERDietScore_pregnancy', 'MOTHERDIET_rMEDandrMED2_N6400'],
       based_on='orig_file', verbose=False,
       data_source = 'GR1002',
       subject = 'mother',
       questionnaire = 'Food Frequency Questionnaire (FFQ)', 
       questionnaire_ref = 'https://doi.org/10.1038/sj.ejcn.1600611', 
       constructs = 'diet; nutrition');

# Medication
d = {'TOT':['GR1001-03-05','12-30 w',' ',' '],
     'Q01':['GR1001','12-20 w','E','3'],
     'Q02':['GR1003','20-25 w','B','7'],
     'Q03':['GR1005','30 w',   'C','7']}
for var in d.keys():
    assign(var, based_on='var_name', and_also=('orig_file','MEDICATIONSELFREPORTPREGNANCY'),
           data_source = d[var][0], timepoint = d[var][1], verbose=False,
           reporter = 'mother', subject = 'mother', gr_section = d[var][2], gr_qnumber = d[var][3], 
           var_comp = 'score', questionnaire = ' ', questionnaire_ref = ' ', 
           constructs = 'medication; pharmacy')
# Caffeine intake 
d = {'caf$':['GR1001-03-05','12-30 w',' ',' '],
     'caf1':['GR1001','12-20 w','F','2'],
     'caf2':['GR1003','20-25 w','B','1'],
     'caf3':['GR1005','30 w',   'C','1']}
for var in d.keys():
    assign(var, based_on='var_name', and_also=('orig_file','MATERNALCAFFEINEINTAKE'),
           data_source = d[var][0], timepoint = d[var][1], verbose=False,
           reporter = 'mother', subject = 'mother', gr_section = d[var][2], gr_qnumber = d[var][3], 
           var_comp = 'score', questionnaire = ' ', questionnaire_ref = ' ', 
           constructs = 'coffee; caffeine')
# Smoking 
d = {'smk1':['GR1001','12-20 w','F','8'],
     'smk2':['GR1003','20-25 w','B','4'],
     'smk3':['GR1005','30 w',   'C','4'],
     'SMOKE|postnatal':['GR1001-03-05','12-30 w',' ',' ']}   
for var in d.keys():
    assign(var, based_on='var_name', and_also=('orig_file','MATERNALSMOKING'),
       data_source = d[var][0], timepoint = d[var][1], verbose=False,
       reporter = 'mother', subject = 'mother', gr_section = d[var][2], gr_qnumber = d[var][3], 
       var_comp = 'score', questionnaire = ' ', questionnaire_ref = ' ', 
       constructs = 'smoking; tabacco; cigarettes')
# Alcohol 
d = {'alc1$|alc1_':      ['GR1001','12-20 w','F','6'],
     'alc2$|alc2_|2trim':['GR1003','20-25 w','B','3'],
     'alc3':             ['GR1005','30 w',   'C','3'],
     'mdrink|drimis|alc0|alc12|alc13|alc23':['GR1001-03-05','12-30 w',' ',' ']}   
for var in d.keys():
    assign(var, based_on='var_name', verbose=False,
       data_source = d[var][0], timepoint = d[var][1], 
       reporter = 'mother', subject = 'mother', gr_section = d[var][2], gr_qnumber = d[var][3], 
       var_comp = 'score', questionnaire = ' ', questionnaire_ref = ' ', 
       constructs = 'alcohol; drinking')

# HOME INTERVIEW 30w -------------------------------------------------
assign('expressed emotions Focus 30 weken', based_on=b, verbose=False,
       data_source = 'home-interview',
       timepoint = '30 w',
       reporter = 'mother', 
       subject = 'mother',
       gr_section = ' ', 
       gr_qnumber = ' ', 
       var_comp = 'score')
       # questionnaire = '###TODO###', 
       # questionnaire_ref = '###TODO###', 
       # constructs = '###TODO###'
# but some variables refer to the partner (they end with P)
assign('P', based_on='var_name', sel_type='ends', verbose=False,
       and_also=(b, 'expressed emotions Focus 30 weken'),
       reporter = 'father', 
       subject = 'father')

assign('Focus Cidi - consent fase 2', based_on=b, verbose=False,
       data_source = 'home-interview',
       timepoint = '30 w', 
       reporter = 'mother', 
       subject = 'mother', 
       gr_section = ' ', 
       gr_qnumber = ' ', 
       questionnaire = 'Composite International Diagnostic Interview (CIDI)', 
       questionnaire_ref = 'https://doi.org/10.1080/14616734.2012.636659', 
       constructs = 'Psychopathology')
# but some variables refer to the partner (they start with P)
assign('P', based_on='var_name', sel_type='starts', verbose=False,
       and_also=(b, 'Focus Cidi - consent fase 2'),
       reporter = 'father', 
       subject = 'father')

# Folic acid, perconception
assign('MATERNALFOLICACID', based_on='orig_file', verbose=False,
       data_source='GR1001-19', timepoint='12 w - 2 m', reporter='mother', subject='mother',
       gr_section = ' ', gr_qnumber = ' ', var_comp='score', questionnaire = ' ', questionnaire_ref = ' ', 
       constructs = 'folic acid')

# 2 m -----------------------------------------------------------------
assign('Dur_bedbringing_D9_1018', based_on='var_name', verbose=False,
       data_source = 'GR1018', gr_section = 'D', gr_qnumber = '9');

# Breastfeeding
assign('CHILDBREASTFEEDING_', based_on='orig_file', verbose=False, data_source = 'GR1025')
assign('CHILDBREASTFEEDING_', based_on='orig_file', and_also=('var_name','2M$'), verbose=False, 
       data_source = 'GR1018') # exeption

# 1 y -----------------------------------------------------------------
assign(['CHILDNUTRITIONONEYEAR', 'CHILDDietScore_1y'], based_on='orig_file', verbose=False, # FFQ
       data_source ='GR1060',
       subject = 'child',
       questionnaire = 'Food Frequency Questionnaire (FFQ)', 
       questionnaire_ref = 'https://doi.org/10.3945/jn.114.199349', 
       constructs = 'diet; nutrition')

# 2 y -----------------------------------------------------------------
assign('Parca_en_LDS_dataset_intern_gebruik', based_on='orig_file', verbose=False,
       data_source ='GR1062',
       subject='child'); #### quest: PARCA (cognition) and LDS (language) (2.5 y)

assign(['CHILDNUTRITIONTWOYEARS','CHILDDietScore_2y', 'FFQ_2y'], based_on='orig_file', verbose=False,
       data_source ='GR1064', # GR1064 not on overview word document
       subject='child',
       questionnaire = 'Food Frequency Questionnaire (FFQ)', 
       questionnaire_ref = 'https://doi.org/10.3945/jn.114.199349', 
       constructs = 'diet; nutrition')

assign('SLEEPINGCHILD2YEARS', based_on='orig_file',  verbose=False,
       data_source = 'GR1032') # check and fix
assign('HrsSleepNight_24', based_on='var_name', verbose=False,
       data_source = 'GR1032', gr_section = 'F', gr_qnumber = '3')
# assign('Dur_bedbringing_F83_1030', 'var_name') 1030????

# 3 y -----------------------------------------------------------------
# GR1065/66 # CBCL
assign(r'_36m$', based_on='var_name', data_source = 'GR1065', verbose=False) # ends with "_36m"
assign(['mother or father', 'mother and partner'], based_on='var_label', verbose=False,
      data_source = 'GR1065-66',
      timepoint = '3 y',
      reporter = 'mother & father', 
      var_comp = 'score')

assign('_1065', based_on='var_name', data_source = 'GR1065', verbose=False) # SLEEPINGCHILD0-3YEARS

# 4 y -----------------------------------------------------------------
assign('BRIEF - 48 months', based_on='orig_file', verbose=False,
        data_source ='GR1067',
        subject = 'child',
        questionnaire = 'Brief Rating Inventory of Executive Function-Preschool Version (BRIEF-P)', 
        questionnaire_ref = 'https://doi.org/10.1080/09297041003679344', 
        constructs = 'cognition; executive function'); 

# 5 y -----------------------------------------------------------------
assign(['PARENTEMPLOYMENT5', 'CHILDCBCL5', 'CHILDCBCL_6_incl_Tscores'], based_on='orig_file',
        data_source ='GR1075', verbose=False,)

# 6 y -----------------------------------------------------------------
assign(['trf','Schoolyear', 'Class_clean_final','Class_corrected', 'location_cluster', 'class_cluster',
       'teacher_cluster', 'school_cluster', 'Withdrawn__Depressed_TScore','Social_Problems_TScore', 
        'Thought_Problems_TScore','Rule_Breaking_Behavior_TScore', 'Somatic_Problems_TScore',
       'Conduct_Problems_TScore','Hyperactivity_Impulsivity_Problems_Subscales_Percentile',
       'Sluggish_Cognitive_Tempo_TScore','Obsessive_Compulsive_Problems_TScore'], based_on='var_name', verbose=False,
       data_source = 'GR1078-79', timepoint = '6 y', reporter='teacher', subject='child')
# GR1079/78?? # Questionnaire Community Health Service 1 (5.5 years) to be filled in by TEACHER, grade 2 of preschool
assign(r'[A-Za-z]\d{5}78', based_on='var_label', data_source = 'GR1078', verbose=False,
       timepoint = '6 y', 
       reporter = ' ', 
       subject = 'child', 
       questionnaire = 'Snijders-Oomen Niet-verbale intelligentie test 2.5-7 jaar revisie (SON-R 2.5-7)', 
       questionnaire_ref = 'http://www.testresearch.nl/sonr/sonr257manual.pdf', 
       constructs = 'cognition')

assign('CHILDDISC_IV_YC', based_on=b, verbose=False,
        data_source = 'interview',
        timepoint = '6 y',
        reporter = 'child', 
        subject = 'child', 
        gr_section = ' ', 
        gr_qnumber = ' ', 
        constructs = 'Psychopathology',
        questionnaire = 'Diagnostic Interview Schedule-Young Child version (DISC-YC)',
        questionnaire_ref = 'https://www.cdc.gov/nchs/data/nhanes/limited_access/interviewer_manual.pdf')

assign('SCQ-interviews', based_on=b, verbose=False,
        data_source = 'interview',
        timepoint = '6 y',
        reporter = 'mother', 
        subject = 'child', 
        gr_section = ' ', 
        gr_qnumber = ' ', 
        constructs = 'Autism spectrum disorder (ASD)',
        questionnaire = 'Social Communication Questionnaire (SDQ)',
        questionnaire_ref = 'https://www.nji.nl/instrumenten/social-communication-questionnaire-scq')

# 8 y -----------------------------------------------------------------
assign(['CHILDNUTRITION_', 'CHILDNUTRITIONFOODGROUPS', 'CHILDNUTRITION8', 'CHILDDietScore_8y'], 
       based_on='orig_file', verbose=False, # FFQ
       data_source ='GR1080',
       subject='child',
       questionnaire = 'Food Frequency Questionnaire (FFQ)', 
       questionnaire_ref = 'https://doi.org/10.1007/s00394-018-1651-z', 
       constructs = 'diet; nutrition'); 

# 9 y -----------------------------------------------------------------
# GR1081/83 # BSI
assign(r'm$', based_on='var_name', and_also=('orig_file','GR1081-GR1083'), data_source = 'GR1081', verbose=False) # ends with "m"
assign(r'p$', based_on='var_name', and_also=('orig_file','GR1081-GR1083'), data_source = 'GR1083', verbose=False) # ends with "p"
# GR1081/83 # CBCL
assign(r'9m$', based_on='var_name', and_also=('orig_file','CHILDCBCL9'), data_source = 'GR1081', verbose=False)
assign(r'9p$', based_on='var_name', and_also=('orig_file','CHILDCBCL9'), data_source = 'GR1083', verbose=False)
# ????
assign('GR1086', based_on='var_label', data_source = 'GR1086', verbose=False)

# 13 y ----------------------------------------------------------------
assign('CHILDRISKBEHAVIOUR13', based_on='orig_file', 
       data_source = 'interview',
        timepoint = '13 y',
        reporter = 'child', 
        subject = 'child', 
        gr_section = ' ', 
        gr_qnumber = ' ', 
        constructs = 'substance use, tobacco, alcohol, cannabis; rule-breaking; sexual')

# COVID ---------------------------------------------------------------
assign('Covid', based_on=b, verbose=False,
        data_source = 'COVID',
        timepoint = '17 y',
        reporter = 'child', 
        subject = 'child');

# ASSIGN SECTIONS ---------------------------------------------------------------
# identify the variable names espressed in standard way and assign Section letter
assign(r'\b[A-Za-z]\d{7,7}', based_on='var_name', verbose=False,
       gr_section = list(q.var_name[q.var_name.str.contains(r'\b[A-Za-z]\d{7,7}')].str[0].str.upper()),
       var_comp = 'item');


28 rows selected.


In [13]:
assign('AgeMfirstsexcontact', based_on='var_name', verbose=False,
          gr_section='D',
          gr_qnumber='4',
          var_comp='item')

assign(['drbef1m','drbef2m','drbef3m','drbef4m','drbef5m','drbef6m', # yes or no (see other var for frequency)
        'draft1m','draft2m','draft3m','draft4m','draft5m','draft6m'], based_on='var_name', verbose=False,
          gr_section='F',
          gr_qnumber=['11.1.1', '11.1.2', '11.1.3', '11.1.4', '11.1.5', '11.1.6',
                      '12.1.1', '12.1.2', '12.1.3', '12.1.4', '12.1.5', '12.1.6'],
          var_comp='item')

assign(['drfath1m','drfath2m','drfath3m','drfath4m','drfath5m','drfath6m', 
        'g0602_1','g0602_2','g0602_3','g0602_4','g0602_5','g0602_6'], based_on='var_name', verbose=False,
          gr_section='G',
          gr_qnumber=['3.1.1', '3.1.2', '3.1.3', '3.1.4', '3.1.5', '3.1.6', 
                       '6.1.1', '6.1.2', '6.1.3', '6.1.4', '6.1.5', '6.1.6'],
          var_comp='item')

assign('int_qsm', based_on='var_name', verbose=False,
          gr_section='F', # interval in years between quiting smoking and lmp (use mat smoking)
          gr_qnumber='7.2-7.3',
          var_comp='score')

assign(['lft_nl','lft_nl15','generama'], based_on='var_name', verbose=False,
          gr_section='J', # age move to the NL, before 15, generaiton
          gr_qnumber='5-6',
          var_comp='score')

assign(['samen_1$','samen_2$','samen_3$','samen_4$','samen_5$','samen_6$','samen_7$','samen_8$','samen_9$'], 
         based_on='var_name', verbose=False,
         gr_section = 'J',
         gr_qnumber = ['17.1', '17.2', '17.3', '17.4', '17.5', '17.6', '17.7', '17.8', '17.9'],
         var_comp = 'item');
    
assign('GR1001', based_on='data_source', subject='mother', verbose=False);
assign('GR1001', based_on='data_source', and_also=('gr_section','G'), subject='father',verbose=False);

In [14]:
# Simple general subject tag
assign(['MATERNAL', 'MOTHER'], based_on=b, subject = 'mother', verbose=False)
assign('CHILD', based_on=b, subject = 'child', verbose=False)

assign('mother', based_on='var_name', subject = 'mother', verbose=False)
assign('father', based_on='var_name', subject = 'father', verbose=False)

# CBCL 3 y ----------------------------------------------
assign('CBCL_3_incl_Tscores__GR1065E2_GR1066A1_20201111', based_on=b, verbose=False,
      subject = 'child');

In [15]:
'''
assign(_, based_on = 'orig_file', case_sensy=False,
       and_also = None, 
       data_source = ' ', 
       timepoint = ' ', 
       reporter = ' ', 
       var_label = ' ', 
       subject = ' ', 
       gr_section = ' ', 
       gr_qnumber = ' ', 
       var_comp = ' ', 
       questionnaire = ' ', 
       questionnaire_ref = ' ', 
       constructs = ' ',
       focus_cohort = 'no',
       verbose=True)
'''

"\nassign(_, based_on = 'orig_file', case_sensy=False,\n       and_also = None, \n       data_source = ' ', \n       timepoint = ' ', \n       reporter = ' ', \n       var_label = ' ', \n       subject = ' ', \n       gr_section = ' ', \n       gr_qnumber = ' ', \n       var_comp = ' ', \n       questionnaire = ' ', \n       questionnaire_ref = ' ', \n       constructs = ' ',\n       focus_cohort = 'no',\n       verbose=True)\n"

In [16]:
lefts = q.loc[q.data_source.isna(), 'orig_file']
print(len(lefts), 'rows left with unspecified data source,', round((len(lefts)/len(q))*100, 2), '% \n')
left = lefts.unique() # 23% of the values are still not assigned
# out.to_csv('aa.csv')
for it in left:
    print(len(q.loc[(q.orig_file== it) &(q.data_source.isna()),]), '\t', it)


90 rows left with unspecified data source, 0.63 % 

2 	 20210302_beroep partner_SBC92
40 	 ASTHMA-RESPIRATORY INFECTIONS-ATOPY_22112016
4 	 CHILDADHDMEDICATION5_17112016
2 	 CHILDADHDMEDICATION9_04042017; CHILDADHDMEDICATION9_17112016; CHILDBehavioralHealthcareUse5-9y_20122018
1 	 CHILDADHDMEDICATION9_04042017; CHILDADHDMEDICATION9_17112016
7 	 CHILDADHDMEDICATION9_04042017
1 	 CHILDADHDMEDICATION9_17112016
9 	 CHILDASTHMA-RTI-ATOPY5_12112012
2 	 CHILDBehavioralHealthcareUse5-9y_20122018; CHILDTRF_incl_Tscores_20201111
1 	 CHILDBehavioralHealthcareUse5-9y_20122018
2 	 MATERNALCOMPLICATIONS_22112016; MATERNALPE_12122019
5 	 MATERNALCOMPLICATIONS_22112016
7 	 MEDICATIONPREGNANCY_BENZIODIAZEPINES_30112017
7 	 PRESCHOOLWHEEZINGPATTERNS_22112016


In [17]:
for var in q.columns:
    left = q.loc[q[var].isna(),'orig_file']
    print(len(left), '\trows left with unspecified', var,'\t', round((len(left)/len(q))*100, 1), '%')


0 	rows left with unspecified var_name 	 0.0 %
2225 	rows left with unspecified var_label 	 15.5 %
108 	rows left with unspecified timepoint 	 0.8 %
7566 	rows left with unspecified subject 	 52.8 %
0 	rows left with unspecified n_observed 	 0.0 %
90 	rows left with unspecified data_source 	 0.6 %
7026 	rows left with unspecified gr_section 	 49.0 %
12655 	rows left with unspecified gr_qnumber 	 88.3 %
108 	rows left with unspecified reporter 	 0.8 %
8301 	rows left with unspecified var_comp 	 57.9 %
11899 	rows left with unspecified questionnaire 	 83.0 %
11899 	rows left with unspecified questionnaire_ref 	 83.0 %
11872 	rows left with unspecified constructs 	 82.8 %
0 	rows left with unspecified var_type 	 0.0 %
0 	rows left with unspecified orig_file 	 0.0 %
0 	rows left with unspecified n_total 	 0.0 %
0 	rows left with unspecified n_missing 	 0.0 %
0 	rows left with unspecified descriptives 	 0.0 %


In [18]:
print(q.var_type.unique())
print(q.timepoint.unique())
print(q.subject.unique())
print(q.data_source.unique())
print(q.gr_section.unique())
print(q.reporter.unique())
print(q.var_comp.unique())

['numeric' 'factor' 'character']
[' ' '30 w' '2 m' '9 y' '1.5 y' nan '4 y' '3 y' '17 y' '5 y' '6 y' '1 y'
 '6 m' '2 y' '8 y' '13 y' '15-23 w' '12-30 w' '12-20 w' '20-25 w' '2.5 y'
 '12 w - 2 m']
['child' 'mother' 'father' nan]
[' ' 'home-interview' 'GR1019' 'GR1081' 'GR1029' nan 'GR1067' 'GR1065'
 'GR1066' 'GR1065-66' 'COVID' 'GR1076' 'GR1078' 'GR1086' 'GR1060' 'GR1025'
 'GR1018' 'GR1075' 'GR1083' 'GR1064' 'GR1080' 'interview' 'GR1078-79'
 'GR1079' 'GR1002' 'GR1001-03-05' 'GR1001' 'GR1003' 'GR1005' 'GR1062'
 'GR1004' 'GR1024' 'GR1028' 'GR1032' 'GR1082' 'GR1084' 'GR1093' 'GR1096'
 'GR1094' 'GR1095' 'GR1097' 'GR1001-19']
[' ' nan 'E' 'A' 'F' 'G' 'C' 'X' 'B' 'D' 'H' 'I' 'J' 'K']
[' ' 'mother' 'father' nan 'mother & father' 'child' 'teacher']
['mother' 'score' 'child' nan 'item']


In [61]:
# ds = [ x for x in list(q.data_source.unique()) if isinstance(x, str) ]
# t = pd.DataFrame()
# for s in sorted(ds):
#     t = t.append([[s, len(q.loc[q.data_source==s,])]])
# t.to_csv('worksheet.csv')
len(q.loc[ q.data_source.isna(),])

90

In [19]:
# assign('respons', 'var_label') #, and_also=('gr_section', 'D'))

In [20]:
# set var_name as index 
# q = q.set_index('var_name')
# order by timepoint 
# q.timpoint = pd.Categorical(q.timepoint, categories=[' ','12-20 w','15-23 w','20-25 w','30 w','12-30 w','12 w - 2 m',
#                           '2 m','6 m','1 y','1.5 y','2 y','2.5 y','3 y','4 y','5 y','6 y','8 y','9 y','13 y','17 y'])
# q = q.sort_values('timepoint')

In [21]:
# ASSIGN NUMBER
# def assign_qnumber(gr, section, numbers):
#     q.loc[(q['data_source'] == gr)&(q['gr_section'] == section), 'gr_qnumber'] = numbers
    
# # GR1001-A the numbers are wrong compared to pdfs, adust:
# assign_qnumber('GR1001', 'A', # [int(x)-1 for x in o.index.str[1:3]]
#                ['3a','4','4.1','4.2','5','5.1','5.1.1','5.1.2','5.1.3','5.1.4','7','7.1','7.2','8','9'])
# # GR1001-B
# assign_qnumber('GR1001', 'B', 
#                ['4','5.1','5.2','6','7','8','9.1','9.2','9.3','9.4','9.5','9.6','9.7','9.8'])
# # GR1001-C
# assign_qnumber('GR1001', 'C', 
#  ['2.1', '2.2', '2.3', '2.4', '2.5', '3','4.1','4.2','5.1','5.2','6.1', '6.2', '7','8.1', '8.2', '8.3', '8.4', '8.5', '8.6','9',
#  '10.01', '10.02', '10.03', '10.04', '10.05', '10.06', '10.07', '10.08', '10.09', '10.10', 
#  '10.11', '10.12', '10.13', '10.14', '10.15', '10.16', '10.17', '10.18', '10.19', '10.20',
#  '11.01.1', '11.01.2','11.02.1', '11.02.2','11.03.1', '11.03.2','11.04.1', '11.04.2','11.05.1', '11.05.2',
#  '11.06.1', '11.06.2','11.07.1', '11.07.2','11.08.1', '11.08.2','11.09.1', '11.09.2','11.10.1', '11.10.2',
#  '11.11.1', '11.11.2','11.12.1', '11.12.2','11.13.1', '11.13.2','11.14.1', '11.14.2','11.15.1', '11.15.2',
#  '11.16.1', '11.16.2','11.17.1', '11.17.2','11.18.1', '11.18.2','11.19.1', '11.19.2', '12','12.1',
#  '13.01', '13.02', '13.03', '13.04', '13.05', '13.06', '13.07', '13.08', '13.09', '13.10'])
# # GR1001-D
# assign_qnumber('GR1001', 'D', 
#     ['1', '2', '3.1','3.2', '4', '5','5.1', '6','6.1', '7','7.1', '8', '9', '10', '11','11.1','11.2','11.3',
#     '12','12.1', '13','13.1','13.2', '14','14.1', '15', '15.1','16', '16.1', '17', '17.1','18', '18.1',
#     '19', '19.1','20', '20.1','21', '21.1','22', '22.1','23', '23.1','24', '24.1','25', '25.1','26', '26.1',
#     '27', '27.1','28', '28.1','29', '29.1','30', '30.1','31', '31.1',
#     '32', '32.2', '32.2', '32.2', '32.2', '32.2', '32.3', '32.4', '33', '34', '35', '36', '36.1', '36.2', 
#     '37', '37.1', '37.2'])
# # GR1001-E (characters recoded in medicine file ?)
# assign_qnumber('GR1001', 'E', ['2', '3'])
# # GR1001-F
# assign_qnumber('GR1001', 'F', ['1', '1.2', '1.1', # wrong variable name (05 not 01) so inverted, same for 2, 3
#  '2', '2.2', '2.1', '3', '3.2', '3.1', '4', '4.2', '4.1', # use maternal caffeine intake
#  '5', '5.1', '5.2', '5.3', '6', '6.1', '6.2', '6.3', '6.3.1', # use maternal alcohol consumption
#  '7', '7.1', '7.3', '8', '8.1', # use maternal smoking
#  '9', '10', '11', '11.1', '11.1.1', '11.1.2', '11.1.3', '11.1.4', '11.1.5', '11.1.6', 
#  '12', '12.1', '12.1.1', '12.1.2', '12.1.3', '12.1.4', '12.1.5', '12.1.6'])
# # GR1001-G (partner)
# assign_qnumber('GR1001', 'G',
#  ['1', '1.1', '2', '2.1', '3', '3.1', '3.1.1', '3.1.2', '3.1.3', '3.1.4', '3.1.5', '3.1.6', 
#   '4', '5', '5.1', '6', '6.1', '7', '7.1'])
# # GR1001-H
# assign_qnumber('GR1001', 'H', ['1', '2', '3', '4', '5', '6', 
#  '7.01','7.02','7.03','7.04','7.05','7.06','7.07','7.08','7.09','7.10','7.11','7.12','7.13',
#  '8.1','8.2','8.3','8.4'])
# # GR1001-I
# assign_qnumber('GR1001', 'I', ['3', '3.1', '3.1.1', '3.1.2', '3.1.3', '3.1.4', '3.1.5', 
#                                '4', '4.1', '4.1.1', '4.1.2', '4.1.3', '4.1.4', '4.1.5', 
#                                '5', '5.1', '5.1.1', '5.1.2', '5.1.3', '5.1.4', '5.1.5',
#                                '6', '6.1', '6.1.1', '6.1.2', '6.1.3', '6.1.4', '6.1.5', 
#                                '7', '7.1', '7.1.1', '7.1.2', '7.1.3', '7.1.4', '7.1.5',
#                                '8', '8.1', '8.1.1', '8.1.2', '8.1.3', '8.1.4', '8.1.5',
#                                '9', '9.1', '9.1.1', '9.1.2', '9.1.3', '9.1.4', '9.1.5', 
#                                '10','10.1','10.1.1','10.1.2','10.1.3','10.1.4','10.1.5', 
#                                '11','11.1','11.1.1','11.1.2','11.1.3','11.1.4','11.1.5', 
#                                '12','12.1','12.1.1','12.1.2','12.1.3','12.1.4','12.1.5',
#                                '13','13.1','13.1.1','13.1.2','13.1.3','13.1.4','13.1.5',
#                                '15','15.1','15.1.1','15.1.2', 
#                                '16','16.1','16.1.1','16.1.2'])
# # GR1001-J
# assign_qnumber('GR1001', 'J', 
#                ['7','8','9.1','9.2','9.3','10.1','10.2','11.1','11.2','11.3','15', '17'])

In [22]:
# def assign_reference(gr, section, name, doi, between=(), qnumbers=()):
#     if len(between)>1:
#         q.loc[(q['data_source'] == gr)&(q['gr_section'] == section)&(q['gr_qnumber'].between(between[0],between[1])), 
#           ['questionnaire', 'questionnaire_ref']] = [name, doi]
#     else:
#         q.loc[(q['data_source'] == gr)&(q['gr_section'] == section)&(q['gr_qnumber'].isin(qnumbers)), 
#           ['questionnaire', 'questionnaire_ref']] = [name, doi]

# assign_reference('GR1001', 'C', between=('2.1','2.5'), name='EuroQol health-related quality of life',
#                  doi='https://doi.org/10.1016/0168-8510(90)90421-9')
# assign_reference('GR1001', 'C', qnumbers=('3','4.1','4.2','5.1','5.2','6.1','6.2','7','9'), 
#                  name='12-Item Short-Form Health Survey',
#                  doi='https://doi.org/10.1097/00005650-199603000-00003')
# assign_reference('GR1001', 'C', between=('8.1','8.6'), name='36-Item Short-Form Health Survey (SF-36)',
#                  doi='https://doi.org/10.1080/00207411.1994.11449283')
# assign_reference('GR1001', 'H', between=('1','6'), name='Neonatal Perception Inventory (NPI)',
#                  doi='https://doi.org/10.1007/BF01434585')
# assign_reference('GR1001', 'H', between=('7.01','7.13'), name='Pregnancy Outcome Questionnaire (POQ)',
#                  doi='https://doi.org/10.1097/00004583-198805000-00004')


In [23]:
# def assign_section_title(gr, section, name):
#     q.loc[(q['data_source'] == gr)&(q['gr_section'] == section), 'constructs'] = name

# gr1001 = {'A':'THIS PREGNANCY', 'B':'PREVIOUS PREGNANCIES', 'C':'YOUR HEALTH IN THIS PREGNANCY', 
#           'D':'GENERAL HEALTH', 'E':'VITAMINS AND MEDICINES', 'F':'YOUR LIFESTYLE', 
#           'G':'LIFESTYLE OF THE BIOLOGICAL FATHER IN THE MONTHS PRECEDING THE PREGNANCY', 
#           'H':'EXPECTATIONS CONCERNING THE PREGNANCY', 'I':'MEDICAL HISTORY OF YOUR FAMILY', 
#           'J':'BACKGROUND INFORMATION'}

# for sect in gr1001.keys():
#     assign_section_title('GR1001', sect, gr1001[sect])


In [24]:
# # in the CBCL @3 file define scores and items 
# q.loc[(q['orig_file'] =='CBCL_3_incl_Tscores__GR1065E2_GR1066A1_20201111') & q.index.str.contains(r'[a-z]_36'), 
#       'var_comp'] = 'score'
# q.loc[(q['orig_file'] =='CBCL_3_incl_Tscores__GR1065E2_GR1066A1_20201111') & q.index.str.contains(r'[0-9]_36'), 
#       'var_comp'] = 'item'

# # in the BRIEF file define scores and items 
# q.loc[(q['orig_file'] =='BRIEF - 48 months_22112016') & q.index.str.contains('brief_'),
#       'var_comp'] = 'score'
# q.loc[(q['orig_file'] =='BRIEF - 48 months_22112016') & q.index.str.contains(r'brief[0-9]'),
#       'var_comp'] = 'item'

# # employment @5 and 13
# q.loc[q.index.str.contains('employstat'),'var_comp'] = 'score'

# # CBCL @5
# q.loc[(q['orig_file'] =='CHILDCBCL_6_incl_Tscores_20201111; CHILDCBCL5_17072015') & q.index.str.contains(r'[a-z]_5'),
#      'var_comp'] = 'score'
# q.loc[(q['orig_file'] =='CHILDCBCL_6_incl_Tscores_20201111; CHILDCBCL5_17072015') & q.index.str.contains(r'[0-9]_5'),
#      'var_comp'] = 'item'
# q.loc[q['orig_file'] =='CHILDCBCL_6_incl_Tscores_20201111', 'var_comp'] = 'score'

In [25]:
# print(list(GR_ids.keys()))
# n= '97'
# for i in q.index[q.index.str.contains(r'^[A-Za-z]\d{5}'+n), ]:
#     print(i)
# print(q.loc[q.index.str.contains(r'^[A-Za-z]\d{5}'+n), 'data_source'].unique())
# print(q.loc[q.index.str.contains(r'^[A-Za-z]\d{5}'+n), 'orig_file'].unique())

# pd.set_option('max_rows', None)
# pd.set_option('max_colwidth', 10)

# q.loc[q.index.str.contains(r'^[A-Za-z]\d{5}'+n) & q.data_source.isna(),]
# q.loc[q.index.str.contains(r'^[A-Za-z]\d{5}'+n) & q.orig_file.str.contains('GR1066'),]
# q.loc[(q['orig_file'] =='GR1082_C1-5_22092016'),]

In [27]:
# len(q.loc[ q.data_source.isna(),])/len(q)

In [28]:
# q.data_source.value_counts().sort_index()
# q.loc[ q.n_observed >10, ]

In [29]:
# q.loc[ ~(q.data_source.isna())  & q.gr_section.isna(),]# & q.timepoint.isna(),]

In [30]:
q.to_csv('quest_meta.csv')

# q.to_csv('/Users/Serena/Desktop/GENR-search-engine/GenR-metadata-app/metadata-shiny-app/data/quest_meta.csv')

In [31]:
# t = q.loc[q['data_source'] == 'GR1003']
# t.to_csv('check.csv')