### Running instructions

1. Read the `README.md` file
2. Place the data files in the `data` directory
3. Select a Python kernel for this notebook
4. Run all cells
5. Wait ~10 min for full execution
6. Go to `pipeline_part_2`

In [None]:
# External modules
import os
import sys

# Local modules
sys.path.append(os.getcwd())
from python_utilities import *

In [None]:
'''
EXTRACT QUESTION DETAILS FROM KOTLIN FILE AS PYTHON DICTIONARY

Passes the Kotlin questionnaire file name and the target dictionary file name
to `extract_questionnaire_structure`.
'''

# NOTE(review): the helper presumably comes from the `python_utilities` star
# import; how it resolves these bare file names is not visible here.
input_filename, output_filename = "FemaleQuestionnaire.kt", "questionnaire_structure.dict"

extract_questionnaire_structure(input_filename, output_filename)

In [None]:
'''
BUILD DICTIONARY OF QUESTIONNAIRES

Passes the questionnaire result input name and the dictionary output file name
to `build_dictionary`.
'''

# NOTE(review): the input name has no extension; whether it is a file or a
# directory is decided inside `build_dictionary` (not visible here).
input_filename, output_filename = "questionnaire_result", "questionnaire_result.dict"

build_dictionary(input_filename, output_filename)

In [None]:
'''
CREATE PANDAS DATAFRAME FROM PYTHON DICTIONARY

Passes the dictionary file name written by the previous step and the DataFrame
output file name to `build_database`.
'''

# The input name matches the output name of the dictionary-building cell.
input_filename, output_filename = "questionnaire_result.dict", "questionnaire_result.df"

build_database(input_filename, output_filename)

In [None]:
'''
GENERATE FIGURES

Loads the pickled questionnaire DataFrame, splits it by the `diagnosis` column
and calls `generate_figure` once per question id.

NOTE: Execution time ~7 min (Wordclouds have an execution time of ~1 min each due to translation)
'''

questionnaire_result_filename = "questionnaire_result.df"
questionnaire_structure_filename = "questionnaire_structure.dict"

# Only the unpickling needs the file handle, so it is closed before the slow
# figure loop starts instead of staying open for the whole run.
# NOTE(review): `data_path` and `pickle` are presumably provided by the
# `python_utilities` star import. Unpickling can execute arbitrary code, so
# only load files produced by this pipeline.
with open(data_path + questionnaire_result_filename, mode='rb') as data_file:
    df = pickle.load(data_file)

# One subset per diagnosis value.
endo_df = df[df['diagnosis'] == "endometriosis"]
no_diag_df = df[df['diagnosis'] == "no_diagnosis"]
other_df = df[df['diagnosis'] == "other_diagnosis"]

subset_labels = ['No diagnosis', 'Endometriosis', 'Other diagnosis']

# Question ids 1-31 are contiguous; the remaining ids are listed explicitly.
questions = [str(number) for number in range(1, 32)]
questions += ['52', '53', '54', '55', '56', '101', '102', '180']

for question in questions:
    generate_figure(structure_filename = questionnaire_structure_filename,
                    no_diag_df = no_diag_df,
                    endo_df = endo_df,
                    other_df = other_df,
                    subset_labels = subset_labels,
                    question_id = question)

In [None]:
import sys

def asint(s):
    """Return a two-item sort key for a string that may hold an integer.

    A string accepted by ``int()`` yields ``(value, '')``. Any string that
    raises ``ValueError`` yields ``(sys.maxsize, s)``, so it ranks at
    ``sys.maxsize`` and ties are broken by the text itself.
    """
    try:
        number = int(s)
    except ValueError:
        # Not an integer literal: fall back to the sentinel rank.
        return sys.maxsize, s
    return number, ''

# Demo: list the dictionary's (key, value) pairs ordered by the `asint` key
# of each dictionary key.
a = {'100': '12', '6': '5', '88': '3', 'test': '34', '67': '7', '1': '64'}
sortedlist = sorted(a.items(), key=lambda entry: asint(entry[0]))
print(sortedlist)


In [None]:
'''
PEARSON'S CHI-SQUARED TEST OF HOMOGENEITY

Calls `chi_test` for the selected question ids, prints the result table as
markdown and exports the same table as a PNG image.
'''

questionnaire_result_filename = "questionnaire_result.df"
questionnaire_structure_filename = "questionnaire_structure.dict"

questions = [
    '2', '3', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15',
    '16', '23', '24', '25', '26', '27', '28', '29', '30', '31', '101', '102',
]

chi_df = chi_test(questionnaire_result_filename, questionnaire_structure_filename, questions)

# The printed table and the exported image show the same four columns.
report_columns = ['question_id', 'pvalue', 'H0', 'Result']

print(chi_df[report_columns].to_markdown(index=False))

# NOTE(review): `dfi` and `media_path` are presumably provided by the
# `python_utilities` star import.
dfi.export(
    chi_df[report_columns].style.hide(axis='index'),
    media_path + 'chi_table.png',
    table_conversion="matplotlib",
)

In [None]:
'''
PRE-PROCESS DATABASE FOR ANALYSIS IN R PIPELINE

Passes the DataFrame and structure file names, plus the two CSV file names,
to `pre_process_database`.

NOTE: Execution time ~1min 30s due to value remapping
'''

# The first two names match files written by earlier cells.
data_filename, structure_filename = "questionnaire_result.df", "questionnaire_structure.dict"
# NOTE(review): the CSV names are presumably outputs for the R pipeline -
# confirm in `pre_process_database`.
master_filename, food_filename = "questionnaire.csv", "food.csv"

pre_process_database(data_filename, structure_filename, master_filename, food_filename)