# SkimLit project

### First, change text data into readable table data

In [35]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf

In [70]:
# Const variables

# Dataset directories (relative to the notebook's working directory).
DIR_200K = "dataset/pubmed-rct/PubMed_200k_RCT"
DIR_200K_replaced_nums = "dataset/pubmed-rct/PubMed_200k_RCT_numbers_replaced_with_at_sign"
DIR_20K = "dataset/pubmed-rct/PubMed_20k_RCT"
# Fixed: this previously pointed at the 200k directory (copy-paste slip).
DIR_20K_replaced_nums = "dataset/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign"

# File names looked up inside a dataset directory.
DEV_TXT = 'dev.txt'
TEST_TXT = 'test.txt'
TRAIN_TXT = 'train.txt'
TRAIN_ZIP = 'train.zip'

In [37]:
# Load the raw dev split of the 20k dataset and preview its first 20 lines.
dev_path = os.path.join(DIR_20K, DEV_TXT)
with open(dev_path, 'r') as f:
    text = f.readlines()
text[:20]

['###24290286\n',
 'BACKGROUND\tIgE sensitization to Aspergillus fumigatus and a positive sputum fungal culture result are common in patients with refractory asthma .\n',
 'BACKGROUND\tIt is not clear whether these patients would benefit from antifungal treatment .\n',
 'OBJECTIVE\tWe sought to determine whether a 3-month course of voriconazole improved asthma-related outcomes in patients with asthma who are IgE sensitized to A fumigatus .\n',
 'METHODS\tAsthmatic patients who were IgE sensitized to A fumigatus with a history of at least 2 severe exacerbations in the previous 12 months were treated for 3 months with 200 mg of voriconazole twice daily , followed by observation for 9 months , in a double-blind , placebo-controlled , randomized design .\n',
 'METHODS\tPrimary outcomes were improvement in quality of life at the end of the treatment period and a reduction in the number of severe exacerbations over the 12 months of the study .\n',
 'RESULTS\tSixty-five patients were randomiz

Reading the lines from a dataset text file (here, the dev split) results in a list of strings containing the different abstract samples: the sentences in each sample, along with the role each sentence plays in the abstract.

The role of each sentence is prefixed at the start of each line separated by a tab (\t) and each sentence finishes with a new line (\n).

Different abstracts are separated by abstract ID's (lines beginning with ###) and newlines (\n).

Knowing this, it looks like we've got a couple of steps to do to get our samples ready to pass as training data to our future machine learning model.

Let's write a function to perform the following steps:

* Take a target file of abstract samples.
* Read the lines in the target file.
* For each line in the target file:
    * If the line begins with ### mark it as an abstract ID and the beginning of a new abstract.
        * Keep count of the number of lines in a sample.
    * If the line consists only of a newline (`\n`), mark it as the end of an abstract sample.
        * Keep count of the total lines in a sample.
    * Record the text before the \t as the label of the line.
    * Record the text after the \t as the text of the line.
* Return all of the lines in the target text file as a list of dictionaries containing the key/value pairs:
    * `"ID"` - the ID of the abstract the line belongs to (e.g. `24290286`).
    * `"line_number"` - the position of the line in the abstract (e.g. `3`).
    * `"discourse_type"` - the role of the line in the abstract (e.g. `OBJECTIVE`).
    * `"discourse_text"` - the text of the line in the abstract.
    * `"total_lines"` - the total lines in an abstract sample (e.g. `14`).
* Abstract ID lines and blank lines should be omitted from the returned preprocessed data.

The resulting dataset should look like this:

| ID     | line_number | discourse_type | discourse_text | total_lines |
| :--- | :--- | :--- | :--- | :--- |
| 24290286 | 0 | BACKGROUND | IgE sensitization to Aspergillus fumigatus and a positive sputum fungal culture result are common in patients with refractory asthma . | 10 |


In [38]:
# Markers of the raw text format: abstract-ID prefix, line terminator and
# label/sentence separator.
ID_STR, RETURN, TAB = "###", "\n", "\t"

# Keys of the per-sentence records (they become the CSV column names).
LABEL_ID = "ID"                       # abstract ID
LABEL_LINE_NUM = "line_number"        # position of the sentence in its abstract
LABEL_TYPE = "discourse_type"         # role of the sentence
LABEL_TEXT = "discourse_text"         # the sentence itself
LABEL_TOTAL_LINE_NUM = "total_lines"  # number of sentences in the abstract
LABEL_DATA = "data"                   # NOTE(review): not used in the cells shown here

In [58]:
def create_dataset(text_data):
    """Convert raw abstract lines into a flat list of per-sentence records.

    A line starting with ``ID_STR`` opens a new abstract and carries its ID,
    a blank line closes the current abstract, and every other line is expected
    to look like ``<label><TAB><sentence>``. ID lines and blank lines are not
    part of the returned data.

    Args:
        text_data: Iterable of raw lines, e.g. the result of ``f.readlines()``.

    Returns:
        A list with one dict per sentence, holding the keys ``LABEL_ID``,
        ``LABEL_LINE_NUM`` (0-based position inside the abstract),
        ``LABEL_TYPE``, ``LABEL_TEXT`` and ``LABEL_TOTAL_LINE_NUM`` (number of
        sentences in the abstract). The ID is the string ``"Nan"`` when a
        sentence is not preceded by an ID line.

    Raises:
        IndexError: If a sentence line contains no ``TAB`` separator.
    """
    dataset = []        # return value
    abstract_rows = []  # rows of the abstract currently being read
    sentence_id = ''    # ID of the abstract currently being read

    def close_abstract():
        # The total is only known once the whole abstract has been read, so it
        # is stamped onto every row at the end.
        total = len(abstract_rows)
        for row in abstract_rows:
            row[LABEL_TOTAL_LINE_NUM] = total
        dataset.extend(abstract_rows)
        abstract_rows.clear()

    for line in text_data:
        if line.startswith(ID_STR):
            # "###<id>" marks the start of an abstract.
            sentence_id = line[len(ID_STR):].rstrip(RETURN)
        elif not line.strip():
            # A blank line marks the end of an abstract.
            close_abstract()
            # Forget the ID so a following abstract without an ID line falls
            # back to "Nan" instead of inheriting this one.
            sentence_id = ''
        else:
            # Split on the first TAB only, so tabs inside the sentence survive.
            parts = line.rstrip(RETURN).split(TAB, 1)
            abstract_rows.append({
                LABEL_ID: sentence_id if sentence_id != '' else "Nan",
                LABEL_LINE_NUM: len(abstract_rows),
                LABEL_TYPE: parts[0],
                LABEL_TEXT: parts[1],
            })

    # The last abstract may not be followed by a blank line.
    close_abstract()

    return dataset

In [61]:
# Create the output directory for the generated CSV files.
# exist_ok=True keeps this cell safe to re-run when the directory already exists.
os.makedirs('dataset/csv/', exist_ok=True)

In [72]:
%%time
# Convert each split of the 20k dataset into a table and save it as
# dataset/csv/<split>.csv.
for txt_file in (DEV_TXT, TEST_TXT, TRAIN_TXT):
    csv_name = txt_file.replace('.txt', '.csv')
    with open(os.path.join(DIR_20K, txt_file), 'r') as f:
        raw_lines = f.readlines()
    pd_data = pd.json_normalize(create_dataset(raw_lines))
    pd_data.to_csv('dataset/csv/' + csv_name)


Wall time: 5.77 s
