# Extract data downloaded from https://ncg-task.github.io/data.html 

In [41]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
import pandas as pd
import numpy as np
from transformers import TrainingArguments, Trainer
from datasets import load_metric
from torch.utils.data import Dataset
import torch
import os
from constants import BASE_DIR

In [47]:
def get_sentences_list(file_name):
    """Extract the contributing sentence numbers from a sentences.txt file.

    Each non-blank line of the file is parsed as one integer. Blank or
    whitespace-only lines (for example a trailing empty line) are skipped
    rather than raising ``ValueError``.

    Args:
        file_name: Path to the ``sentences.txt`` file of one sub-directory.

    Returns:
        A list of ints in the order they appear in the file.

    Raises:
        ValueError: If a non-blank line cannot be parsed as an integer.
    """
    with open(file_name, encoding='utf-8') as f:
        # int() tolerates surrounding whitespace but not an empty string,
        # so blank lines have to be filtered out explicitly.
        return [int(line) for line in f if line.strip()]

In [43]:
def get_document_sentence_target(file_name, sentence_list):
    """Get the sentences of a Stanza-format file and their target labels.

    A line counts as a sentence when it ends with a period (immediately
    before the line break, or at end of file). Sentences are numbered from 1
    in file order; a sentence's target is 1 when its number appears in
    ``sentence_list`` (contributing sentence) and 0 otherwise.

    NOTE(review): numbering counts only period-terminated lines, not raw file
    lines -- confirm this matches how sentences.txt indexes sentences.

    Args:
        file_name: Path to the Stanza output text file.
        sentence_list: Iterable of contributing sentence numbers (ints).

    Returns:
        A tuple ``(sentences, sentences_target)`` of two equally long lists:
        the stripped sentence texts and their 0/1 targets.
    """
    # A set gives O(1) membership tests instead of scanning the list per line.
    contributing = set(sentence_list)
    sentences = []
    sentences_target = []
    sentence_number = 0
    with open(file_name, encoding='utf-8') as f:
        for line in f:
            # Strip only the newline so a final sentence in a file without a
            # trailing newline is not silently dropped.
            if not line.rstrip('\n').endswith('.'):
                continue
            sentence_number += 1
            sentences.append(line.strip())
            sentences_target.append(1 if sentence_number in contributing else 0)

    return sentences, sentences_target


In [46]:
def get_document_abstract(file_name):
    """Return the abstract text of a Stanza-format file as a single string.

    Collection starts after a line whose stripped, lower-cased content is
    exactly 'abstract' and stops at the first line that does not end with a
    period once more than two lines have been collected. The collected lines
    are stripped and joined with single spaces. An empty string is returned
    when no 'abstract' line is present.
    """
    with open(file_name) as handle:
        raw_lines = handle.readlines()

    collected = []
    in_abstract = False
    for raw in raw_lines:
        # Any 'abstract' heading line switches collection on and is itself skipped.
        if raw.lower().strip() == 'abstract':
            in_abstract = True
            continue
        if not in_abstract:
            continue
        text = raw.strip()
        # The first three lines are always kept; after that a line without a
        # trailing period marks the end of the abstract.
        if len(collected) > 2 and not text.endswith('.'):
            break
        collected.append(text)
    return ' '.join(collected)


## Fetch data from the trial-data folder by looping through its sub-directories and convert it into a DataFrame

In [45]:
# Walk trial-data/<group>/<document>/ and build one DataFrame row per sentence,
# carrying the document number, the 0/1 target, the source path and the abstract.
data_path = os.path.join(BASE_DIR, 'trial-data')
# os.listdir returns entries in arbitrary order; sorting keeps the doc_num
# assignment reproducible across runs and machines.
dirs = sorted(os.listdir(data_path))
doc_sentence_list = {}  # doc_num -> contributing sentence numbers of that document
sentences_text_list = []
sentences_target_list = []
doc_num_list = []
doc_path_list = []
abstract_list = []
doc_num = 0
for item in dirs:
    item_path = os.path.join(data_path, item)
    # Skips README.md, .DS_Store and any other stray non-directory entry.
    if not os.path.isdir(item_path):
        continue
    for sub_dir in sorted(os.listdir(item_path)):
        info_path = os.path.join(item_path, sub_dir)
        if not os.path.isdir(info_path):
            continue
        sentences_file = os.path.join(info_path, 'sentences.txt')
        sentence_list = get_sentences_list(sentences_file)
        # Only the first (alphabetically) Stanza output file of a document is used.
        stanza_file = next(
            (name for name in sorted(os.listdir(info_path))
             if name.endswith('Stanza-out.txt')),
            None,
        )
        if stanza_file is None:
            # No text to label: do not consume a doc_num for this directory.
            continue
        stanza_out_text_path = os.path.join(info_path, stanza_file)
        sentences_text, sentences_target = get_document_sentence_target(
            stanza_out_text_path, sentence_list)
        abstract = get_document_abstract(stanza_out_text_path)
        num_sentences = len(sentences_target)
        # Recorded only for documents that actually contribute rows, so the
        # keys line up with the doc_num column.
        doc_sentence_list[doc_num] = sentence_list
        sentences_text_list.extend(sentences_text)
        sentences_target_list.extend(sentences_target)
        doc_num_list.extend([doc_num] * num_sentences)
        doc_path_list.extend([stanza_out_text_path] * num_sentences)
        abstract_list.extend([abstract] * num_sentences)
        doc_num += 1
data_df = pd.DataFrame({'doc_num': doc_num_list, 'sentence': sentences_text_list, 'target': sentences_target_list,
                        'doc_path': doc_path_list, 'abstract': abstract_list})
output_dir = os.path.join(BASE_DIR, 'generated_data')
# to_csv does not create missing parent directories.
os.makedirs(output_dir, exist_ok=True)
data_df.to_csv(os.path.join(output_dir, 'data.csv'), index=False)
data_df.head()

Unnamed: 0,doc_num,sentence,target,doc_path,abstract
0,0,"The reading comprehension task , that asks que...",0,/Users/rohantondulkar/Projects/Typeset/trial-d...,"The reading comprehension task , that asks que..."
1,0,Recent formulations of this task have typicall...,1,/Users/rohantondulkar/Projects/Typeset/trial-d...,"The reading comprehension task , that asks que..."
2,0,"However , Rajpurkar et al . ( 2016 ) recently ...",0,/Users/rohantondulkar/Projects/Typeset/trial-d...,"The reading comprehension task , that asks que..."
3,0,"In this paper , we focus on this answer extrac...",1,/Users/rohantondulkar/Projects/Typeset/trial-d...,"The reading comprehension task , that asks que..."
4,0,We show that scoring explicit span representat...,0,/Users/rohantondulkar/Projects/Typeset/trial-d...,"The reading comprehension task , that asks que..."
