### Libraries

In [1]:
import boto3
import pandas as pd
import os
import numpy as np
import nltk
from nltk import tokenize
import tempfile
import openai
import csv

### Fetch file from S3 Bucket

In [2]:
# Define bucket object
s3 = boto3.client('s3')

# Change the file you want to download accordingly
with open('text_data.xlsx', 'wb') as f:
    s3.download_fileobj('tiiqu-openresearch', 'data-ingression/climate_risk_f1000.xlsx', f)

### Convert Paragraphs to Q&A

In [3]:
# Read in the excel file
text_data = pd.read_excel('text_data.xlsx')
text_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172 entries, 0 to 171
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Title              169 non-null    object
 1   DOI                172 non-null    object
 2   Authors            169 non-null    object
 3   Full_Text_URL      172 non-null    object
 4   Full_Text_Content  169 non-null    object
dtypes: object(5)
memory usage: 6.8+ KB


In [4]:
text_data.sample(1)

Unnamed: 0,Title,DOI,Authors,Full_Text_URL,Full_Text_Content
164,"Prevalence, causes and impacts of human traffi...",10.12688/f1000research.124460.2,"[{'Khan', 'Zeeshan'}, {'Mohammad Rahim', 'Kama...",https://f1000research.com/extapi/article/xml?d...,\n\nIntroduction\nHuman trafficking is one of ...


In [5]:
text_data[text_data.isnull().any(axis=1)]

Unnamed: 0,Title,DOI,Authors,Full_Text_URL,Full_Text_Content
56,,10.12688/f1000research.70650.3,,https://f1000research.com/extapi/article/xml?d...,
66,,10.12688/f1000research.111073.1,,https://f1000research.com/extapi/article/xml?d...,
154,,10.12688/f1000research.124234.1,,https://f1000research.com/extapi/article/xml?d...,


In [6]:
# Drop the nan value records since the url is invalid 
text_data = text_data.dropna().reset_index(drop=True)

In [7]:
# Create copy of text_data to mainpulate the data further
temp_df = text_data.copy()

In [8]:
# Function to tokenize the text and return the paragraphs
def get_paragraphs(text):
  
  # Tokenize the text
  result = tokenize.sent_tokenize(text)

  # Variables to store the tokenized strings and 
  str_paragraph = ''
  paragraphs = []

  for i in range(len(result)):

    sentence = result[i]
    len_para = len(tokenize.word_tokenize(str_paragraph))

    if len_para < 200:
      str_paragraph = str_paragraph + ' ' + sentence

    elif len_para >= 200 :
      paragraphs.append(str_paragraph)

      str_paragraph = ''
      str_paragraph = str_paragraph + ' ' + sentence

    elif i == len(result) - 1:
      paragraphs.append(str_paragraph)
    
  # Return the paragraphs
  return paragraphs

In [28]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.0




In [29]:
from dotenv import load_dotenv

# Load the OpenAI API Key
# Create a .env file storing your key
load_dotenv('openai_key.env')

True

In [30]:
# Define the api key
openai.api_key = os.environ.get('OPENAI_API_KEY')

# Function to get the response from OpenAI's api
def get_response(text):
    try:
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt="Generate 2 descriptive questions and answers per paragraph wise: " + text,
            temperature=0.9,
            max_tokens=200,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        return response.choices[0].text
    except Exception as e:
        print("\nError:", str(e))
        return None

In [10]:
# Function to get the Q&A pairs from the qna generated
def parse_qa_pairs(lines):
    pairs = []
    current_q = ''
    current_a = ''
    for line in lines:
        line = line.strip()
        if line.startswith('Q1') or line.startswith('Q2') or line.startswith('Question 1') or line.startswith('Question 2'):
 
            if current_q:
                pairs.append((current_q, current_a))
            current_q = line
            current_a = ''
        elif line.startswith('A1') or line.startswith('A2') or line.startswith('Answer'):
           
            current_a += line[2:] + '\n'
  
    if current_q:
        pairs.append((current_q, current_a))
    return pairs

In [22]:
%%time
# Create a temporary DataFrame to store the cleaned data
columns = ['Title', 'DOI', 'Authors', 'Full_Text_URL', 'Question', 'Answer']
cleaned_data = pd.DataFrame(columns=columns)

for id, row in temp_df.iterrows():
    paragraphs = get_paragraphs(row['Full_Text_Content'])
    for para in paragraphs:
        qna = []
        response = get_response(para)
        if response != None:
            qna.append(response)
        else:
            print('\nstopping Q&A generation. . .\n')
            break

        # Create a temporary file
        with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_file:
            for qa in qna:
                print(qa)
                temp_file.write(qa)
            temp_file.flush()
            temp_file_path = temp_file.name

        with open(temp_file_path, 'r') as temp_file:
            lines = temp_file.readlines()
            pairs = parse_qa_pairs(lines)
        
        new_rows = []
        for Q, A in pairs:
            if Q.startswith('Q1') or Q.startswith('Q2') or Q.startswith('Question 1') or Q.startswith('Question 2'):
                new_rows.append({'Title': row['Title'], 'DOI': row['DOI'], 'Authors': row['Authors'], 'Full_Text_URL': row['Full_Text_URL'], 'Question': Q, 'Answer': A.strip()})
            elif Q.startswith('A1') or Q.startswith('A2') or Q.startswith('Answer'):
                new_rows.append({'Title': row['Title'], 'DOI': row['DOI'], 'Authors': row['Authors'], 'Full_Text_URL': row['Full_Text_URL'], 'Question': A.strip(), 'Answer': Q})
        
        cleaned_data = pd.concat([cleaned_data, pd.DataFrame(new_rows)], ignore_index=True)

# Clean the data in the DataFrame
cleaned_data['Question'] = cleaned_data['Question'].str.replace(r'^[Question\d\.\:\- ]*', '', regex=True)
cleaned_data['Answer'] = cleaned_data['Answer'].str.replace(r'^[Answer\d\.\:\- ]*', '', regex=True)

# Write the cleaned data to the output CSV file
cleaned_data.to_csv('output_clean.csv', index=False)


 What is payment by results (PbR) and how does it help the NHS?
Answer: Payment by results (PbR) is a system introduced in England in the NHS Plan of July 2000. It links the allocation of hospital funds to the activity they undertake, providing incentive payments to hospitals. This system helps the NHS manage costs and incentivises hospitals to increase activity, which should ultimately lead to improved services for patient care. 

4. How is the system of Diagnostic Related Groups (DRGs) used?
Answer: Diagnostic Related Groups (DRGs) are a system used to classify hospital cases into identifiable groups in order to identify ‘products’ that a hospital provides. The system is based on the International Classification of Diseases (ICD) and the presence of complications or co-morbidity. It has been adopted as the unit of payments to hospitals in many health systems, replacing ‘cost-based’ reimbursement.


Question 1: What was the main aim of PbR?
Answer 1: The main aim of PbR was to improve

In [23]:
df = pd.read_csv('output_clean.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5732 entries, 0 to 5731
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Title          5732 non-null   object
 1   DOI            5732 non-null   object
 2   Authors        5732 non-null   object
 3   Full_Text_URL  5732 non-null   object
 4   Question       5732 non-null   object
 5   Answer         5712 non-null   object
dtypes: object(6)
memory usage: 268.8+ KB


In [24]:
df.sample(10)

Unnamed: 0,Title,DOI,Authors,Full_Text_URL,Question,Answer
1273,Analysis of environmental migrants and their m...,10.12688/f1000research.27272.1,"[{'Herrán', 'Keren'}, {'Biehler', 'Dawn'}]",https://f1000research.com/extapi/article/xml?d...,What challenges do children face when dealing ...,Children face challenges due to their cognitiv...
2739,Green taxes as ecosystem conservation: an anal...,10.12688/f1000research.127058.1,"[{'Walter Gregorio', 'Ibarra Fretell'}, {'GRIJ...",https://f1000research.com/extapi/article/xml?d...,What metric was used to measure the reliabilit...,The reliability of the instrument was measured...
1390,"Option-based guarantees to accelerate urgent, ...",10.12688/f1000research.26482.2,"[{'David', 'Manheim'}, {'Derek', 'Foster'}]",https://f1000research.com/extapi/article/xml?d...,What benefits do recipients of put options rec...,Recipients of these options benefit from the p...
4613,Perspectives on automated composition of workf...,10.12688/f1000research.54159.1,"[{'Anna-Lena', 'Lamprecht'}, {'Palmblad', 'Mag...",https://f1000research.com/extapi/article/xml?d...,How can bio.tools find a balance between being...,community registry like bio.tools needs to ref...
791,Factors influencing the higher incidence of tu...,10.12688/f1000research.14476.2,"[{'Hayward', 'Sally'}, {'Harding', 'Rosalind M...",https://f1000research.com/extapi/article/xml?d...,How has the transnational turn impacted the ri...,The transnational turn has provided a new anal...
2665,Viewing the global health system as a complex ...,10.12688/f1000research.126201.1,"[{'Josephine', 'Borghi'}, {'Ismail', 'Sharif'}...",https://f1000research.com/extapi/article/xml?d...,What is the first step to operationalizing a C...,The first step to operationalizing a CAS appro...
77,Differences in antibiotic use and knowledge be...,10.12688/f1000research.2-108.v2,"[{'Quizhpe P', 'Arturo'}, {'Martyna', 'Gassows...",https://f1000research.com/extapi/article/xml?d...,What is the aim of this study?,The aim of this study was to investigate wheth...
1728,Pediatric malaria incidence and risk mapping i...,10.12688/f1000research.75923.1,"[{'Ferrao', 'Joao'}, {'Mendes', 'Roberto'}, {'...",https://f1000research.com/extapi/article/xml?d...,What was the peak age of cases from 1 to 4 yea...,The peak age of cases reported was from 1 to 4...
3673,\nPrima facie reasons to question enclosed int...,10.12688/f1000research.10497.1,"[{'Halpert', 'Madeleine-Thérèse'}, {'M. Jahi',...",https://f1000research.com/extapi/article/xml?d...,"How has IPPs, such as plant and utility patent...",The applications for plant and utility patents...
1524,Using agricultural metadata: a novel investiga...,10.12688/f1000research.26903.2,"[{'Walters', 'Judi'}, {'Light', 'Kate'}, {'Rob...",https://f1000research.com/extapi/article/xml?d...,What did Farre et al. (2019) use to establish ...,Farre et al. (2019) used APSIM-canola simulati...


In [25]:
df.describe()

Unnamed: 0,Title,DOI,Authors,Full_Text_URL,Question,Answer
count,5732,5732,5732,5732,5732,5712
unique,169,169,169,169,5717,5701
top,The demographic effects and public health infr...,10.12688/f1000research.121780.1,"[{'Andrew Kweku', 'Conduah'}]",https://f1000research.com/extapi/article/xml?d...,What is the purpose of this study?,%.
freq,50,50,50,50,4,9


In [27]:
df[df['Answer']=='%.']

Unnamed: 0,Title,DOI,Authors,Full_Text_URL,Question,Answer
519,Epidemiology of adulthood drowning deaths in B...,10.12688/f1000research.10980.1,"[{'Hossain', 'Mohammad Jahangir'}, {'Animesh',...",https://f1000research.com/extapi/article/xml?d...,What percentage of the drowning occurred in na...,%.
520,Epidemiology of adulthood drowning deaths in B...,10.12688/f1000research.10980.1,"[{'Hossain', 'Mohammad Jahangir'}, {'Animesh',...",https://f1000research.com/extapi/article/xml?d...,What percentage of fatalities occurred in a pl...,%.
1727,Pediatric malaria incidence and risk mapping i...,10.12688/f1000research.75923.1,"[{'Ferrao', 'Joao'}, {'Mendes', 'Roberto'}, {'...",https://f1000research.com/extapi/article/xml?d...,What percentage of malaria cases were reported...,%.
3927,Enhancing energy literacy in children using zn...,10.12688/f1000research.13228.1,"[{'Polikovsky', 'Mark'}, {'Avigdor', 'Sharon'}...",https://f1000research.com/extapi/article/xml?d...,What percentage of participants mentioned sola...,%.
3928,Enhancing energy literacy in children using zn...,10.12688/f1000research.13228.1,"[{'Polikovsky', 'Mark'}, {'Avigdor', 'Sharon'}...",https://f1000research.com/extapi/article/xml?d...,What percentage of participants mentioned pota...,%.
5424,Comparison of sleep and health behaviours amon...,10.12688/f1000research.19678.3,"[{'Chudchawal', 'Juntarawijit'}, {'Yuwayong', ...",https://f1000research.com/extapi/article/xml?d...,What is the proportion of T2DM whose sleep hou...,%.
5425,Comparison of sleep and health behaviours amon...,10.12688/f1000research.19678.3,"[{'Chudchawal', 'Juntarawijit'}, {'Yuwayong', ...",https://f1000research.com/extapi/article/xml?d...,What is the proportion of non-T2DM whose sleep...,%.
5714,The disconnect between researcher ambitions an...,10.12688/f1000research.28324.3,"[{'Kelly', 'Andrew'}, {'Gardner', 'Victoria'},...",https://f1000research.com/extapi/article/xml?d...,What is the percentage of respondents who sele...,%.
5715,The disconnect between researcher ambitions an...,10.12688/f1000research.28324.3,"[{'Kelly', 'Andrew'}, {'Gardner', 'Victoria'},...",https://f1000research.com/extapi/article/xml?d...,What percentage of respondents who selected ei...,%.
