### Libraries

In [1]:
import boto3
import pandas as pd
import os
import numpy as np
import nltk
from nltk import tokenize
import tempfile
import openai
import csv

### Fetch file from S3 Bucket

In [2]:
# S3 client used to pull the source workbook
s3 = boto3.client('s3')

# Adjust Bucket / Key below to download a different workbook; the object is
# streamed into a local file named text_data.xlsx
with open('text_data.xlsx', mode='wb') as workbook_file:
    s3.download_fileobj(
        Bucket='tiiqu-openresearch',
        Key='data-ingression/climate_risk_f1000.xlsx',
        Fileobj=workbook_file,
    )

### Convert text data to paragraphs

In [3]:
# Load the downloaded workbook into a DataFrame and summarise its columns
text_data = pd.read_excel(io='text_data.xlsx')
text_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172 entries, 0 to 171
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Title              169 non-null    object
 1   DOI                172 non-null    object
 2   Authors            169 non-null    object
 3   Full_Text_URL      172 non-null    object
 4   Full_Text_Content  169 non-null    object
dtypes: object(5)
memory usage: 6.8+ KB


In [4]:
# Peek at one randomly chosen record
text_data.sample(n=1)

Unnamed: 0,Title,DOI,Authors,Full_Text_URL,Full_Text_Content
59,Green Rating for Integrated Habitat Assessment...,10.12688/f1000research.108826.1,"[{'Priyanka', 'Kochhar'}, {'Namrata', 'Mahal'}...",https://f1000research.com/extapi/article/xml?d...,\n\n1. Introduction\nBuildings contribute to 3...


In [5]:
# Show every record that has at least one missing field
text_data.loc[text_data.isna().any(axis='columns')]

Unnamed: 0,Title,DOI,Authors,Full_Text_URL,Full_Text_Content
56,,10.12688/f1000research.70650.3,,https://f1000research.com/extapi/article/xml?d...,
66,,10.12688/f1000research.111073.1,,https://f1000research.com/extapi/article/xml?d...,
154,,10.12688/f1000research.124234.1,,https://f1000research.com/extapi/article/xml?d...,


In [6]:
# Records with missing values come from invalid URLs: discard them and
# renumber the remaining rows from zero
text_data = text_data.dropna(axis=0, how='any').reset_index(drop=True)

In [7]:
# Work on a separate copy so further manipulation leaves text_data untouched
temp_df = text_data.copy(deep=True)

In [8]:
# Function to tokenize the text and return the paragraphs
def get_paragraphs(text, max_words=200):
  """Group the sentences of ``text`` into paragraphs of roughly ``max_words`` tokens.

  Sentences are appended to the current paragraph until its word-token count
  (measured with ``tokenize.word_tokenize``) reaches ``max_words``; the
  paragraph is then emitted and the next sentence starts a new one. A
  paragraph can therefore exceed ``max_words`` by up to one sentence.

  Args:
    text: The full text to split.
    max_words: Token count at which the current paragraph is closed.
      Defaults to 200.

  Returns:
    A list of paragraph strings. Each paragraph keeps the leading space
    produced by joining sentences with ' '. Empty input yields an empty list.
  """

  # Tokenize the text into sentences
  sentences = tokenize.sent_tokenize(text)

  # Paragraph currently being built, and the paragraphs completed so far
  str_paragraph = ''
  paragraphs = []

  for sentence in sentences:

    # Close the current paragraph once it has reached the word budget
    if len(tokenize.word_tokenize(str_paragraph)) >= max_words:
      paragraphs.append(str_paragraph)
      str_paragraph = ''

    str_paragraph = str_paragraph + ' ' + sentence

  # Flush the trailing paragraph: without this the last chunk of every text
  # (and the whole of any text shorter than max_words) would be dropped
  if str_paragraph:
    paragraphs.append(str_paragraph)

  # Return the paragraphs
  return paragraphs

In [9]:
import openai

# Read the API key from the environment rather than embedding it in the notebook.
# NOTE(review): the key that used to be hardcoded in this cell has been exposed
# and should be revoked.
_openai_api_key = os.environ.get('OPENAI_API_KEY')
if not _openai_api_key:
    raise RuntimeError('Set the OPENAI_API_KEY environment variable before running this cell.')
openai.api_key = _openai_api_key

# Function to get the response from OpenAI's api
def get_response(text):
    """Ask the completion model for two question/answer pairs about ``text``.

    Args:
        text: The paragraph to generate questions and answers for; it is
            appended verbatim to the instruction prompt.

    Returns:
        The raw text of the first completion choice.
    """
    # NOTE(review): `openai.Completion` and "text-davinci-003" belong to the
    # legacy (pre-1.0) client and a retired model -- confirm the installed
    # openai version still supports this call.
    response = openai.Completion.create(
      model="text-davinci-003",
      prompt="Generate 2 descriptive questions and answers per paragraph wise: "+text,
      temperature=0.9,
      max_tokens=200,   # Raised from 150 to 200
      top_p=1,
      frequency_penalty=0,
      presence_penalty=0
    )

    return response.choices[0].text

In [10]:
# Function to get the Q&A pairs from the qna generated
def parse_qa_pairs(lines):
    """Pair up labelled question and answer lines from a model response.

    A line starting with 'Q1', 'Q2', 'Question 1' or 'Question 2' opens a new
    pair and is stored whole (label included). Lines starting with 'A1', 'A2'
    or 'Answer' are appended to the current answer with only that label
    removed, so any separator after the label (e.g. ': ') is kept. All other
    lines are ignored.

    Args:
        lines: Iterable of text lines; each is stripped before inspection.

    Returns:
        A list of ``(question, answer)`` tuples in order of appearance. Each
        answer line contributes its text followed by a newline; a question
        with no answer line gets an empty answer string.
    """
    question_labels = ('Q1', 'Q2', 'Question 1', 'Question 2')
    answer_labels = ('A1', 'A2', 'Answer')

    pairs = []
    current_q = ''
    current_a = ''
    for line in lines:
        line = line.strip()

        if line.startswith(question_labels):
            # A new question closes the pair that was being collected
            if current_q:
                pairs.append((current_q, current_a))
            current_q = line
            current_a = ''
            continue

        # Remove exactly the label that matched; a fixed two-character slice
        # would turn 'Answer: ...' into 'swer: ...'
        label = next((lbl for lbl in answer_labels if line.startswith(lbl)), None)
        if label is not None:
            current_a += line[len(label):] + '\n'

    # Flush the last pair
    if current_q:
        pairs.append((current_q, current_a))
    return pairs

In [11]:
# Collect one record per generated question/answer pair and build the
# DataFrame once at the end (concatenating inside the loop is quadratic)
columns = ['Title', 'DOI', 'Authors', 'Full_Text_URL', 'Question', 'Answer']
question_labels = ('Q1', 'Q2', 'Question 1', 'Question 2')
records = []

# Only the first article is processed here (temp_df[:1]); widen the slice to
# run the generation over more of the dataset
for _, row in temp_df[:1].iterrows():
    for para in get_paragraphs(row['Full_Text_Content']):
        response = get_response(para)
        print(response)

        # Parse the completion text directly. Writing it to a temporary file
        # and reading it back left one undeleted file per paragraph and could
        # fail on characters the locale encoding cannot represent.
        pairs = parse_qa_pairs(response.splitlines())

        for question, answer in pairs:
            # Keep only pairs whose question carries a recognised label
            if question.startswith(question_labels):
                records.append({
                    'Title': row['Title'],
                    'DOI': row['DOI'],
                    'Authors': row['Authors'],
                    'Full_Text_URL': row['Full_Text_URL'],
                    'Question': question,
                    'Answer': answer.strip(),
                })

cleaned_data = pd.DataFrame(records, columns=columns)

# Strip the leading label from each question and answer. The patterns match
# the label as a prefix; a character class such as [Answer...] would also eat
# real leading text (e.g. 'An estimated' -> 'timated').
cleaned_data['Question'] = cleaned_data['Question'].str.replace(
    r'^(?:Question|Q)\s*\d+\s*[.:\-)]*\s*', '', regex=True)
# Answers may arrive with the label already removed, or with only its first
# two characters sliced off ('swer: ...'), optionally followed by a number and
# a separator; all of those forms are accepted.
cleaned_data['Answer'] = cleaned_data['Answer'].str.replace(
    r'^(?:(?:An)?swer\b)?\s*(?:\d+\s*[.:\-)](?=\s))?[.:\-)\s]*', '', regex=True)

# Write the cleaned data to the output CSV file
cleaned_data.to_csv('output_clean.csv', index=False)


 What is the Diagnostic Related Group (DRG) system?
Answer: The Diagnostic Related Group (DRG) system is a system that classifies hospital cases into identifiable groups in order to identify ‘products’ that a hospital provides. This model of classification is based on the International Classification of Diseases (ICD) and the presence of complications or co-morbidity. 

4. What countries has the DRG system spread to?
Answer: The DRG system has spread to countries such as Australia, Denmark, France, Germany, Austria, the Netherlands and Russia; followed by a less rapid spread from acute care into long-term care and psychiatry.
 

Q1: What was the main aim of PbR?
A1: The main aim of PbR was to improve efficiency and increase value for money in the NHS by enhancing service quality, facilitating patient choice, enabling service innovation, improving quality of service and reducing waiting times by rewarding healthcare providers for the volumes of work completed.

Q2: When did the first NH

In [14]:
# Reload the generated CSV and inspect one random row as a sanity check
df = pd.read_csv(filepath_or_buffer='output_clean.csv')
df.sample(n=1)

Unnamed: 0,Title,DOI,Authors,Full_Text_URL,Question,Answer
5,Has incentive payment improved venous thrombo-...,10.12688/f1000research.2-41.v1,"[{'Child', 'Sue'}, {'Sheaff', 'Rod'}, {'Boiko'...",https://f1000research.com/extapi/article/xml?d...,Why was CQUIN focused on VTE?,CQUIN was focused on VTE because there are lar...
