## Malawi Public Health Systems LLM Challenge
Building an AI assistant capable of providing knowledge contained in the Malawi Technical Guidelines for Integrated Disease Surveillance and Response (TGs for IDSR)

### Load the necessary packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
tg_booklet1_df = pd.read_excel('../data/MWTGBookletsExcel/TG_Booklet_1.xlsx')
tg_booklet2_df = pd.read_excel('../data/MWTGBookletsExcel/TG_Booklet_2.xlsx')
tg_booklet3_df = pd.read_excel('../data/MWTGBookletsExcel/TG_Booklet_3.xlsx')
tg_booklet4_df = pd.read_excel('../data/MWTGBookletsExcel/TG_Booklet_4.xlsx')
tg_booklet5_df = pd.read_excel('../data/MWTGBookletsExcel/TG_Booklet_5.xlsx')
tg_booklet6_df = pd.read_excel('../data/MWTGBookletsExcel/TG_Booklet_6.xlsx')

In [3]:
tg_booklet1_df.head()

Unnamed: 0,1,Technical Guidelines for﷐INTEGRATED DISEASE SURVEILLANCE AND RESPONSE FOR MALAWI
0,2,THIRD EDITION
1,3,BOOKLET ONE: INTRODUCTION SECTION
2,4,￼
3,5,DECEMBER 2020
4,6,￼ ...


In [4]:
train_df = pd.read_csv('./Train.csv')

In [5]:
train_df.shape

(748, 6)

In [6]:
# List column names
train_df.columns

Index(['ID', 'Question Text', 'Question Answer', 'Reference Document',
       'Paragraph(s) Number', 'Keywords'],
      dtype='object')

In [7]:
train_df.head()

Unnamed: 0,ID,Question Text,Question Answer,Reference Document,Paragraph(s) Number,Keywords
0,Q829,Compare the laboratory confirmation methods fo...,Chikungunya is confirmed using serological tes...,TG Booklet 6,"154, 166",Laboratory Confirmation For Chikungunya Vs. Di...
1,Q721,When should specimens be collected for Anthrax...,Specimens should be collected during the vesic...,TG Booklet 6,140,"Anthrax Specimen Collection: Timing, Preparati..."
2,Q464,Which key information should be recorded durin...,"During a register review, key information abou...",TG Booklet 3,439-440,"Register Review, Key Information, Suspected Ca..."
3,Q449,Why is the District log of suspected outbreaks...,The log includes information about response ac...,TG Booklet 3,412,"District Log, Response Activities, Steps Taken..."
4,Q6,What do Community based surveillance strategie...,Community-based surveillance strategies focus ...,TG Booklet 1,86,"Community-based Surveillance Strategies, Ident..."


In [8]:
# Rename the Question Text column to Instruction and Question Answer column to Response
train_df.rename(columns={'Question Text':'Instruction','Question Answer':'Response'}, inplace=True)

In [9]:
# Save train_df to a new csv file
train_df.to_csv('Train_.csv', index=False)

In [12]:
from datasets import load_dataset
dataset = load_dataset('csv', data_files='Train_.csv')

In [13]:
dataset

In [None]:
# Iterate through the dataset and print the first 5 rows
for i in range(5):
    print(dataset['train'][i])

In [36]:
alpaca_prompt = """Below is a question posed by healthcare professionals
including nurses, doctors, and researchers in Malawi,
all of whom actively engage in disease surveillance efforts.
Offer a response that is both accurate and concise, incorporating
relevant keywords to address the inquiry effectively.

### Question:
{}

### Answer:
{}

### Keywords:
{}
"""


def formatting_prompts_func(examples):
    questions = examples["Instruction"]
    answers = examples["Response"]
    keywords = examples["Keywords"]
    texts = []
    for question, answer, keyword in zip(questions, answers, keywords):
        text = alpaca_prompt.format(question, answer, keyword)
        texts.append(text)
    return { "text" : texts}

In [38]:
import collections.abc
instruction_dataset_ = [formatting_prompts_func(item) for item in instruction_dataset if isinstance(item, collections.abc.Iterable)]

In [39]:
instruction_dataset_ = [formatting_prompts_func(item) for item in instruction_dataset]

In [10]:
# Load the test dataset
test_df = pd.read_csv('../Test.csv')

In [14]:
test_df.head()

Unnamed: 0,ID,Question Text
0,Q4,"What is the definition of ""unusual event"""
1,Q5,What is Community Based Surveillance (CBS)?
2,Q9,What kind of training should members of VHC re...
3,Q10,What is indicator based surveillance (IBS)?
4,Q13,What is Case based surveillance?


In [15]:
test_df.shape

(499, 2)

In [12]:
# Create a Submission dataframe that contains the following columns: ID, and Target
submission_df = pd.DataFrame(columns=['ID', 'Target'])

In [None]:
inputs = tokenizer(
[
    alpaca_prompt.format(
        "What is the definition of 'unusual event'", # Instruction
        "", # Response
        "", # Reference Document
        "", # Paragraph(s) Number
        "", # Keywords
    )
]*1, return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
tokenizer.batch_decode(outputs)

In [None]:
# Iterate through the test_df and take the Question Text and make inference to the model defined in the above cell and set the instruction to the Question Text and from the model's response pick the Keywords and set the entry in the ID column as follows Q4_keywords with the value being the keywords, paragraph(s) number as Q4_paragraph(s)_number with the value from model output, Response as Q4_question_answer and the Reference Document as Q4_reference_document
for index, row in test_df.iterrows():
    inputs = tokenizer(
        [
            alpaca_prompt.format(
                row['Question Text'], # Instruction
                "", # Response
                "", # Reference Document
                "", # Paragraph(s) Number
                "", # Keywords
            )
        ]*1, return_tensors = "pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
    response = tokenizer.batch_decode(outputs)
    # Regex pattern
    pattern = r'### Response:\n(?P<Response>.*?)\n\n### Reference Document:\n(?P<Reference_Document>.*?)\n\n### Paragraph\(s\) Number:\n(?P<Paragraphs_Number>.*?)\n\n### Keywords:\n(?P<Keywords>.*?)\n\n'

    # Find all matches
    matches = re.findall(pattern, response, re.DOTALL)

    # Extracted fields
    for match in matches[1:]:  # Start from the second response
        response = match[0].strip()
        reference_document = match[1].strip()
        paragraphs_number = match[2].strip()
        keywords = match[3].strip()

        print("Response:", response)
        print("Reference Document:", reference_document)
        print("Paragraph(s) Number:", paragraphs_number)
        print("Keywords:", keywords)
        print()
    # Append the response, reference_document, paragraphs_number, keywords to the submission_df in the row entries i.e Q4_question_answer, Q4_keywords, Q4_paragraph(s)_number, Q4_reference_document
    submission_df.loc[index, f"{row['ID']}_question_answer"] = response
    submission_df.loc[index, f"{row['ID']}_keywords"] = keywords
    submission_df.loc[index, f"{row['ID']}_paragraph(s)_number"] = paragraphs_number
    submission_df.loc[index, f"{row['ID']}_reference_document"] = reference_document

    

In [16]:
sample_submission_df = pd.read_csv('../SampleSubmission.csv')

In [17]:
sample_submission_df.shape

(1996, 2)

In [44]:
submission1_df = pd.read_csv('../EDA/submission1.csv')

In [45]:
submission1_df.head()

Unnamed: 0,ID,Target,Q4_question_answer,Q4_keywords,Q4_paragraph(s)_number,Q4_reference_document,Q5_question_answer,Q5_keywords,Q5_paragraph(s)_number,Q5_reference_document,Q9_question_answer,Q9_keywords,Q9_paragraph(s)_number,Q9_reference_document,Q10_question_answer,Q10_keywords,Q10_paragraph(s)_number,Q10_reference_document
0,,,An unusual event is an occurrence that is unex...,"Unusual Event, Unexpected, Uncommon, Noteworth...",106.0,TG Booklet 1,,,,,,,,,,,,
1,,,,,,,Community Based Surveillance (CBS) is a strate...,"Community Based Surveillance (CBS), Early Dete...",761.0,TG Booklet 2,,,,,,,,
2,,,,,,,,,,,Members of VHC should receive training on the ...,"VHC Training, Role, Reporting, Reporting Form",369.0,TG Booklet 3,,,,
3,,,,,,,,,,,,,,,Indicator-based surveillance (IBS) is a type o...,"Indicator-based Surveillance, Specific Indicat...",761.0,TG Booklet 2
4,Q4_question_answer,An unusual event is an occurrence that is unex...,,,,,,,,,,,,,,,,


In [46]:
# remove the first 4 rows from the submission1_df
submission1_df = submission1_df.iloc[4:]

In [47]:
submission1_df.head()

Unnamed: 0,ID,Target,Q4_question_answer,Q4_keywords,Q4_paragraph(s)_number,Q4_reference_document,Q5_question_answer,Q5_keywords,Q5_paragraph(s)_number,Q5_reference_document,Q9_question_answer,Q9_keywords,Q9_paragraph(s)_number,Q9_reference_document,Q10_question_answer,Q10_keywords,Q10_paragraph(s)_number,Q10_reference_document
4,Q4_question_answer,An unusual event is an occurrence that is unex...,,,,,,,,,,,,,,,,
5,Q4_keywords,"Unusual Event, Unexpected, Uncommon, Noteworth...",,,,,,,,,,,,,,,,
6,Q4_paragraph(s)_number,106,,,,,,,,,,,,,,,,
7,Q4_reference_document,TG Booklet 1,,,,,,,,,,,,,,,,
8,Q5_question_answer,Community Based Surveillance (CBS) is a strate...,,,,,,,,,,,,,,,,


In [48]:
# Remove any other columns apart from ID and Target
submission1_df = submission1_df[['ID', 'Target']]

In [49]:
submission1_df.head()

Unnamed: 0,ID,Target
4,Q4_question_answer,An unusual event is an occurrence that is unex...
5,Q4_keywords,"Unusual Event, Unexpected, Uncommon, Noteworth..."
6,Q4_paragraph(s)_number,106
7,Q4_reference_document,TG Booklet 1
8,Q5_question_answer,Community Based Surveillance (CBS) is a strate...


In [50]:
# Compare the submission1_df and sample_submission_df and add rows that are in sample_submission_df but not in submission1_df to submission1_df
submission1_df_ = pd.concat([submission1_df, sample_submission_df[~sample_submission_df['ID'].isin(submission1_df['ID'])]])

In [51]:
submission1_df_.shape

(1996, 2)

In [52]:
submission1_df_.head()

Unnamed: 0,ID,Target
4,Q4_question_answer,An unusual event is an occurrence that is unex...
5,Q4_keywords,"Unusual Event, Unexpected, Uncommon, Noteworth..."
6,Q4_paragraph(s)_number,106
7,Q4_reference_document,TG Booklet 1
8,Q5_question_answer,Community Based Surveillance (CBS) is a strate...


In [53]:
submission1_df_.tail()

Unnamed: 0,ID,Target
1987,Q998_reference_document,
1988,Q999_keywords,
1989,Q999_paragraph(s)_number,
1990,Q999_question_answer,
1991,Q999_reference_document,


In [42]:
# Set the Target column to an empty string '' where the IDs are Q66_paragraph(s)_number, and Q999_paragraph(s)_number
submission1_df_.loc[submission1_df_['ID'].str.contains('Q66_paragraph(s)_number'), 'Target'] = '169-176'
submission1_df_.loc[submission1_df_['ID'].str.contains('Q999_paragraph(s)_number'), 'Target'] = '169-176'

  submission1_df_.loc[submission1_df_['ID'].str.contains('Q66_paragraph(s)_number'), 'Target'] = '169-176'
  submission1_df_.loc[submission1_df_['ID'].str.contains('Q999_paragraph(s)_number'), 'Target'] = '169-176'


In [40]:
# Set the Target column for the ID Q66_paragraph(s)_number to the value similar to Q999_paragraph(s)_number
submission1_df_.loc[submission1_df_[submission1_df_['ID'] == 'Q999_paragraph(s)_number'].index, 'Target'] = submission1_df_[submission1_df_['ID'] == 'Q66_paragraph(s)_number']['Target'].values[0]

  submission1_df_[submission1_df_['ID'].str.contains('Q66_paragraph(s)_number')]
  submission1_df_[submission1_df_['ID'].str.contains('Q999_paragraph(s)_number')]


Unnamed: 0,ID,Target


In [55]:
# Set the Target column for the Q66_paragraph(s)_number in submission1_df_ to the value similar to Q66_paragraph(s)_number in sample_submission_df
submission1_df_.loc[submission1_df_[submission1_df_['ID'] == 'Q66_paragraph(s)_number'].index, 'Target'] = sample_submission_df[sample_submission_df['ID'] == 'Q66_paragraph(s)_number']['Target'].values[0]

In [57]:
# Look for Target values with [] and replace them with the value in the Target column with ID Q66_paragraph(s)_number
submission1_df_.loc[submission1_df_['Target'] == '[]', 'Target'] = submission1_df_[submission1_df_['ID'] == 'Q66_paragraph(s)_number']['Target'].values[0]

In [58]:
# Check for any Target values with []
submission1_df_[submission1_df_['Target'] == '[]']

Unnamed: 0,ID,Target


In [59]:
submission1_df_.to_csv('submission.csv', index=False)

In [39]:
# Check for duplicates in the submission1_df_
submission1_df_[submission1_df_.duplicated(subset='ID')]

Unnamed: 0,ID,Target


In [60]:
submission2_df = pd.read_csv('../EDA/submission2.csv')

In [61]:
submission2_df.head()

Unnamed: 0,ID,Target
0,Q4_question_answer,An unusual event is an occurrence that is unex...
1,Q4_keywords,"Unusual Event, Unexpected, Uncommon, Noteworth..."
2,Q4_paragraph(s)_number,106
3,Q4_reference_document,TG Booklet 1
4,Q5_question_answer,Community Based Surveillance (CBS) is a strate...


In [62]:
submission2_df.shape

(940, 2)

In [67]:
submission2_df['Target'].unique()

array(['An unusual event is an occurrence that is unexpected, uncommon, or noteworthy, and may indicate a potential public health concern.',
       'Unusual Event, Unexpected, Uncommon, Noteworthy, Potential Public Health Concern',
       '106', 'TG Booklet 1',
       'Community Based Surveillance (CBS) is a strategy that involves the community in the early detection and response to public health events.',
       'Community Based Surveillance (CBS), Early Detection, Public Health Events, Community Involvement',
       '761', 'TG Booklet 2',
       'Members of VHC should receive training on the role of VHC, the importance of reporting, and the use of the reporting form.',
       'VHC Training, Role, Reporting, Reporting Form', '369',
       'TG Booklet 3',
       'Indicator-based surveillance (IBS) is a type of surveillance that focuses on specific indicators or events of public health importance. It involves monitoring the occurrence of these indicators or events over time to assess th

In [66]:
# Replace all 2022_Value with "I don't know" with 0.0
submission2_df['Target'] = submission2_df['Target'].replace('[]', '')

In [68]:
# Compare sample_submission_df and submission2_df and add rows that are in sample_submission_df but not in submission2_df to submission2_df
for index, row in sample_submission_df.iterrows():
    if row['ID'] not in submission2_df['ID'].values:
        submission2_df = submission2_df.append(row, ignore_index=True)

  submission2_df = submission2_df.append(row, ignore_index=True)


In [69]:
submission2_df.shape

(1996, 2)

In [70]:
submission2_df.to_csv('submission2_refined.csv', index=False)

In [73]:
# Return entries whose Target values are ''
submission2_df[submission2_df['Target'] == '']

Unnamed: 0,ID,Target


In [72]:
# Replace the Target values '' with 'keywords'
submission2_df.loc[submission2_df['Target'] == '', 'Target'] = 'keywords'

In [74]:
submission2_df.to_csv('submission3_refined.csv', index=False)

In [75]:
# Check for row with ID Q66_paragraph(s)_number
submission2_df[submission2_df['ID'] == 'Q66_paragraph(s)_number']

Unnamed: 0,ID,Target
114,Q66_paragraph(s)_number,


In [76]:
# Set its value to 169-176
submission2_df.loc[submission2_df['ID'] == 'Q66_paragraph(s)_number', 'Target'] = '169-176'

In [79]:
# Check for entries with NaN values
submission2_df[submission2_df.isna().any(axis=1)]

Unnamed: 0,ID,Target


In [78]:
# Replace Nan values with 'values'
submission2_df.fillna('values', inplace=True)

In [None]:
submission2_df.to_csv('submission4_refined.csv', index=False)

In [7]:
sample_sub_df = pd.read_csv('../Test.csv')

In [8]:
sample_sub_df.head()

Unnamed: 0,ID,Question Text
0,Q4,"What is the definition of ""unusual event"""
1,Q5,What is Community Based Surveillance (CBS)?
2,Q9,What kind of training should members of VHC re...
3,Q10,What is indicator based surveillance (IBS)?
4,Q13,What is Case based surveillance?


In [22]:
# using the ID values in the ID column and create a list that contains value of the ID concatenated with keywords i.e. Q4_keywords
ids = sample_sub_df['ID'].values
ids = [f"{id}_keywords" for id in ids]

In [23]:
import pandas as pd
submission5_df = pd.read_csv('../EDA/cpu_submission1.csv')

In [24]:
submission5_df.head()

Unnamed: 0,ID,Target
0,Q4_keywords,"Concern, Definition, Routine, Event, Cluster, ..."
1,Q4_paragraph(s)_number,434
2,Q4_question_answer,An unusual event is defined as any event that ...
3,Q4_reference_document,TG Booklet 1
4,Q5_keywords,"Cases, Providing, Feedback, Public, Community,..."


In [25]:
submission5_df.shape

(1996, 2)

In [11]:
# Remove duplicates from the submission5_df
submission5_df = submission5_df.drop_duplicates(subset=['ID'])

In [13]:
# Check for entries with NaN values
submission5_df[submission5_df.isna().any(axis=1)]

Unnamed: 0,ID,Target


In [7]:
# Count for the number of NaN values in the Target column
submission5_df['Target'].isna().sum()

73

In [14]:
# Drop the columns with NaN values
submission5_df = submission5_df.dropna()

In [10]:
# Remove any white space before the start of the values in the Target column
submission5_df['Target'] = submission5_df['Target'].str.strip()

In [26]:
# Load submission12_refined and replace the NaN values in submission5_df with the values in submission12_refined
submission12_refined = pd.read_csv('../EDA/submission26_refined2.csv')
# submission5_df['Target'] = submission5_df['Target'].fillna(submission12_refined['Target'])

In [29]:
# Create a copy of the dataframe to avoid fragmentation
submission5_df_copy = submission5_df.copy()

# Perform the replacement operation
for id in ids:
    submission5_df_copy.loc[submission5_df_copy['ID'] == id, 'Target'] = submission12_refined.set_index('ID')['Target'].get(id, id)


In [30]:
submission5_df_copy.head()

Unnamed: 0,ID,Target
0,Q4_keywords,"Unusual Event, Definition, Occurrence, Locatio..."
1,Q4_paragraph(s)_number,434
2,Q4_question_answer,An unusual event is defined as any event that ...
3,Q4_reference_document,TG Booklet 1
4,Q5_keywords,"Community-based Surveillance (CBS), Community ..."


In [31]:
submission5_df_copy.to_csv('cpu_submission_refined__.csv', index=False)

In [15]:
# Check for Target values with [] and replace them with the correspondent Target value in submission12_refined
submission5_df.loc[submission5_df['Target'] == '[]', 'Target'] = submission12_refined[submission5_df['Target'] == '[]']['Target']

  submission5_df.loc[submission5_df['Target'] == '[]', 'Target'] = submission12_refined[submission5_df['Target'] == '[]']['Target']


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [15]:
# Return entries with the ID value having Q{Number}_keywords in both submission5_df and submission12_refined and replace the Target value in submission5_df with the Target value in submission12_refined
submission5_df.loc[submission5_df['ID'].str.contains('Q\d+_keywords'), 'Target'] = submission12_refined[submission5_df['ID'].str.contains('Q\d+_keywords')]['Target']

In [10]:
# Add the entries in submission12_refined that are not in submission5_df to submission5_df
for index, row in submission12_refined.iterrows():
    if row['ID'] not in submission5_df['ID'].values:
        submission5_df = submission5_df.append(row, ignore_index=True)

  submission5_df = submission5_df.append(row, ignore_index=True)


In [69]:
# Replace the NaN values with 'values'
submission5_df.fillna('target', inplace=True)

In [24]:
# Check for Target values in submission5_df that differ from submission12_refined and replace the values in submission5_df with the values in submission12_refined
submission5_df[submission5_df['Target'] != submission12_refined['Target']]
submission5_df.loc[submission5_df['Target'] != submission12_refined['Target'], 'Target'] = submission12_refined[submission5_df['Target'] != submission12_refined['Target']]['Target']

In [21]:
submission5_df.to_csv('cpu_submission2_refined.csv', index=False)

In [None]:
# Cre

In [3]:
submission_5_refined_df = pd.read_csv('../EDA/submission26_refined2.csv')

In [18]:
# Compare submission5_df and submission_5_refined_df and replace the different rows in submission5_df with the rows in submission_5_refined_df
submission5_df = submission_5_refined_df if submission5_df.equals(submission_5_refined_df) else submission5_df

In [16]:
submission_21_df = pd.read_csv('../EDA/submission21_refined.csv')
submission_19_df = pd.read_csv('../EDA/submission19_refined.csv')