In [None]:
import os
import openai
import pandas as pd

# Set up your OpenAI API key
api_key = os.environ['OPENAI_API_KEY']
openai.api_key = api_key

# Function to generate predictions using GPT-4 Chat API
def generate_predictions(content):
    completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "user", "content": content+ "Given a news article, please extract relevant events in the form of dictionaries. Each event should include keys for 'Disease', 'Location,' 'Incident' (either 'case' or 'death'), 'Incident_Type' (either 'new' or 'total'), and 'Number.' If the 'Disease' key is not present in an event, do not include the event in the result. Additionally, please make sure that no duplicate events are included in the list. Provide the extracted events as a list of dictionaries. If no events are extracted, the result should be an empty list."}]
    )
    return completion.choices[0].message['content']

# Read your DataFrame
data = pd.read_csv('output.csv')

# Open a text file for writing predictions
with open('event.txt', 'w') as txt_file:
    # Iterate over rows and generate predictions
    for index, row in data.iterrows():
        trans_article = row['article']
        if pd.notna(trans_article):  # Skip NaN values
            prediction = generate_predictions(trans_article)
            data.at[index, 'predicted_label'] = prediction
            # Write the prediction to the text file
            txt_file.write(prediction)
        txt_file.write('\n')    

# Save the modified DataFrame to a new CSV file
data.to_csv('Event_Extraction_Chatgpt.csv', index=False)

In [14]:
import pandas as pd
data =pd.read_csv('Event_Extraction_Chatgpt.csv')
type(data['true_label'][1])

str

In [15]:
data

Unnamed: 0,article,true_label,cleaned_true_labels,predicted_label
0,,[],,
1,Fire broke out during gas cylinder refilling i...,[],,The given text does not contain information ab...
2,Increasing cases of lumpy skin disease in Utta...,"[{'Disease': 'Lumpy skin disease', 'Location':...",,"[\n{\n""Disease"": ""lumpy skin disease"",\n""Locat..."
3,"In Jharkhand's Kasturba School, 28 girls fell ...",[],,"[{""Disease"": ""food poisoning"", ""Location"": ""Ka..."
4,Aurangabad news : In Aurangabad in two days he...,[],,"[\n{\n'Disease': 'Heat stroke',\n'Location': '..."
...,...,...,...,...
699,Mumbai Terror Attacks: 14 years of carnage.. C...,[],,The provided news article does not include any...
700,Corona cases increased again in Telangana - On...,"[{'Disease': 'Corona', 'Location': 'Telangana'...",,"[\n{""Disease"": ""Corona"", ""Location"": ""Telangan..."
701,"IPL 2022, CSK Vs DC: Corona once again in IPL....",[],,"[{""Disease"": ""Covid 19"", ""Location"": ""Delhi"", ..."
702,8 laborers died when the truck overturned. Bih...,[],,[]


In [20]:
sa = data
sa=pd.read_csv('Event_Extraction_Chatgpt.csv')

sa['predicted_label'] = sa['predicted_label'].apply(lambda x: str([]) if pd.isna(x) else x)

# Using fillna to replace remaining NaN with an empty list
sa['predicted_label'].fillna(str([]), inplace=True)

In [3]:
#Evaluation Metrics for Event Extraction.
import pandas as pd
from scipy.optimize import linear_sum_assignment
data = pd.read_csv('Event_Extraction_Chatgpt.csv')
N = len(data)

def optimal_cost(cost_matrix):
    '''Return the cost of optimal matching by using the bipartite matching algorithm (also known as assingment problem). Objective is to maximize the cost'''
    row_ind, col_ind = linear_sum_assignment(cost_matrix,maximize = True)
    return cost_matrix[row_ind, col_ind].sum()

def isinside(a,b):
    if a.find(b)!=-1 or b.find(a)!=-1:
        return 1
    else:
        return 0

def Soft_match(a,b,key_list):
    '''Soft match is the fraction of the number of keys in a predicted event matched with the GT event out of the total keys'''
    score = 0
    for key in key_list:
        if a[key]==None or b[key]==None or len(str(a[key]))==0 or len(str(b[key]))==0:
            if a[key]==b[key]:
                score+=1
        else:
            if isinside(str(a[key]).replace(",","").lower(),str(b[key]).replace(",","").lower())==1:
                score+=1
             
    return score/len(key_list)


def Hard_match(a,b,key_list):
    '''Hard-match returns 1 if all the keys match between predicted and gt events, else return 0'''
    for key in key_list:
        #print("a[key]",a[key])
        #print("b[key]",b[key])
        if a[key]==None or b[key]==None or len(str(a[key]))==0 or len(str(b[key]))==0:
            if a[key]!=b[key]:
                return 0
        else:
            if isinside(str(a[key]).replace(",","").lower(),str(b[key]).replace(",","").lower())==0:
                return 0
    return 1
def optimal_match_score(N,K,type="soft"):
    '''Master function to get score of optimal-matching between N GT evens and K predicted events.'''
    # print(N,K)
    if N==None or K==None or len(N)==0 or len(K)==0:
        if N==K:
            return 1
        else:
            return 0
    key_list = ["Disease","Location","Incident","Incident_Type","Number"]
    #{'Disease': 'Lumpy skin disease', '': 'Uttarakhand', 'Incident (case or death)': 'case', 'Incident Type (new or total)': 'new', 'Number': '19404'}, {'Disease': 'Lumpy skin disease', 'Location': 'Uttarkhand', 'Incident (case or death)': 'death', 'Incident Type (new or total)': 'total', 'Number': '321'}]"
    cost_matrix = []
    for i in N:
        if type=="soft":
            temp = [Soft_match(i,j,key_list) for j in K]
        if type=="hard":
            temp = [Hard_match(i,j,key_list) for j in K]
        # print(temp)
        cost_matrix.append(temp)
    cost_matrix = np.array(cost_matrix)
    cost = optimal_cost(cost_matrix)
    return cost
# Evaluation Metric Precision, Recall and F1
def PRF1(score,true,pred):
    precision = score*(min(true,pred))/pred
    recall = score*(min(true,pred))/true
    F1 = (2*precision*recall)/(precision+recall)
    return precision,recall,F1

def PRF2(score,true,pred):
    '''Return precision, recall and F1. 
    1) precision = (number of true events)*score/(number of predicted events)
    2) Recall = score
    3) F1 = 2*P*R/(P+R)'''
    precision = score*(true)/pred
    recall = score
    F1 = (2*precision*recall)/(precision+recall)
    return precision,recall,F1

In [111]:
print((data.iloc[481,:])['true_label'])
print((data.iloc[481,:])['predicted_label'])


[{'Disease': 'H3N2', 'Location': "J'khand", 'Incident (case or death)': 'case', 'Incident Type (new or total)': 'new', 'Number': '1'}, {'Disease': 'COVID-19', 'Location': "J'khand", 'Incident (case or death)': 'case', 'Incident Type (new or total)': 'new', 'Number': '5'}]
[
  {
    "Disease": "H3N2",
    "Location": "J'khand",
    "Incident": "case",
    "Incident_Type": "new",
    "Number": 1
  },
  {
    "Disease": "COVID-19",
    "Location": "J'khand",
    "Incident": "case",
    "Incident_Type": "new",
    "Number": 5
  }
]


In [25]:
#we = data
we=pd.read_csv('Event_Extraction_Chatgpt.csv')

we['predicted_label'] = we['predicted_label'].apply(lambda x: str([]) if pd.isna(x) else x)

# Using fillna to replace remaining NaN with an empty list
we['predicted_label'].fillna(str([]), inplace=True)


In [41]:
import json
import ast
import numpy as np
total_score = 0
total_num_packets = 0
all_labels = []
total_packets_pred= 0
total_packets_true = 0
total_score_hard = 0
t_list = []
tr_list = []
ct = we['predicted_label'].isna().sum()
df_soft = pd.DataFrame(columns=['curr_score'])
df_hard = pd.DataFrame(columns=['curr_score_hard'])

print("ct",ct)
for i in range(1,N):
    print(i)
    
    
    modi = (we['true_label'][i].replace("Incident (case or death)","Incident").replace("Incident Type (new or total)","Incident_Type"))

    #if i == 113:
        #print("Original String:")
        #print(we['true_label'][i])

        #print("Modified String:")
        #print(modi)
    #True_labels = json.loads(modi)
    if(type(we['predicted_label'][i])== float):
        print(we['predicted_label'][i])
    predicted_label_value = (we['predicted_label'][i]).replace("\n","").replace("\'", "\"")
    
    if predicted_label_value.strip() and "The given text does not contain information about any disease, therefore it does not fit the required format for extraction." not in predicted_label_value:
        
        try:
            Pred_labels = json.loads(predicted_label_value)
            t = str(Pred_labels)
            t_list = ast.literal_eval(t)

            # Iterate through dictionaries in the list
            for j in range(len(t_list)):
                for key, value in t_list[j].items():
                    # Check if the value is a string
                    if isinstance(value, str):
                        # Replace single quote (') with an empty string
                        t_list[j][key] = value.replace("'", "") 
        except json.JSONDecodeError as e:
            Pred_labels = []
    else:
        Pred_labels = []

    # Replace Pred_labels with t_list
    Pred_labels = t_list
    tn = str(modi)
    tr_list = ast.literal_eval(tn)
    for j in range(len(tr_list)):
        for key, value in tr_list[j].items():
            if isinstance(value, str):
                tr_list[j][key] = value.replace("'", "")

    True_labels =  tr_list  
    #if(i==113):
        #print(True_labels)     
    curr_score = optimal_match_score(True_labels,Pred_labels,"soft")
    curr_score_hard = optimal_match_score(True_labels,Pred_labels,"hard")
    print("curr_score_",curr_score)
    print("curr_score_hard",curr_score_hard)
    total_packets_pred+=max(1,len(Pred_labels))
    total_packets_true+=max(1,len(True_labels))
    total_score +=curr_score
    total_score_hard+=curr_score_hard
    df_soft.loc[len(df_soft.index)] = [curr_score]

    # Append curr_score_hard to df_hard
    df_hard.loc[len(df_hard.index)] = [curr_score_hard]
df_soft.to_csv('soft_scores.csv', index=False)
df_hard.to_csv('hard_scores.csv', index=False)

    
cumulative_score = total_score/total_packets_true
cumulative_score_hard = total_score_hard/total_packets_true
p,r,f = PRF2(cumulative_score,total_packets_true,total_packets_pred)
print("Average Soft-match score: ",cumulative_score)
print("Soft Precision: ",p)
print("Soft Recall: ",r)
print("Soft F1: ",f)

print("----------------------")

p,r,f = PRF2(cumulative_score_hard,total_packets_true,total_packets_pred)
print("Average Hard-match score: ",cumulative_score_hard)
print("Hard Precision: ",p)
print("Hard Recall: ",r)
print("Hard F1: ",f)

ct 0
1
curr_score_ 1
curr_score_hard 1
2
curr_score_ 1.6
curr_score_hard 0
3
curr_score_ 0
curr_score_hard 0
4
curr_score_ 0
curr_score_hard 0
5
curr_score_ 0.6
curr_score_hard 0
6
curr_score_ 1.0
curr_score_hard 1
7
curr_score_ 0
curr_score_hard 0
8
curr_score_ 3.0
curr_score_hard 3
9
curr_score_ 2.0
curr_score_hard 2
10
curr_score_ 1.0
curr_score_hard 1
11
curr_score_ 1
curr_score_hard 1
12
curr_score_ 2.0
curr_score_hard 2
13
curr_score_ 1
curr_score_hard 1
14
curr_score_ 1.0
curr_score_hard 1
15
curr_score_ 1.0
curr_score_hard 1
16
curr_score_ 1.6
curr_score_hard 0
17
curr_score_ 2.8
curr_score_hard 2
18
curr_score_ 0.8
curr_score_hard 0
19
curr_score_ 0.8
curr_score_hard 0
20
curr_score_ 0.4
curr_score_hard 0
21
curr_score_ 2.8000000000000003
curr_score_hard 0
22
curr_score_ 0
curr_score_hard 0
23
curr_score_ 1
curr_score_hard 1
24
curr_score_ 0
curr_score_hard 0
25
curr_score_ 1.0
curr_score_hard 1
26
curr_score_ 2.4000000000000004
curr_score_hard 0
27
curr_score_ 0
curr_score_ha

In [42]:
df_soft

Unnamed: 0,curr_score
0,1.0
1,1.6
2,0.0
3,0.0
4,0.6
...,...
698,1.0
699,4.0
700,0.0
701,1.0


In [44]:
df_hard 


Unnamed: 0,curr_score_hard
0,1
1,0
2,0
3,0
4,0
...,...
698,1
699,4
700,0
701,1


In [77]:
soft=df_soft.iloc[:,0]
hard=df_hard.iloc[:,0]
#soft
hard

0      1
1      0
2      0
3      0
4      0
      ..
698    1
699    4
700    0
701    1
702    1
Name: curr_score_hard, Length: 703, dtype: int64

In [78]:
import pandas as pd

da = pd.read_csv('Event_Extraction_Chatgpt.csv')
da=da.drop(0)
da.to_csv('your_file.csv', index=False)
ga=pd.read_csv('your_file.csv')
#print(article_column)
new_data = pd.DataFrame({
    'article': ga['article'],
    'true_label': ga['true_label'],
    'predicted_label(Chat_gpt)': ga['predicted_label'],
    'Soft_score':soft,
    'Hard_score':hard,
    
})

new_data.to_csv('Soft_Hard_Scores_Chat_gpt.csv', index=False)




In [70]:
len(da)

703

In [79]:
dm=pd.read_csv('Soft_Hard_Scores_Chat_gpt.csv')
dm

Unnamed: 0,article,true_label,predicted_label(Chat_gpt),Soft_score,Hard_score
0,Fire broke out during gas cylinder refilling i...,[],The given text does not contain information ab...,1.0,1
1,Increasing cases of lumpy skin disease in Utta...,"[{'Disease': 'Lumpy skin disease', 'Location':...","[\n{\n""Disease"": ""lumpy skin disease"",\n""Locat...",1.6,0
2,"In Jharkhand's Kasturba School, 28 girls fell ...",[],"[{""Disease"": ""food poisoning"", ""Location"": ""Ka...",0.0,0
3,Aurangabad news : In Aurangabad in two days he...,[],"[\n{\n'Disease': 'Heat stroke',\n'Location': '...",0.0,0
4,Ambikapur News:Four people of the same family ...,"[{'Disease': 'Diarrhea', 'Location': 'Ambikapu...","[{'Disease': 'food poisoning', 'Location': 'Pa...",0.6,0
...,...,...,...,...,...
698,Mumbai Terror Attacks: 14 years of carnage.. C...,[],The provided news article does not include any...,1.0,1
699,Corona cases increased again in Telangana - On...,"[{'Disease': 'Corona', 'Location': 'Telangana'...","[\n{""Disease"": ""Corona"", ""Location"": ""Telangan...",4.0,4
700,"IPL 2022, CSK Vs DC: Corona once again in IPL....",[],"[{""Disease"": ""Covid 19"", ""Location"": ""Delhi"", ...",0.0,0
701,8 laborers died when the truck overturned. Bih...,[],[],1.0,1


# Adding more data  


In [1]:
import pandas as pd
ev=pd.read_csv('Event_1.csv')

In [2]:
ev

Unnamed: 0,article,true_label,cleaned_true_labels,predicted_label
0,"Bus falls off bridge in Jharkhand; 6 killed, s...",[],,The provided news article does not include any...
1,"11 killed, 10 injured as jeep rams into tracto...",[],,[]
2,"Covid-19: India sees drop in daily cases, logs...","[{'Disease': 'Covid-19', 'Location': 'India ',...",,"[\n {\n ""Disease"": ""Covid-19"",\n ""Locat..."
3,,[],,
4,"On Children's Day-eve, top specialist says con...",[],,"[{'Disease': 'congenital heart disease', 'Loca..."
5,Indore Reports 10 Vector Borne Diseases In A W...,"[{'Disease': 'Dengue', 'Location': 'Indore', '...",,"[{\n'Disease': 'dengue and malaria', \n'Locati..."
6,India News | No Serious Health Problems from B...,[],,The given news article does not provide inform...
7,"NYC blaze kills 4, including baby | Urban Lege...",[],,[]
8,"India sees 30% rise in Covid tally with 2,786 ...","[{'Disease': 'Covid ', 'Location': 'India', 'I...",,"[\n{\n""Disease"": ""COVID-19"",\n""Location"": ""Ind..."
9,India News | Uttar Pradesh CM Condoles Deaths ...,[],,[]


In [3]:
ev1=pd.read_csv('Event_Extraction_Chatgpt.csv')
ev1

Unnamed: 0,article,true_label,cleaned_true_labels,predicted_label
0,,[],,
1,Fire broke out during gas cylinder refilling i...,[],,The given text does not contain information ab...
2,Increasing cases of lumpy skin disease in Utta...,"[{'Disease': 'Lumpy skin disease', 'Location':...",,"[\n{\n""Disease"": ""lumpy skin disease"",\n""Locat..."
3,"In Jharkhand's Kasturba School, 28 girls fell ...",[],,"[{""Disease"": ""food poisoning"", ""Location"": ""Ka..."
4,Aurangabad news : In Aurangabad in two days he...,[],,"[\n{\n'Disease': 'Heat stroke',\n'Location': '..."
...,...,...,...,...
699,Mumbai Terror Attacks: 14 years of carnage.. C...,[],,The provided news article does not include any...
700,Corona cases increased again in Telangana - On...,"[{'Disease': 'Corona', 'Location': 'Telangana'...",,"[\n{""Disease"": ""Corona"", ""Location"": ""Telangan..."
701,"IPL 2022, CSK Vs DC: Corona once again in IPL....",[],,"[{""Disease"": ""Covid 19"", ""Location"": ""Delhi"", ..."
702,8 laborers died when the truck overturned. Bih...,[],,[]


In [4]:
ev.columns

Index(['article', 'true_label', 'cleaned_true_labels', 'predicted_label'], dtype='object')

In [5]:
ev1.columns

Index(['article', 'true_label', 'cleaned_true_labels', 'predicted_label'], dtype='object')

In [6]:
result=pd.concat([ev,ev1],axis=0)

In [7]:
result.columns

Index(['article', 'true_label', 'cleaned_true_labels', 'predicted_label'], dtype='object')

In [8]:
result.shape

(753, 4)

In [9]:
result

Unnamed: 0,article,true_label,cleaned_true_labels,predicted_label
0,"Bus falls off bridge in Jharkhand; 6 killed, s...",[],,The provided news article does not include any...
1,"11 killed, 10 injured as jeep rams into tracto...",[],,[]
2,"Covid-19: India sees drop in daily cases, logs...","[{'Disease': 'Covid-19', 'Location': 'India ',...",,"[\n {\n ""Disease"": ""Covid-19"",\n ""Locat..."
3,,[],,
4,"On Children's Day-eve, top specialist says con...",[],,"[{'Disease': 'congenital heart disease', 'Loca..."
...,...,...,...,...
699,Mumbai Terror Attacks: 14 years of carnage.. C...,[],,The provided news article does not include any...
700,Corona cases increased again in Telangana - On...,"[{'Disease': 'Corona', 'Location': 'Telangana'...",,"[\n{""Disease"": ""Corona"", ""Location"": ""Telangan..."
701,"IPL 2022, CSK Vs DC: Corona once again in IPL....",[],,"[{""Disease"": ""Covid 19"", ""Location"": ""Delhi"", ..."
702,8 laborers died when the truck overturned. Bih...,[],,[]


we only want to consider rows which contain true_label

In [10]:
df_fil= result[result['true_label'] != '[]']

In [11]:
df_fil

Unnamed: 0,article,true_label,cleaned_true_labels,predicted_label
2,"Covid-19: India sees drop in daily cases, logs...","[{'Disease': 'Covid-19', 'Location': 'India ',...",,"[\n {\n ""Disease"": ""Covid-19"",\n ""Locat..."
5,Indore Reports 10 Vector Borne Diseases In A W...,"[{'Disease': 'Dengue', 'Location': 'Indore', '...",,"[{\n'Disease': 'dengue and malaria', \n'Locati..."
8,"India sees 30% rise in Covid tally with 2,786 ...","[{'Disease': 'Covid ', 'Location': 'India', 'I...",,"[\n{\n""Disease"": ""COVID-19"",\n""Location"": ""Ind..."
15,"85 New Covid Cases Reported In City, Dist | Va...","[{'Disease': 'Covid', 'Location': 'Vadodara', ...",,"[{'Disease': 'Covid-19', 'Location': 'Vadodara..."
16,COVID-19 cases rise to 98 due to institutional...,"[{'Disease': 'COVID-19', 'Location': 'Chennai ...",,"[\n {\n ""Disease"": ""COVID-19"",\n ..."
...,...,...,...,...
693,Two more foreigners found Covid-19 positive in...,"[{'Disease': 'Covid-19', 'Location': 'Bodh Gay...",,"[{'Disease': 'Covid-19', 'Location': 'Bodh Gay..."
694,corona cases in india today: Increased corona ...,"[{'Disease': 'Corona', 'Location': 'Kerela', '...",,"[\n {\n 'Disease': 'Covid19',\n 'Locati..."
695,telangana corona cases: TS: New cases approach...,"[{'Disease': 'Corona', 'Location': 'Telangana'...",,"[{\n""Disease"": ""Corona"",\n""Location"": ""Telanga..."
696,A thousand cases increased in 24 hours Prajasa...,"[{'Disease': 'Corona', 'Location': 'India', 'I...",,"[{'Disease': 'Corona', 'Location': 'New Delhi'..."


In [12]:
df_fil.reset_index(drop=True, inplace=True)

In [13]:
df_fil

Unnamed: 0,article,true_label,cleaned_true_labels,predicted_label
0,"Covid-19: India sees drop in daily cases, logs...","[{'Disease': 'Covid-19', 'Location': 'India ',...",,"[\n {\n ""Disease"": ""Covid-19"",\n ""Locat..."
1,Indore Reports 10 Vector Borne Diseases In A W...,"[{'Disease': 'Dengue', 'Location': 'Indore', '...",,"[{\n'Disease': 'dengue and malaria', \n'Locati..."
2,"India sees 30% rise in Covid tally with 2,786 ...","[{'Disease': 'Covid ', 'Location': 'India', 'I...",,"[\n{\n""Disease"": ""COVID-19"",\n""Location"": ""Ind..."
3,"85 New Covid Cases Reported In City, Dist | Va...","[{'Disease': 'Covid', 'Location': 'Vadodara', ...",,"[{'Disease': 'Covid-19', 'Location': 'Vadodara..."
4,COVID-19 cases rise to 98 due to institutional...,"[{'Disease': 'COVID-19', 'Location': 'Chennai ...",,"[\n {\n ""Disease"": ""COVID-19"",\n ..."
...,...,...,...,...
386,Two more foreigners found Covid-19 positive in...,"[{'Disease': 'Covid-19', 'Location': 'Bodh Gay...",,"[{'Disease': 'Covid-19', 'Location': 'Bodh Gay..."
387,corona cases in india today: Increased corona ...,"[{'Disease': 'Corona', 'Location': 'Kerela', '...",,"[\n {\n 'Disease': 'Covid19',\n 'Locati..."
388,telangana corona cases: TS: New cases approach...,"[{'Disease': 'Corona', 'Location': 'Telangana'...",,"[{\n""Disease"": ""Corona"",\n""Location"": ""Telanga..."
389,A thousand cases increased in 24 hours Prajasa...,"[{'Disease': 'Corona', 'Location': 'India', 'I...",,"[{'Disease': 'Corona', 'Location': 'New Delhi'..."


In [14]:
df_fil['predicted_label'] = df_fil['predicted_label'].apply(lambda x: str([]) if pd.isna(x) else x)

# Using fillna to replace remaining NaN with an empty list
df_fil['predicted_label'].fillna(str([]), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fil['predicted_label'] = df_fil['predicted_label'].apply(lambda x: str([]) if pd.isna(x) else x)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fil['predicted_label'].fillna(str([]), inplace=True)


In [15]:
df_fil.shape

(391, 4)

In [15]:
#Evaluation Metrics for Event Extraction.
import pandas as pd
from scipy.optimize import linear_sum_assignment
#data = pd.read_csv('Event_Extraction_Chatgpt.csv')
N = len(df_fil)

def optimal_cost(cost_matrix):
    '''Return the cost of optimal matching by using the bipartite matching algorithm (also known as assingment problem). Objective is to maximize the cost'''
    row_ind, col_ind = linear_sum_assignment(cost_matrix,maximize = True)
    return cost_matrix[row_ind, col_ind].sum()

def isinside(a,b):
    if a.find(b)!=-1 or b.find(a)!=-1:
        return 1
    else:
        return 0

def Soft_match(a,b,key_list):
    '''Soft match is the fraction of the number of keys in a predicted event matched with the GT event out of the total keys'''
    score = 0
    for key in key_list:
        if a[key]==None or b[key]==None or len(str(a[key]))==0 or len(str(b[key]))==0:
            if a[key]==b[key]:
                score+=1
        else:
            if isinside(str(a[key]).replace(",","").lower(),str(b[key]).replace(",","").lower())==1:
                score+=1
             
    return score/len(key_list)


def Hard_match(a,b,key_list):
    '''Hard-match returns 1 if all the keys match between predicted and gt events, else return 0'''
    for key in key_list:
        #print("a[key]",a[key])
        #print("b[key]",b[key])
        if a[key]==None or b[key]==None or len(str(a[key]))==0 or len(str(b[key]))==0:
            if a[key]!=b[key]:
                return 0
        else:
            if isinside(str(a[key]).replace(",","").lower(),str(b[key]).replace(",","").lower())==0:
                return 0
    return 1
def optimal_match_score(N,K,type="soft"):
    '''Master function to get score of optimal-matching between N GT evens and K predicted events.'''
    # print(N,K)
    if N==None or K==None or len(N)==0 or len(K)==0:
        if N==K:
            return 1
        else:
            return 0
    key_list = ["Disease","Location","Incident","Incident_Type","Number"]
    #{'Disease': 'Lumpy skin disease', '': 'Uttarakhand', 'Incident (case or death)': 'case', 'Incident Type (new or total)': 'new', 'Number': '19404'}, {'Disease': 'Lumpy skin disease', 'Location': 'Uttarkhand', 'Incident (case or death)': 'death', 'Incident Type (new or total)': 'total', 'Number': '321'}]"
    cost_matrix = []
    for i in N:
        if type=="soft":
            temp = [Soft_match(i,j,key_list) for j in K]
        if type=="hard":
            temp = [Hard_match(i,j,key_list) for j in K]
        # print(temp)
        cost_matrix.append(temp)
    cost_matrix = np.array(cost_matrix)
    cost = optimal_cost(cost_matrix)
    return cost
# Evaluation Metric Precision, Recall and F1
def PRF1(score,true,pred):
    precision = score*(min(true,pred))/pred
    recall = score*(min(true,pred))/true
    F1 = (2*precision*recall)/(precision+recall)
    return precision,recall,F1

def PRF2(score,true,pred):
    '''Return precision, recall and F1. 
    1) precision = (number of true events)*score/(number of predicted events)
    2) Recall = score
    3) F1 = 2*P*R/(P+R)'''
    precision = score*(true)/pred
    recall = score
    F1 = (2*precision*recall)/(precision+recall)
    return precision,recall,F1

In [17]:
import json
import ast
import numpy as np
import pandas as pd



total_score = 0
total_num_packets = 0
all_labels = []
total_packets_pred = 0
total_packets_true = 0
total_score_hard = 0
t_list = []
tr_list = []
ct = df_fil['predicted_label'].isna().sum()
df_soft = pd.DataFrame(columns=['curr_score'])
df_hard = pd.DataFrame(columns=['curr_score_hard'])
N = len(df_fil)
city_df=pd.read_csv('city.csv')
def replace_location(label, city_df):
    for item in label:
        if 'Location' in item and item['Location'] is not None:
            location_value = item['Location']
           
            for index, row in city_df.iterrows():
                
                if pd.notna(row['variants']):
                   
                    variants = row['variants']
                    if isinstance(variants, str) and any(variant.strip().lower() == location_value.lower() for variant in variants.split('|')):
                        
                        item['Location'] = row['value']
    return label



for i in range(N):
    print(i)
    print(df_fil['true_label'][i])

    modi = (df_fil['true_label'][i].replace("Incident (case or death)", "Incident").replace("Incident Type (new or total)", "Incident_Type"))

    if type(df_fil['predicted_label'][i]) == float:
        print(df_fil['predicted_label'][i])

    predicted_label_value = (df_fil['predicted_label'][i]).replace("\n", "").replace("\'", "\"")

    if predicted_label_value.strip() and "The given text does not contain information about any disease, therefore it does not fit the required format for extraction." not in predicted_label_value:
        try:
            Pred_labels = json.loads(predicted_label_value)
            t = str(Pred_labels)
            t_list = ast.literal_eval(t)

           
            for j in range(len(t_list)):
                for key, value in t_list[j].items():
                    # Check if the value is a string
                    if isinstance(value, str):
                        
                        t_list[j][key] = value.replace("'", "")
            
           
            Pred_labels = replace_location(t_list, city_df)

        except json.JSONDecodeError as e:
            Pred_labels = []
    else:
        Pred_labels = []

    
    Pred_labels = t_list
    
    tn = str(modi)
    tr_list = ast.literal_eval(tn)
    
    
    True_labels = replace_location(tr_list, city_df)
    
    curr_score = optimal_match_score(True_labels, Pred_labels, "soft")
    curr_score_hard = optimal_match_score(True_labels, Pred_labels, "hard")
    print("curr_score_", curr_score)
    print("curr_score_hard", curr_score_hard)
    total_packets_pred += max(1, len(Pred_labels))
    total_packets_true += max(1, len(True_labels))
    total_score += curr_score
    total_score_hard += curr_score_hard
    df_soft.loc[len(df_soft.index)] = [curr_score]

   
    df_hard.loc[len(df_hard.index)] = [curr_score_hard]

df_soft.to_csv('soft_scores.csv', index=False)
df_hard.to_csv('hard_scores.csv', index=False)

cumulative_score = total_score / total_packets_true
cumulative_score_hard = total_score_hard / total_packets_true
p, r, f = PRF2(cumulative_score, total_packets_true, total_packets_pred)
print("Average Soft-match score: ", cumulative_score)
print("Soft Precision: ", p)
print("Soft Recall: ", r)
print("Soft F1: ", f)

print("----------------------")

p, r, f = PRF2(cumulative_score_hard, total_packets_true, total_packets_pred)
print("Average Hard-match score: ", cumulative_score_hard)
print("Hard Precision: ", p)
print("Hard Recall: ", r)
print("Hard F1: ", f)


0
[{'Disease': 'Covid-19', 'Location': 'India ', 'Incident (case or death)': 'case', 'Incident Type (new or total)': 'new', 'Number': '112'}, {'Disease': 'Covid-19', 'Location': 'India ', 'Incident (case or death)': 'death', 'Incident Type (new or total)': 'new', 'Number': '3'}, {'Disease': 'Covid-19', 'Location': 'India', 'Incident (case or death)': 'death', 'Incident Type (new or total)': 'total', 'Number': '5,30,677'}, {'Disease': 'Covid-19', 'Location': 'Kerala', 'Incident (case or death)': 'death', 'Incident Type (new or total)': 'new', 'Number': '2'}, {'Disease': 'Covid-19 ', 'Location': 'Maharashtra', 'Incident (case or death)': 'death', 'Incident Type (new or total)': 'new', 'Number': '1'}]
curr_score_ 4.0
curr_score_hard 4
1
[{'Disease': 'Dengue', 'Location': 'Indore', 'Incident (case or death)': 'case', 'Incident Type (new or total)': 'new', 'Number': '10'}, {'Disease': 'Malaria', 'Location': 'Indore', 'Incident (case or death)': 'case', 'Incident Type (new or total)': 'new',

In [18]:
df_soft

Unnamed: 0,curr_score
0,4.0
1,1.0
2,3.6
3,2.0
4,2.4
...,...
386,1.0
387,2.8
388,1.0
389,3.0


In [19]:
df_hard

Unnamed: 0,curr_score_hard
0,4
1,1
2,0
3,2
4,0
...,...
386,1
387,0
388,1
389,0


In [21]:
df_fil

Unnamed: 0,article,true_label,cleaned_true_labels,predicted_label
0,"Covid-19: India sees drop in daily cases, logs...","[{'Disease': 'Covid-19', 'Location': 'India ',...",,"[\n {\n ""Disease"": ""Covid-19"",\n ""Locat..."
1,Indore Reports 10 Vector Borne Diseases In A W...,"[{'Disease': 'Dengue', 'Location': 'Indore', '...",,"[{\n'Disease': 'dengue and malaria', \n'Locati..."
2,"India sees 30% rise in Covid tally with 2,786 ...","[{'Disease': 'Covid ', 'Location': 'India', 'I...",,"[\n{\n""Disease"": ""COVID-19"",\n""Location"": ""Ind..."
3,"85 New Covid Cases Reported In City, Dist | Va...","[{'Disease': 'Covid', 'Location': 'Vadodara', ...",,"[{'Disease': 'Covid-19', 'Location': 'Vadodara..."
4,COVID-19 cases rise to 98 due to institutional...,"[{'Disease': 'COVID-19', 'Location': 'Chennai ...",,"[\n {\n ""Disease"": ""COVID-19"",\n ..."
...,...,...,...,...
386,Two more foreigners found Covid-19 positive in...,"[{'Disease': 'Covid-19', 'Location': 'Bodh Gay...",,"[{'Disease': 'Covid-19', 'Location': 'Bodh Gay..."
387,corona cases in india today: Increased corona ...,"[{'Disease': 'Corona', 'Location': 'Kerela', '...",,"[\n {\n 'Disease': 'Covid19',\n 'Locati..."
388,telangana corona cases: TS: New cases approach...,"[{'Disease': 'Corona', 'Location': 'Telangana'...",,"[{\n""Disease"": ""Corona"",\n""Location"": ""Telanga..."
389,A thousand cases increased in 24 hours Prajasa...,"[{'Disease': 'Corona', 'Location': 'India', 'I...",,"[{'Disease': 'Corona', 'Location': 'New Delhi'..."


In [22]:
soft=df_soft.iloc[:,0]
hard=df_hard.iloc[:,0]

In [23]:
soft

0      4.0
1      1.0
2      3.6
3      2.0
4      2.4
      ... 
386    1.0
387    2.8
388    1.0
389    3.0
390    3.8
Name: curr_score, Length: 391, dtype: float64

In [24]:
result

Unnamed: 0,article,true_label,cleaned_true_labels,predicted_label
0,"Bus falls off bridge in Jharkhand; 6 killed, s...",[],,The provided news article does not include any...
1,"11 killed, 10 injured as jeep rams into tracto...",[],,[]
2,"Covid-19: India sees drop in daily cases, logs...","[{'Disease': 'Covid-19', 'Location': 'India ',...",,"[\n {\n ""Disease"": ""Covid-19"",\n ""Locat..."
3,,[],,
4,"On Children's Day-eve, top specialist says con...",[],,"[{'Disease': 'congenital heart disease', 'Loca..."
...,...,...,...,...
699,Mumbai Terror Attacks: 14 years of carnage.. C...,[],,The provided news article does not include any...
700,Corona cases increased again in Telangana - On...,"[{'Disease': 'Corona', 'Location': 'Telangana'...",,"[\n{""Disease"": ""Corona"", ""Location"": ""Telangan..."
701,"IPL 2022, CSK Vs DC: Corona once again in IPL....",[],,"[{""Disease"": ""Covid 19"", ""Location"": ""Delhi"", ..."
702,8 laborers died when the truck overturned. Bih...,[],,[]


In [25]:
import pandas as pd


new_data = pd.DataFrame({
    'article': df_fil['article'],
    'true_label': df_fil['true_label'],
    'predicted_label(Chat_gpt)': df_fil['predicted_label'],
    'Soft_score':soft,
    'Hard_score':hard,
    
})

new_data.to_csv('Soft_Hard_Scores_Chat_gpt_1.csv', index=False)

In [29]:
data=pd.read_csv('Soft_Hard_Scores_Chat_gpt_1.csv')
data

Unnamed: 0,article,true_label,predicted_label(Chat_gpt),Soft_score,Hard_score
0,"Covid-19: India sees drop in daily cases, logs...","[{'Disease': 'Covid-19', 'Location': 'India ',...","[\n {\n ""Disease"": ""Covid-19"",\n ""Locat...",4.0,4
1,Indore Reports 10 Vector Borne Diseases In A W...,"[{'Disease': 'Dengue', 'Location': 'Indore', '...","[{\n'Disease': 'dengue and malaria', \n'Locati...",1.0,1
2,"India sees 30% rise in Covid tally with 2,786 ...","[{'Disease': 'Covid ', 'Location': 'India', 'I...","[\n{\n""Disease"": ""COVID-19"",\n""Location"": ""Ind...",3.6,0
3,"85 New Covid Cases Reported In City, Dist | Va...","[{'Disease': 'Covid', 'Location': 'Vadodara', ...","[{'Disease': 'Covid-19', 'Location': 'Vadodara...",2.0,2
4,COVID-19 cases rise to 98 due to institutional...,"[{'Disease': 'COVID-19', 'Location': 'Chennai ...","[\n {\n ""Disease"": ""COVID-19"",\n ...",2.4,0
...,...,...,...,...,...
386,Two more foreigners found Covid-19 positive in...,"[{'Disease': 'Covid-19', 'Location': 'Bodh Gay...","[{'Disease': 'Covid-19', 'Location': 'Bodh Gay...",1.0,1
387,corona cases in india today: Increased corona ...,"[{'Disease': 'Corona', 'Location': 'Kerela', '...","[\n {\n 'Disease': 'Covid19',\n 'Locati...",2.8,0
388,telangana corona cases: TS: New cases approach...,"[{'Disease': 'Corona', 'Location': 'Telangana'...","[{\n""Disease"": ""Corona"",\n""Location"": ""Telanga...",1.0,1
389,A thousand cases increased in 24 hours Prajasa...,"[{'Disease': 'Corona', 'Location': 'India', 'I...","[{'Disease': 'Corona', 'Location': 'New Delhi'...",3.0,0


Created disease.csv and applied on this.

In [18]:
import json
import ast
import numpy as np
import pandas as pd



total_score = 0
total_num_packets = 0
all_labels = []
total_packets_pred = 0
total_packets_true = 0
total_score_hard = 0
t_list = []
tr_list = []
ct = df_fil['predicted_label'].isna().sum()
df_soft = pd.DataFrame(columns=['curr_score'])
df_hard = pd.DataFrame(columns=['curr_score_hard'])
df_pred = pd.DataFrame(columns=['Pred'])
df_true = pd.DataFrame(columns=['True'])
N = len(df_fil)
city_df=pd.read_csv('city.csv')
disease_df = pd.read_csv('disease.csv')
def replace_location(label, city_df):
    for item in label:
        if 'Location' in item and item['Location'] is not None:
            location_value = item['Location']
           
            for index, row in city_df.iterrows():
                
                if pd.notna(row['variants']):
                   
                    variants = row['variants']
                    if isinstance(variants, str) and any(variant.strip().lower() == location_value.strip().lower() for variant in variants.split('|')):
                        
                        item['Location'] = row['value']
    return label
def replace_disease(label, disease_df):
    for item in label:
        if 'Disease' in item and item['Disease'] is not None:
            disease_value = item['Disease']
            for index, row in disease_df.iterrows():
                if pd.notna(row['variants']):
                    variants = row['variants']
                    if isinstance(variants, str) and any(variant.strip().lower() == disease_value.strip().lower() for variant in variants.split('|')):
                        item['Disease'] = row['value']
                        #print("Done",item['Disease'])
    return label




for i in range(N):
    if(i%10)==0:
        print(i)
    #print(df_fil['article'][i])


    modi = (df_fil['true_label'][i].replace("Incident (case or death)", "Incident").replace("Incident Type (new or total)", "Incident_Type"))

    if type(df_fil['predicted_label'][i]) == float:
        print(df_fil['predicted_label'][i])

    predicted_label_value = (df_fil['predicted_label'][i]).replace("\n", "").replace("\'", "\"").replace('unknown','').replace('Unspecified','')
    

    if predicted_label_value.strip() and "The given text does not contain information about any disease, therefore it does not fit the required format for extraction." not in predicted_label_value:
        try:
            Pred_labels = json.loads(predicted_label_value)
            t = str(Pred_labels)
            t_list = ast.literal_eval(t)

           
            for j in range(len(t_list)):
                for key, value in t_list[j].items():
                    # Check if the value is a string
                    if isinstance(value, str):
                        
                        t_list[j][key] = value.replace("'", "")
            
           
            Pred_labels = replace_location(t_list, city_df)

        except json.JSONDecodeError as e:
            Pred_labels = []
    else:
        Pred_labels = []

    
   
    
    tn = str(modi)
    tr_list = ast.literal_eval(tn)
    
    
    True_labels = replace_location(tr_list, city_df)
    tn1=str(True_labels)
    tr1_list=ast.literal_eval(tn1)
    True_labels = replace_disease(tr1_list, disease_df)
    
    tp1=str(Pred_labels)
    tp1_list=ast.literal_eval(tp1)
    Pred_labels=replace_disease(tp1_list,disease_df)
    
    #print("true_label: ",True_labels)
    #print("Pred_label: ",Pred_labels)
    curr_score = optimal_match_score(True_labels, Pred_labels, "soft")
    curr_score_hard = optimal_match_score(True_labels, Pred_labels, "hard")
    #print("curr_score_", curr_score)
    #print("curr_score_hard", curr_score_hard)
    total_packets_pred += max(1, len(Pred_labels))
    total_packets_true += max(1, len(True_labels))
    total_score += curr_score
    total_score_hard += curr_score_hard
    df_soft.loc[len(df_soft.index)] = [curr_score]

   
    df_hard.loc[len(df_hard.index)] = [curr_score_hard]
    df_pred.loc[len(df_pred.index)] = [Pred_labels]
    df_true.loc[len(df_true.index)] = [True_labels]

df_soft.to_csv('soft_scores.csv', index=False)
df_hard.to_csv('hard_scores.csv', index=False)

cumulative_score = total_score / total_packets_true
cumulative_score_hard = total_score_hard / total_packets_true
p, r, f = PRF2(cumulative_score, total_packets_true, total_packets_pred)
print("Average Soft-match score: ", cumulative_score)
print("Soft Precision: ", p)
print("Soft Recall: ", r)
print("Soft F1: ", f)

print("----------------------")

p, r, f = PRF2(cumulative_score_hard, total_packets_true, total_packets_pred)
print("Average Hard-match score: ", cumulative_score_hard)
print("Hard Precision: ", p)
print("Hard Recall: ", r)
print("Hard F1: ", f)


0


10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
Average Soft-match score:  0.8441281138790027
Soft Precision:  0.7094715852442665
Soft Recall:  0.8441281138790027
Soft F1:  0.7709642470205842
----------------------
Average Hard-match score:  0.5860023724792408
Hard Precision:  0.49252243270189433
Hard Recall:  0.5860023724792408
Hard F1:  0.5352112676056339


In [19]:
df_pred

Unnamed: 0,Pred
0,"[{'Disease': 'corona', 'Location': 'India', 'I..."
1,"[{'Disease': 'dengue and malaria', 'Location':..."
2,"[{'Disease': 'corona', 'Location': 'India', 'I..."
3,"[{'Disease': 'corona', 'Location': 'Vadodara',..."
4,"[{'Disease': 'corona', 'Location': 'Chennai', ..."
...,...
386,"[{'Disease': 'corona', 'Location': 'Bodh Gaya'..."
387,"[{'Disease': 'corona', 'Location': 'India', 'I..."
388,"[{'Disease': 'corona', 'Location': 'Telangana'..."
389,"[{'Disease': 'corona', 'Location': 'New Delhi'..."


In [20]:
df_true

Unnamed: 0,True
0,"[{'Disease': 'corona', 'Location': 'India ', '..."
1,"[{'Disease': 'Dengue', 'Location': 'Indore', '..."
2,"[{'Disease': 'corona', 'Location': 'India', 'I..."
3,"[{'Disease': 'corona', 'Location': 'Vadodara',..."
4,"[{'Disease': 'corona', 'Location': 'Chennai', ..."
...,...
386,"[{'Disease': 'corona', 'Location': 'Bodh Gaya'..."
387,"[{'Disease': 'corona', 'Location': 'Kerela', '..."
388,"[{'Disease': 'corona', 'Location': 'Telangana'..."
389,"[{'Disease': 'corona', 'Location': 'India', 'I..."


In [23]:
soft=df_soft.iloc[:,0]
hard=df_hard.iloc[:,0]
pred=df_pred.iloc[:,0]
true=df_true.iloc[:,0]

In [24]:
pred

0      [{'Disease': 'corona', 'Location': 'India', 'I...
1      [{'Disease': 'dengue and malaria', 'Location':...
2      [{'Disease': 'corona', 'Location': 'India', 'I...
3      [{'Disease': 'corona', 'Location': 'Vadodara',...
4      [{'Disease': 'corona', 'Location': 'Chennai', ...
                             ...                        
386    [{'Disease': 'corona', 'Location': 'Bodh Gaya'...
387    [{'Disease': 'corona', 'Location': 'India', 'I...
388    [{'Disease': 'corona', 'Location': 'Telangana'...
389    [{'Disease': 'corona', 'Location': 'New Delhi'...
390    [{'Disease': 'corona', 'Location': 'Telangana'...
Name: Pred, Length: 391, dtype: object

In [25]:
true

0      [{'Disease': 'corona', 'Location': 'India ', '...
1      [{'Disease': 'Dengue', 'Location': 'Indore', '...
2      [{'Disease': 'corona', 'Location': 'India', 'I...
3      [{'Disease': 'corona', 'Location': 'Vadodara',...
4      [{'Disease': 'corona', 'Location': 'Chennai', ...
                             ...                        
386    [{'Disease': 'corona', 'Location': 'Bodh Gaya'...
387    [{'Disease': 'corona', 'Location': 'Kerela', '...
388    [{'Disease': 'corona', 'Location': 'Telangana'...
389    [{'Disease': 'corona', 'Location': 'India', 'I...
390    [{'Disease': 'corona', 'Location': 'Telangana'...
Name: True, Length: 391, dtype: object

In [26]:
import pandas as pd


new_data = pd.DataFrame({
    'article': df_fil['article'],
    'true_label': true,
    'predicted_label(Chat_gpt)': pred,
    'Soft_score':soft,
    'Hard_score':hard,
    
})

new_data.to_csv('Soft_Hard_Scores_Chat_gpt_final.csv', index=False)

In [27]:
data=pd.read_csv('Soft_Hard_Scores_Chat_gpt_final.csv')
data

Unnamed: 0,article,true_label,predicted_label(Chat_gpt),Soft_score,Hard_score
0,"Covid-19: India sees drop in daily cases, logs...","[{'Disease': 'corona', 'Location': 'India ', '...","[{'Disease': 'corona', 'Location': 'India', 'I...",4.0,4
1,Indore Reports 10 Vector Borne Diseases In A W...,"[{'Disease': 'Dengue', 'Location': 'Indore', '...","[{'Disease': 'dengue and malaria', 'Location':...",1.0,1
2,"India sees 30% rise in Covid tally with 2,786 ...","[{'Disease': 'corona', 'Location': 'India', 'I...","[{'Disease': 'corona', 'Location': 'India', 'I...",4.4,4
3,"85 New Covid Cases Reported In City, Dist | Va...","[{'Disease': 'corona', 'Location': 'Vadodara',...","[{'Disease': 'corona', 'Location': 'Vadodara',...",2.0,2
4,COVID-19 cases rise to 98 due to institutional...,"[{'Disease': 'corona', 'Location': 'Chennai', ...","[{'Disease': 'corona', 'Location': 'Chennai', ...",3.0,3
...,...,...,...,...,...
386,Two more foreigners found Covid-19 positive in...,"[{'Disease': 'corona', 'Location': 'Bodh Gaya'...","[{'Disease': 'corona', 'Location': 'Bodh Gaya'...",1.0,1
387,corona cases in india today: Increased corona ...,"[{'Disease': 'corona', 'Location': 'Kerela', '...","[{'Disease': 'corona', 'Location': 'India', 'I...",3.6,2
388,telangana corona cases: TS: New cases approach...,"[{'Disease': 'corona', 'Location': 'Telangana'...","[{'Disease': 'corona', 'Location': 'Telangana'...",1.0,1
389,A thousand cases increased in 24 hours Prajasa...,"[{'Disease': 'corona', 'Location': 'India', 'I...","[{'Disease': 'corona', 'Location': 'New Delhi'...",3.0,0
