In [None]:
import os
import openai
import pandas as pd

# Configure the OpenAI client from the OPENAI_API_KEY environment variable.
# A missing variable raises KeyError here rather than failing later at request time.
openai.api_key = api_key = os.environ['OPENAI_API_KEY']

# Function to generate predictions using GPT-4 Chat API
def generate_predictions(content):
    """Ask the GPT-4 chat endpoint to extract disease events from one article.

    Args:
        content: The article text (a string). It is sent first, followed by
            the extraction instructions.

    Returns:
        The text of the first choice in the completion, exactly as the model
        returned it (it is not parsed or validated here).
    """
    instructions = (
        "Given a news article, please extract relevant events in the form of dictionaries. "
        "Each event should include keys for 'Disease', 'Location,' 'Incident' (either 'case' or 'death'), "
        "'Incident_Type' (either 'new' or 'total'), and 'Number.' "
        "If the 'Disease' key is not present in an event, do not include the event in the result. "
        "Additionally, please make sure that no duplicate events are included in the list. "
        "Provide the extracted events as a list of dictionaries. "
        "If no events are extracted, the result should be an empty list."
    )
    # A blank line separates the article from the instructions; without it the
    # last word of the article runs straight into "Given a news article...".
    completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "user", "content": content + "\n\n" + instructions}]
    )
    return completion.choices[0].message['content']

# Read your DataFrame
data = pd.read_csv('output.csv')

# Open a text file for writing predictions. The encoding is pinned to UTF-8 so
# that non-ASCII characters in a prediction are written the same way on every
# platform instead of depending on the locale's default codec.
with open('event.txt', 'w', encoding='utf-8') as txt_file:
    # Iterate over rows and generate predictions
    for index, row in data.iterrows():
        trans_article = row['article']
        if pd.notna(trans_article):  # Skip NaN values
            prediction = generate_predictions(trans_article)
            data.at[index, 'predicted_label'] = prediction
            # Write the prediction to the text file
            txt_file.write(prediction)
        # One newline is written per input row, including rows skipped above.
        txt_file.write('\n')

# Save the modified DataFrame to a new CSV file
data.to_csv('Event_Extraction_Chatgpt.csv', index=False)

In [14]:
import pandas as pd
# Reload the saved predictions and check how the ground-truth column was
# read back: the cell's value is the Python type of the entry at label 1.
data = pd.read_csv('Event_Extraction_Chatgpt.csv')
type(data.loc[1, 'true_label'])

str

In [15]:
# Display `data` (assigned in an earlier cell) using the notebook's rich repr.
data

Unnamed: 0,article,true_label,cleaned_true_labels,predicted_label
0,,[],,
1,Fire broke out during gas cylinder refilling i...,[],,The given text does not contain information ab...
2,Increasing cases of lumpy skin disease in Utta...,"[{'Disease': 'Lumpy skin disease', 'Location':...",,"[\n{\n""Disease"": ""lumpy skin disease"",\n""Locat..."
3,"In Jharkhand's Kasturba School, 28 girls fell ...",[],,"[{""Disease"": ""food poisoning"", ""Location"": ""Ka..."
4,Aurangabad news : In Aurangabad in two days he...,[],,"[\n{\n'Disease': 'Heat stroke',\n'Location': '..."
...,...,...,...,...
699,Mumbai Terror Attacks: 14 years of carnage.. C...,[],,The provided news article does not include any...
700,Corona cases increased again in Telangana - On...,"[{'Disease': 'Corona', 'Location': 'Telangana'...",,"[\n{""Disease"": ""Corona"", ""Location"": ""Telangan..."
701,"IPL 2022, CSK Vs DC: Corona once again in IPL....",[],,"[{""Disease"": ""Covid 19"", ""Location"": ""Delhi"", ..."
702,8 laborers died when the truck overturned. Bih...,[],,[]


In [20]:
sa = pd.read_csv('Event_Extraction_Chatgpt.csv')

# Rows without a prediction are read back as NaN; replace them with the string
# "[]" so every row holds text. The result is assigned back to the column
# rather than using inplace=True on a column selection, which is chained
# assignment and does not update the frame under pandas Copy-on-Write.
sa['predicted_label'] = sa['predicted_label'].fillna(str([]))

In [3]:
#Evaluation Metrics for Event Extraction.
import numpy as np
import pandas as pd
from scipy.optimize import linear_sum_assignment
# Load the saved predictions; N is the number of rows in that file.
data = pd.read_csv('Event_Extraction_Chatgpt.csv')
N = len(data)

def optimal_cost(cost_matrix):
    '''Return the total score of the best one-to-one pairing of rows with columns.

    The pairing is found by solving the assignment problem (bipartite matching)
    with the objective of maximising the summed entries. `cost_matrix` is
    indexed with index arrays, so it must support NumPy-style fancy indexing.
    '''
    assignment = linear_sum_assignment(cost_matrix, maximize=True)
    matched_rows, matched_cols = assignment
    return cost_matrix[matched_rows, matched_cols].sum()

def isinside(a,b):
    '''Return 1 if either string occurs inside the other (or they are equal), else 0.'''
    return int(b in a or a in b)

def Soft_match(a,b,key_list):
    '''Return the fraction of keys in `key_list` on which events `a` and `b` agree.

    For each key:
      * if either value is missing, None, or has an empty string form, the
        key counts only when the two values are equal (e.g. both None);
      * otherwise both values are converted to strings, commas are removed,
        the text is lower-cased, and the key counts when one string contains
        the other (see `isinside`).

    A key that is absent from an event is treated like a None value instead of
    raising KeyError, so an event missing a field can still be scored.

    Args:
        a, b: event dicts to compare.
        key_list: keys to compare; must be non-empty (its length is the divisor).

    Returns:
        matched_keys / len(key_list), a float in [0, 1].
    '''
    score = 0
    for key in key_list:
        left, right = a.get(key), b.get(key)
        if left is None or right is None or len(str(left))==0 or len(str(right))==0:
            if left==right:
                score+=1
        elif isinside(str(left).replace(",","").lower(),str(right).replace(",","").lower())==1:
            score+=1

    return score/len(key_list)


def Hard_match(a,b,key_list):
    '''Return 1 if events `a` and `b` agree on every key in `key_list`, else 0.

    Per-key agreement uses the same rule as the soft score: when either value
    is missing, None, or has an empty string form, the two values must be
    equal; otherwise the comma-stripped, lower-cased string forms must contain
    one another (see `isinside`).

    A key that is absent from an event is treated like a None value instead of
    raising KeyError.
    '''
    for key in key_list:
        left, right = a.get(key), b.get(key)
        if left is None or right is None or len(str(left))==0 or len(str(right))==0:
            if left!=right:
                return 0
        elif isinside(str(left).replace(",","").lower(),str(right).replace(",","").lower())==0:
            return 0
    return 1
def optimal_match_score(N,K,type="soft"):
    '''Score the best one-to-one matching between GT events `N` and predicted events `K`.

    Args:
        N: list of ground-truth event dicts (or None).
        K: list of predicted event dicts (or None).
        type: "soft" to score pairs with Soft_match, "hard" to score them
            with Hard_match.

    Returns:
        If either side is None or empty: 1 when N == K (e.g. both are empty
        lists), otherwise 0. In every other case, the summed pair scores of
        the optimal assignment as computed by `optimal_cost`; the sum is not
        normalised by the number of events.

    Raises:
        ValueError: if `type` is neither "soft" nor "hard" and both lists are
            non-empty.
    '''
    if N is None or K is None or len(N)==0 or len(K)==0:
        return 1 if N==K else 0

    matchers = {"soft": Soft_match, "hard": Hard_match}
    if type not in matchers:
        # Previously an unknown mode left the row buffer unbound and failed
        # with an unrelated-looking UnboundLocalError.
        raise ValueError("type must be 'soft' or 'hard', got {!r}".format(type))
    match = matchers[type]

    # Keys compared for every event pair.
    key_list = ["Disease","Location","Incident","Incident_Type","Number"]

    # Rows are GT events, columns are predicted events.
    cost_matrix = np.array([[match(gt_event, pred_event, key_list) for pred_event in K] for gt_event in N])
    return optimal_cost(cost_matrix)
# Evaluation Metric Precision, Recall and F1
def PRF1(score,true,pred):
    '''Return (precision, recall, F1) from a match score and event counts.

    precision = score * min(true, pred) / pred
    recall    = score * min(true, pred) / true
    F1        = 2 * precision * recall / (precision + recall)

    Any ratio whose denominator is zero is reported as 0.0 instead of raising
    ZeroDivisionError (for example F1 when the score is 0).

    Args:
        score: match score.
        true: number of ground-truth events.
        pred: number of predicted events.
    '''
    matched = score*(min(true,pred))
    precision = matched/pred if pred else 0.0
    recall = matched/true if true else 0.0
    denominator = precision+recall
    F1 = (2*precision*recall)/denominator if denominator else 0.0
    return precision,recall,F1

def PRF2(score,true,pred):
    '''Return precision, recall and F1.

    1) precision = (number of true events)*score/(number of predicted events)
    2) Recall = score
    3) F1 = 2*P*R/(P+R)

    Precision is reported as 0.0 when `pred` is 0, and F1 as 0.0 when
    precision + recall is 0, instead of raising ZeroDivisionError.
    '''
    precision = score*(true)/pred if pred else 0.0
    recall = score
    denominator = precision+recall
    F1 = (2*precision*recall)/denominator if denominator else 0.0
    return precision,recall,F1

In [111]:
# Print the ground-truth and predicted labels stored at positional row 481.
print(data['true_label'].iloc[481])
print(data['predicted_label'].iloc[481])


[{'Disease': 'H3N2', 'Location': "J'khand", 'Incident (case or death)': 'case', 'Incident Type (new or total)': 'new', 'Number': '1'}, {'Disease': 'COVID-19', 'Location': "J'khand", 'Incident (case or death)': 'case', 'Incident Type (new or total)': 'new', 'Number': '5'}]
[
  {
    "Disease": "H3N2",
    "Location": "J'khand",
    "Incident": "case",
    "Incident_Type": "new",
    "Number": 1
  },
  {
    "Disease": "COVID-19",
    "Location": "J'khand",
    "Incident": "case",
    "Incident_Type": "new",
    "Number": 5
  }
]


In [25]:
we = pd.read_csv('Event_Extraction_Chatgpt.csv')

# Rows without a prediction are read back as NaN; replace them with the string
# "[]" so every row holds text. The result is assigned back to the column
# rather than using inplace=True on a column selection, which is chained
# assignment and does not update the frame under pandas Copy-on-Write.
we['predicted_label'] = we['predicted_label'].fillna(str([]))


In [41]:
import json
import ast
import numpy as np
total_score = 0
total_num_packets = 0
all_labels = []
total_packets_pred = 0
total_packets_true = 0
total_score_hard = 0
soft_scores = []
hard_scores = []
ct = we['predicted_label'].isna().sum()


def _strip_apostrophes(events):
    '''Remove every single quote from the string values of each event dict (in place) and return the list.'''
    for event in events:
        for key, value in event.items():
            if isinstance(value, str):
                event[key] = value.replace("'", "")
    return events


def _parse_predicted_events(raw):
    '''Turn one model reply into a list of event dicts; return [] when no list can be parsed.

    Newlines are removed first. The text is then tried, in order, as JSON, as
    a Python literal, and finally as JSON after swapping single quotes for
    double quotes. Trying the untouched text first keeps valid JSON that
    contains an apostrophe inside a value (e.g. "J'khand") parseable; swapping
    quotes up front would break such replies. A single dict is wrapped in a
    list, and non-dict items are discarded.
    '''
    if not isinstance(raw, str):
        return []
    text = raw.replace("\n", "").strip()
    if not text:
        return []
    attempts = (
        (json.loads, text),
        (ast.literal_eval, text),
        (json.loads, text.replace("'", "\"")),
    )
    for parse, candidate in attempts:
        try:
            parsed = parse(candidate)
            break
        except (ValueError, SyntaxError, TypeError):
            continue
    else:
        # Free-text replies such as "The given text does not contain ..." end up here.
        return []
    if isinstance(parsed, dict):
        parsed = [parsed]
    if not isinstance(parsed, list):
        return []
    return _strip_apostrophes([event for event in parsed if isinstance(event, dict)])


print("ct",ct)
# Row 0 is skipped, as before. The bound comes from the frame that is actually
# indexed here rather than from a row count computed in another cell.
for i in range(1, len(we)):
    print(i)

    # Ground-truth labels use longer key names; rename them to the keys used for scoring.
    true_text = (we['true_label'][i].replace("Incident (case or death)","Incident").replace("Incident Type (new or total)","Incident_Type"))
    True_labels = _strip_apostrophes(ast.literal_eval(true_text))

    # Parsed fresh for every row: a reply that cannot be parsed yields [] and
    # is never scored against events left over from a previous row.
    Pred_labels = _parse_predicted_events(we['predicted_label'][i])

    curr_score = optimal_match_score(True_labels,Pred_labels,"soft")
    curr_score_hard = optimal_match_score(True_labels,Pred_labels,"hard")
    print("curr_score_",curr_score)
    print("curr_score_hard",curr_score_hard)
    # An empty side still counts as one packet so the totals never reach zero.
    total_packets_pred+=max(1,len(Pred_labels))
    total_packets_true+=max(1,len(True_labels))
    total_score +=curr_score
    total_score_hard+=curr_score_hard
    soft_scores.append(curr_score)
    hard_scores.append(curr_score_hard)

# Build each score frame once instead of growing it one row per iteration.
df_soft = pd.DataFrame({'curr_score': soft_scores})
df_hard = pd.DataFrame({'curr_score_hard': hard_scores})
df_soft.to_csv('soft_scores.csv', index=False)
df_hard.to_csv('hard_scores.csv', index=False)


cumulative_score = total_score/total_packets_true
cumulative_score_hard = total_score_hard/total_packets_true
p,r,f = PRF2(cumulative_score,total_packets_true,total_packets_pred)
print("Average Soft-match score: ",cumulative_score)
print("Soft Precision: ",p)
print("Soft Recall: ",r)
print("Soft F1: ",f)

print("----------------------")

p,r,f = PRF2(cumulative_score_hard,total_packets_true,total_packets_pred)
print("Average Hard-match score: ",cumulative_score_hard)
print("Hard Precision: ",p)
print("Hard Recall: ",r)
print("Hard F1: ",f)

ct 0
1
curr_score_ 1
curr_score_hard 1
2
curr_score_ 1.6
curr_score_hard 0
3
curr_score_ 0
curr_score_hard 0
4
curr_score_ 0
curr_score_hard 0
5
curr_score_ 0.6
curr_score_hard 0
6
curr_score_ 1.0
curr_score_hard 1
7
curr_score_ 0
curr_score_hard 0
8
curr_score_ 3.0
curr_score_hard 3
9
curr_score_ 2.0
curr_score_hard 2
10
curr_score_ 1.0
curr_score_hard 1
11
curr_score_ 1
curr_score_hard 1
12
curr_score_ 2.0
curr_score_hard 2
13
curr_score_ 1
curr_score_hard 1
14
curr_score_ 1.0
curr_score_hard 1
15
curr_score_ 1.0
curr_score_hard 1
16
curr_score_ 1.6
curr_score_hard 0
17
curr_score_ 2.8
curr_score_hard 2
18
curr_score_ 0.8
curr_score_hard 0
19
curr_score_ 0.8
curr_score_hard 0
20
curr_score_ 0.4
curr_score_hard 0
21
curr_score_ 2.8000000000000003
curr_score_hard 0
22
curr_score_ 0
curr_score_hard 0
23
curr_score_ 1
curr_score_hard 1
24
curr_score_ 0
curr_score_hard 0
25
curr_score_ 1.0
curr_score_hard 1
26
curr_score_ 2.4000000000000004
curr_score_hard 0
27
curr_score_ 0
curr_score_ha

In [42]:
# Display the per-row soft-match scores collected by the evaluation loop.
df_soft

Unnamed: 0,curr_score
0,1.0
1,1.6
2,0.0
3,0.0
4,0.6
...,...
698,1.0
699,4.0
700,0.0
701,1.0


In [44]:
# Display the per-row hard-match scores collected by the evaluation loop.
df_hard 


Unnamed: 0,curr_score_hard
0,1
1,0
2,0
3,0
4,0
...,...
698,1
699,4
700,0
701,1


In [77]:
# Take the first column of each score frame as a Series, then display the
# hard-match Series as the cell's output.
soft, hard = (frame.iloc[:, 0] for frame in (df_soft, df_hard))
hard

0      1
1      0
2      0
3      0
4      0
      ..
698    1
699    4
700    0
701    1
702    1
Name: curr_score_hard, Length: 703, dtype: int64

In [78]:
import pandas as pd

# Drop row 0, then write the remainder to 'your_file.csv' and read it back so
# the re-read frame `ga` has a fresh default index starting at 0.
da = pd.read_csv('Event_Extraction_Chatgpt.csv').drop(0)
da.to_csv('your_file.csv', index=False)
ga = pd.read_csv('your_file.csv')

# Put the articles and labels next to the per-row scores. The columns are
# combined by index label, so `soft` and `hard` (from an earlier cell) are
# expected to share `ga`'s 0-based index.
new_data = pd.DataFrame(
    {
        'article': ga['article'],
        'true_label': ga['true_label'],
        'predicted_label(Chat_gpt)': ga['predicted_label'],
        'Soft_score': soft,
        'Hard_score': hard,
    }
)

new_data.to_csv('Soft_Hard_Scores_Chat_gpt.csv', index=False)




In [70]:
# Row count of `da` (assigned in another cell).
len(da)

703

In [79]:
# Read the combined article/label/score table back from disk and display it.
dm=pd.read_csv('Soft_Hard_Scores_Chat_gpt.csv')
dm

Unnamed: 0,article,true_label,predicted_label(Chat_gpt),Soft_score,Hard_score
0,Fire broke out during gas cylinder refilling i...,[],The given text does not contain information ab...,1.0,1
1,Increasing cases of lumpy skin disease in Utta...,"[{'Disease': 'Lumpy skin disease', 'Location':...","[\n{\n""Disease"": ""lumpy skin disease"",\n""Locat...",1.6,0
2,"In Jharkhand's Kasturba School, 28 girls fell ...",[],"[{""Disease"": ""food poisoning"", ""Location"": ""Ka...",0.0,0
3,Aurangabad news : In Aurangabad in two days he...,[],"[\n{\n'Disease': 'Heat stroke',\n'Location': '...",0.0,0
4,Ambikapur News:Four people of the same family ...,"[{'Disease': 'Diarrhea', 'Location': 'Ambikapu...","[{'Disease': 'food poisoning', 'Location': 'Pa...",0.6,0
...,...,...,...,...,...
698,Mumbai Terror Attacks: 14 years of carnage.. C...,[],The provided news article does not include any...,1.0,1
699,Corona cases increased again in Telangana - On...,"[{'Disease': 'Corona', 'Location': 'Telangana'...","[\n{""Disease"": ""Corona"", ""Location"": ""Telangan...",4.0,4
700,"IPL 2022, CSK Vs DC: Corona once again in IPL....",[],"[{""Disease"": ""Covid 19"", ""Location"": ""Delhi"", ...",0.0,0
701,8 laborers died when the truck overturned. Bih...,[],[],1.0,1
