In [1]:
# Imports:
import pandas as pd

In [2]:
# Read the combined SPA callback records from CSV into a DataFrame.
# NOTE(review): the rendered preview of this frame shows free-text notes that
# include names and phone numbers - consider clearing the output before sharing.
df = pd.read_csv(filepath_or_buffer='Combined_SPA_Callback.csv')
df

Unnamed: 0,Date,ID,Text
0,26-Jul-17,1.010040e+11,please call daughter Dianne Thomas 07920 07565...
1,28-Jul-17,1.010040e+11,please speak to sister - Alyson Powell on abpv...
2,28-Jul-17,1.010040e+11,KELLY HAS RUNG - HER PARENTS ARE DUE TO HAVE W...
3,31-Jul-17,1.010040e+11,"Mark Hitchings, Scheme Manager Swn Yr Afon con..."
4,28-Jul-17,1.010040e+11,SON BRIAN HASFORD RUN G- HE IS RESIDENT IN AUS...
...,...,...,...
11353,13-Jun-23,1.010000e+11,"Enquiring about a downstairs toilet, states he..."
11354,13-Jun-23,1.010000e+11,Phoning on behalf on her mother she has no fee...
11355,13-Jun-23,1.010000e+11,States on saturday they had to call an ambulan...
11356,13-Jun-23,1.010000e+11,Nadia Obaji daughter of Abdel Obaji contacted ...


In [3]:
# Load the result data:
# Each Model_Results_Urgent{i}.csv (i = 1..4) is read and reduced to the rows
# whose "Result" column equals "Entailment". The filtered frames are stored in
# df_dict under the keys "df_urgent1" .. "df_urgent4".
df_dict = {}

for i in range(1,5):
    df_name = "df_urgent{}".format(i)
    # Forward slash instead of a backslash: "\M" is an invalid escape sequence
    # (newer Python versions warn about it) and a backslash separator only
    # resolves on Windows, whereas "/" works on every platform.
    df_value = pd.read_csv('Huggingface_Results/Model_Results_Urgent{}.csv'.format(i))
    # Create df of entailments (boolean filtering keeps each row's index label):
    df_value = df_value[df_value["Result"]=="Entailment"]
    df_dict[df_name] = df_value

In [4]:
# Report how many entailment rows each result set holds (used to pick the top 3):
for name in df_dict:
    print(name, len(df_dict[name]))

df_urgent1 343
df_urgent2 55
df_urgent3 393
df_urgent4 1460


In [5]:
# Index labels of every row in the original df:
idx0 = set(df.index)

# Index labels flagged by each of the three chosen result sets (1, 3 and 4):
idx1, idx2, idx3 = (
    set(df_dict[key].index) for key in ("df_urgent1", "df_urgent3", "df_urgent4")
)

# Building blocks: flagged by all three / by two or more / by at least one.
in_all_three = idx1.intersection(idx2, idx3)
in_two_or_more = (idx1 & idx2) | (idx2 & idx3) | (idx1 & idx3)
in_any = idx1.union(idx2, idx3)

# Flagged by all three sets:
common_3 = list(in_all_three)

# Flagged by exactly two sets:
common_2 = list(in_two_or_more - in_all_three)

# Flagged by exactly one set:
common_1 = list(in_any - in_two_or_more)

# Flagged by none of the sets:
uncommon = list(idx0 - in_any)

bucket_sizes = (len(common_3), len(common_2), len(common_1), len(uncommon))
print("Urgent in 3: {}\nUrgent in 2: {}\nUrgent in 1: {}\nUrgent in None: {}".format(*bucket_sizes))

Urgent in 3: 217
Urgent in 2: 211
Urgent in 1: 1123
Urgent in None: 9807


In [6]:
import random

# Selecting 25 data points at random from each common list:
# random.sample draws WITHOUT replacement, so a row can appear at most once per
# group (random.choices samples with replacement and may repeat rows). It raises
# ValueError if a list holds fewer than no_of_examples entries.
# A dedicated seeded generator makes the draw repeatable after a kernel restart.
no_of_examples = 25
sampler = random.Random(42)
random_uncommon = sampler.sample(uncommon, k=no_of_examples)
random_common_1 = sampler.sample(common_1, k=no_of_examples)
random_common_2 = sampler.sample(common_2, k=no_of_examples)
random_common_3 = sampler.sample(common_3, k=no_of_examples)

### Preparing the testcases:

In [7]:
# Creating sample df and adding Urgent column to test results:

def _labelled_sample(index_labels, urgent_votes):
    """Return the "Text" column of the selected df rows plus a constant "Urgent" column.

    index_labels: index labels of df identifying the rows to select.
    urgent_votes: value written to "Urgent" for every selected row
        (0-3, the number of result sets that flagged these rows).
    """
    # .loc rather than .iloc: the sampled values originate from DataFrame
    # indexes, i.e. they are labels, not positions. The two only coincide
    # while df keeps its default 0..n-1 index.
    sample = df.loc[index_labels, ["Text"]].copy()
    sample["Urgent"] = urgent_votes
    return sample

df_random_uncommon = _labelled_sample(random_uncommon, 0)
df_random_common_1 = _labelled_sample(random_common_1, 1)
df_random_common_2 = _labelled_sample(random_common_2, 2)
df_random_common_3 = _labelled_sample(random_common_3, 3)

In [8]:
# Stack the four labelled samples, keeping each row's original index label:
sample_frames = [
    df_random_uncommon,
    df_random_common_1,
    df_random_common_2,
    df_random_common_3,
]
testing_df = pd.concat(sample_frames)

# Shuffle all rows; the fixed random_state makes the shuffle repeatable:
randomized_results_df = testing_df.sample(frac=1, random_state=42)

# Persist the labelled rows (index column included) for future use:
randomized_results_df.to_csv("urgency_results.csv")

In [9]:
# Create test by removing results:
randomized_test_df = randomized_results_df.copy()
# Every row gets the same placeholder label so the file does not reveal the
# stored results; the person taking the test edits this column.
randomized_test_df["Urgent"] = 1
try:
    # mode="x" is exclusive creation: the write fails if the file already
    # exists. "urgency_test.csv" is also the file read back as the completed
    # test, so overwriting it on a re-run would wipe the entered answers.
    randomized_test_df.to_csv("urgency_test.csv", index=True, mode="x")
except FileExistsError:
    print("urgency_test.csv already exists - left untouched. Delete it to generate a fresh blank test.")

### Getting Results:

In [10]:
# Read the completed test back in, using the first CSV column as the index:
completed_test_df = pd.read_csv(filepath_or_buffer="urgency_test.csv", index_col=0)
completed_test_df

Unnamed: 0,Text,Urgent
10450,TEL CALL STEP DAUGHTER - VERY WORRIED ABOUT M...,1
4375,would like a grabrail asap please as strugglin...,1
6032,Tel call neighbour Mrs Rowlands (wants to rema...,1
631,call from sister Angela would like to have a s...,1
10259,she came out of hospital sat but is finding it...,1
...,...,...
8533,STATES HER HUSBAND HAD A FULL KNEE REPLACEMENT...,1
4527,BATHING ASSESMENT NEEDED,1
3154,"prescription needs to be picked up today, self...",1
8747,HOMECARE/MOBILITY ASSESSMENT REQUIRED URGENTLY...,1


In [11]:
# Display the shuffled, labelled results held in memory ("Urgent" is 0-3 here),
# for comparison with the completed test frame.
randomized_results_df

Unnamed: 0,Text,Urgent
10450,TEL CALL STEP DAUGHTER - VERY WORRIED ABOUT M...,3
4375,would like a grabrail asap please as strugglin...,2
6032,Tel call neighbour Mrs Rowlands (wants to rema...,2
631,call from sister Angela would like to have a s...,1
10259,she came out of hospital sat but is finding it...,1
...,...,...
8533,STATES HER HUSBAND HAD A FULL KNEE REPLACEMENT...,2
4527,BATHING ASSESMENT NEEDED,2
3154,"prescription needs to be picked up today, self...",0
8747,HOMECARE/MOBILITY ASSESSMENT REQUIRED URGENTLY...,3


In [12]:
# Comparing results to test:
# Rows are paired by position, so first make sure both frames list the same
# samples in the same order. Otherwise the counts below would silently compare
# unrelated rows (or drop trailing rows if the lengths differ).
if not completed_test_df.index.equals(randomized_results_df.index):
    raise ValueError(
        "completed_test_df and randomized_results_df do not contain the same rows "
        "in the same order; the completed test does not belong to these results."
    )

test_labels = completed_test_df["Urgent"].to_numpy()
gt_labels = randomized_results_df["Urgent"].to_numpy()

# The test marks a row as not urgent with 0; any other value counts as urgent.
marked_not_urgent = test_labels == 0

# Rows marked not urgent although 1, 2 or 3 result sets flagged them:
wrong_1 = int((marked_not_urgent & (gt_labels == 1)).sum())
wrong_2 = int((marked_not_urgent & (gt_labels == 2)).sum())
wrong_3 = int((marked_not_urgent & (gt_labels == 3)).sum())

# Rows marked urgent although no result set flagged them:
wrong_0 = int((~marked_not_urgent & (gt_labels == 0)).sum())

In [13]:
# Confusion Matrix:
# "Positive" = stored Urgent label other than 0; "negative" = stored label 0.
FN = wrong_1 + wrong_2 + wrong_3
FP = wrong_0
# Class sizes are counted from the stored labels instead of being hard-coded
# as 75/25, so the metrics stay correct if the sample size changes.
positives = int((randomized_results_df["Urgent"] != 0).sum())
negatives = len(randomized_results_df) - positives
TP = positives-FN
TN = negatives-FP

# Calculating Metrics:
# Each ratio falls back to 0.0 when its denominator is zero instead of raising
# ZeroDivisionError (e.g. when no row was marked urgent).
total = positives + negatives
accuracy = 100*(TP+TN)/total if total else 0.0
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
f1_score = (2*precision*recall)/(precision+recall) if (precision+recall) else 0.0

print("Accuracy: {}%".format(accuracy))
print("Precision: {}".format(precision))
print("Recall: {}".format(recall))
print("F1_score: {}".format(f1_score))

Accuracy: 75%
Precision: 0.75
Recall: 1.0
F1_score: 0.8571428571428571
