In [1]:
# Imports:
import pandas as pd
import re
import torch

In [2]:
# Load TE-for-Event-Extraction model:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("veronica320/TE-for-Event-Extraction")
model = AutoModelForSequenceClassification.from_pretrained("veronica320/TE-for-Event-Extraction")

In [3]:
# Load the data:
df = pd.read_csv('Combined_SPA_Callback.csv')
df

Unnamed: 0,Date,ID,Text
0,26-Jul-17,1.010040e+11,please call daughter Dianne Thomas 07920 07565...
1,28-Jul-17,1.010040e+11,please speak to sister - Alyson Powell on abpv...
2,28-Jul-17,1.010040e+11,KELLY HAS RUNG - HER PARENTS ARE DUE TO HAVE W...
3,31-Jul-17,1.010040e+11,"Mark Hitchings, Scheme Manager Swn Yr Afon con..."
4,28-Jul-17,1.010040e+11,SON BRIAN HASFORD RUN G- HE IS RESIDENT IN AUS...
...,...,...,...
11353,13-Jun-23,1.010000e+11,"Enquiring about a downstairs toilet, states he..."
11354,13-Jun-23,1.010000e+11,Phoning on behalf on her mother she has no fee...
11355,13-Jun-23,1.010000e+11,States on saturday they had to call an ambulan...
11356,13-Jun-23,1.010000e+11,Nadia Obaji daughter of Abdel Obaji contacted ...


### Cleaning Data:

In [4]:
# Checking if a sentence is in caps:
def calculate_capital_percentage(string):
    total_letters = len(re.sub(r'[^a-zA-Z]', '', string))
    capital_letters = len(re.sub(r'[^A-Z]', '', string))
    capital_percentage = (capital_letters / total_letters) * 100
    return capital_percentage

# Function to clean text for NER:
def clean_text_for_NER(text):
    # Remove start/trailing spaces:
    text = text.strip()
    # Remove newline characters:
    text = re.sub(r'[\n\r]+', '. ',text)
    # Only keep Alphabets, Digits, Spaces, and Commonly Used Punctuations:
    text = re.sub(r'[^a-zA-Z0-9\s?,:"!.\']', '', text)
    # Remove extra spaces:
    text = re.sub(r'\s{2,}', ' ', text)
    # Join digits together:
    text = re.sub('(?<=\d) (?=\d)', '', text)
    text = text.strip()
    if calculate_capital_percentage(text) > 65:
        text = text.lower()
    # Capitalize the first character:
    text = text.capitalize()
    return text

# Creating a list of sentences:
df["Text"] = df["Text"].apply(clean_text_for_NER)

### Running the model:

In [5]:
second_text_list = ["Urgent",
                    "Urgent social work needed",
                    "Immediate Urgency", 
                    "Some Urgency needed",
                    "Alcohol Involved",
                    "Injury",
                    "Old Age person involved",
                    "Someone is bedridden"]

In [24]:
# This cell takes ~ 2 hours to run:

# Combining the 2 Events to input into model:
first_text = list(df["Text"])
second_text = second_text_list[7]
combined_text = []
for text in first_text:
    text = text + ". " + second_text
    combined_text.append(text)

# Chunking the data into smaller batches:
batch_size = 256
batch_texts = []
for i in range(0, len(combined_text), batch_size):
    batch_texts.append(combined_text[i : i + batch_size])

logits = []
counter = 0
for input_text in batch_texts:
    counter += len(input_text)
    # Tokenizing the text:
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=150)
    # Running inputs through the model:
    with torch.no_grad():
        batch_outputs = model(**inputs)
    logits.append(batch_outputs.logits)
    print("{}/{} inputs done.".format(counter,len(first_text)),end='\r')

# Concatenate lists of logits together:
outputs = {"logits":torch.cat(logits)}

11358/11358 inputs done.

In [25]:
# Get the probabilities for all classes:
probabilities = torch.softmax(outputs["logits"], dim=1).tolist()

# Get the top result:
top_results = torch.argmax(outputs["logits"], dim=1).tolist()

# Create a dictionary to map the labels:
label_map = {0: 'Contradiction', 1: 'Neutral', 2: 'Entailment'}

# Round up the results:
rounded_probs = []
for i in range(len(combined_text)):
    rounded_probs.append([round(prob, 4) for prob in probabilities[i]])

# Combine the predicted label and probabilities into a dictionary:
results_list=[]
for i in range(len(combined_text)):
    results_list.append({label_map[j]: prob for j, prob in enumerate(rounded_probs[i])})
    results_list[i]["Result"] = label_map[top_results[i]]

# Print the result as a dataframe:
results_df = pd.DataFrame(results_list)
results_df

Unnamed: 0,Contradiction,Neutral,Entailment,Result
0,1.0000,0.0000,0.0000,Contradiction
1,0.9999,0.0000,0.0000,Contradiction
2,0.9999,0.0000,0.0001,Contradiction
3,0.9999,0.0000,0.0001,Contradiction
4,0.9996,0.0003,0.0002,Contradiction
...,...,...,...,...
11353,0.9998,0.0000,0.0002,Contradiction
11354,0.0778,0.0007,0.9214,Entailment
11355,0.9998,0.0001,0.0001,Contradiction
11356,0.9997,0.0001,0.0002,Contradiction


In [26]:
final_df = pd.concat([df,results_df], axis=1)
final_df.to_csv("Model_Results_Cleaned_Bed.csv", index=False)
final_df

Unnamed: 0,Date,ID,Text,Contradiction,Neutral,Entailment,Result
0,26-Jul-17,1.010040e+11,Please call daughter dianne thomas 07920075650...,1.0000,0.0000,0.0000,Contradiction
1,28-Jul-17,1.010040e+11,Please speak to sister alyson powell on abpve ...,0.9999,0.0000,0.0000,Contradiction
2,28-Jul-17,1.010040e+11,Kelly has rung her parents are due to have wor...,0.9999,0.0000,0.0001,Contradiction
3,31-Jul-17,1.010040e+11,"Mark hitchings, scheme manager swn yr afon con...",0.9999,0.0000,0.0001,Contradiction
4,28-Jul-17,1.010040e+11,Son brian hasford run g he is resident in aust...,0.9996,0.0003,0.0002,Contradiction
...,...,...,...,...,...,...,...
11353,13-Jun-23,1.010000e+11,"Enquiring about a downstairs toilet, states he...",0.9998,0.0000,0.0002,Contradiction
11354,13-Jun-23,1.010000e+11,Phoning on behalf on her mother she has no fee...,0.0778,0.0007,0.9214,Entailment
11355,13-Jun-23,1.010000e+11,States on saturday they had to call an ambulan...,0.9998,0.0001,0.0001,Contradiction
11356,13-Jun-23,1.010000e+11,Nadia obaji daughter of abdel obaji contacted ...,0.9997,0.0001,0.0002,Contradiction
