# GPT-3.5 Model

### Importing libraries

In [None]:
import pandas as pd
import ast
import re
import random
import json
import csv

from openai import OpenAI
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import nltk
from nltk.tokenize import word_tokenize

from fuzzywuzzy import fuzz


### Load and split data

In [None]:
# Load the dataset; later cells read its 'Cleaned Text' and 'True Organization' columns.
df = pd.read_csv('final_data.csv')

In [None]:
# Hold out 30% of the rows as the test set; the fixed random_state keeps the split reproducible.
train, test = train_test_split(df, test_size=0.3, random_state=42)

print("Train set size:", len(train))
print("Test set size:", len(test))

### Convert to GPT format

In [None]:
def get_labeled_format(df):
    """Build spaCy-style span annotations for every organization mention.

    For each row, all case-sensitive occurrences of the row's
    'True Organization' value are located in its 'Cleaned Text' value.

    Args:
        df: DataFrame with 'Cleaned Text' and 'True Organization' columns.

    Returns:
        A list with one ``[text, {'entities': [(start, end, 'ORG'), ...]}]``
        entry per row, in row order. ``entities`` is empty when nothing
        matched, or when the text or the name is not a usable string
        (e.g. NaN or blank).
    """
    labeled_data = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Creating training data"):
        text = row['Cleaned Text']
        org_name = row['True Organization']

        entities = []

        # Missing values (NaN) would make re.escape / finditer raise, and a blank
        # name would match at every word boundary, so such rows get no entities.
        if isinstance(text, str) and isinstance(org_name, str) and org_name.strip():
            # `\b` only expresses "whole word" next to a word character. Next to
            # punctuation (e.g. a name ending in ".") it would instead require an
            # adjacent word character and miss ordinary mentions, so the anchor
            # is only added on edges where the name starts/ends with a word char.
            left_anchor = r"\b" if re.match(r"\w", org_name[0]) else ""
            right_anchor = r"\b" if re.match(r"\w", org_name[-1]) else ""

            # escape special characters in the organization name
            pattern = re.compile(f"{left_anchor}{re.escape(org_name)}{right_anchor}")

            # record the character span of every match
            entities = [
                (match.start(), match.end(), 'ORG')
                for match in pattern.finditer(text)
            ]

        labeled_data.append([text, {'entities': entities}])
    return labeled_data


In [None]:
# Annotate both splits with the character spans of the organization name.
train_data_labeled = get_labeled_format(train)
test_data_labeled = get_labeled_format(test)
# Only the training split is filtered: rows without any matched span are dropped.
# Test rows without a match are kept.
train_data_labeled = [entry for entry in train_data_labeled if len(entry[1]['entities']) > 0]

In [None]:
def get_gpt_format(data):
    """Convert labeled entries into OpenAI chat fine-tuning examples.

    Args:
        data: Iterable of ``[text, {'entities': [(start, end, label), ...]}]``
            entries, as produced by ``get_labeled_format``.

    Returns:
        A list with one ``{"messages": [system, user, assistant]}`` dict per
        entry, in input order. The user content is the text prefixed with
        "JAARVERSLAG: " (English: "ANNUAL REPORT: "). The assistant content is
        the text covered by the entry's last entity span, or an empty string
        when the entry has no entities.
    """
    # English: From the following ANNUAL REPORT, give the full name of the
    # organization, including words such as 'stichting' (foundation) and
    # 'gemeente' (municipality). If the text contains a written-out version of
    # the organization name, give that instead of the abbreviation. Answer with
    # only the full organization name in lowercase.
    system_prompt = "Geef van het volgende JAARVERSLAG de volledige naam van de organisatie, inclusief woorden zoals 'stichting' en 'gemeente'. Als er in de tekst een uitgeschreven versie van de organisatienaam staat, geef dan die in plaats van de afkorting. Antwoord met alleen de volledige organisatienaam in kleine letters."

    formatted_data = []

    for entry in data:
        text = entry[0]
        entities = entry[1]['entities']

        # Reset the name for every entry: an entry without entities must not
        # inherit the previous entry's name (or crash when it is the first one).
        org_name = ""
        if entities:
            start_index, end_index, _entity_type = entities[-1]
            org_name = text[start_index:end_index]

        content = f"JAARVERSLAG: {text}"  # English: f"ANNUAL REPORT: {text}"
        formatted_data.append({
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": content},
                {"role": "assistant", "content": org_name},
            ]
        })

    return formatted_data

In [None]:
# Convert both labeled splits into chat-message examples.
train_data = get_gpt_format(train_data_labeled)
test_final = get_gpt_format(test_data_labeled)

# Preview the first ten training examples.
train_data[:10]

### Connect OpenAI

In [None]:
# Create the OpenAI client used by every API call in this notebook.
# 'YOURAPIKEY' is a placeholder: substitute a real key before running and do not
# commit it. NOTE(review): the openai client can also pick the key up from the
# OPENAI_API_KEY environment variable — consider that instead of a literal.
client = OpenAI(
  api_key= 'YOURAPIKEY',
)

### Function to create output

In [None]:
def create_output(test_data, finetuned_id):
    """Query a chat model for the organization name of every test item.

    Args:
        test_data: Iterable of ``{"messages": [...]}`` dicts; the first message
            with role 'user' supplies the report text.
        finetuned_id: Model name passed to the chat completions endpoint (a base
            model name or a fine-tuned model id).

    Returns:
        A list of ``(user_content, prediction)`` tuples in input order, where
        ``user_content`` is the user message exactly as stored in ``test_data``
        and ``prediction`` is the lowercased model answer, or the sentinel
        'No prediction' when the request failed.

    Raises:
        StopIteration: If an item contains no message with role 'user'.
    """
    prefix = "JAARVERSLAG: "  # English: "ANNUAL REPORT: "

    # English: From the following ANNUAL REPORT, give the full name of the
    # organization, including words such as 'stichting' (foundation) and
    # 'gemeente' (municipality). If the text contains a written-out version of
    # the organization name, give that instead of the abbreviation. Answer with
    # only the full organization name in lowercase.
    system_prompt = "Geef van het volgende JAARVERSLAG de volledige naam van de organisatie, inclusief woorden zoals 'stichting' en 'gemeente'. Als er in de tekst een uitgeschreven versie van de organisatienaam staat, geef dan die in plaats van de afkorting. Antwoord met alleen de volledige organisatienaam in kleine letters."

    output = []

    for i, item in enumerate(tqdm(test_data, desc="Generating output"), start=1):
        user_message = next(msg for msg in item['messages'] if msg['role'] == 'user')
        report = user_message['content']

        # Messages built by get_gpt_format already carry the prefix; adding it
        # again would send "JAARVERSLAG: JAARVERSLAG: ..." and make the prompt
        # differ from the format used in the fine-tuning examples.
        content = report if report.startswith(prefix) else f"{prefix}{report}"

        try:
            # generate completion using the requested model
            completion = client.chat.completions.create(
                model=finetuned_id,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": content}
                ]
            )

            # extract the predicted organization name from the completion
            predicted_org = completion.choices[0].message.content.lower()
        except Exception as e:
            # Best effort: report the failure and keep a placeholder so the
            # output stays aligned one-to-one with test_data.
            print(f"Error processing item {i}: {e}")
            output.append((report, 'No prediction'))
        else:
            output.append((report, predicted_org))

    return output


## Base Model

### Predictions

In [None]:
# Baseline: query the base gpt-3.5-turbo model (no fine-tuning) on the test set.
test_output = create_output(test_final, "gpt-3.5-turbo")

### Evaluation

In [None]:
def accuracy_base(test, test_output):
    """Return the exact-or-substring match accuracy as a formatted string.

    A prediction counts as correct when, ignoring case, it equals the true
    organization name, contains it, or is contained in it. Empty predictions
    are never correct. Items whose text is not found in ``test`` are not
    correct either, but still count towards the total.

    Args:
        test: DataFrame with 'Cleaned Text' and 'True Organization' columns.
        test_output: Sequence of ``(text, organization_name)`` tuples, where
            ``text`` starts with the 13-character "JAARVERSLAG: " prefix.

    Returns:
        A string such as ``"Accuracy: 87.50%"``; 0.00% for empty output.
    """
    prefix_len = len("JAARVERSLAG: ")
    true_org_dict = test.set_index('Cleaned Text')['True Organization'].to_dict()

    correct_predictions = 0
    total_predictions = len(test_output)

    for text, organization_name in test_output:
        # strip the prompt prefix to recover the dataframe's text key
        report_text = text[prefix_len:]
        if report_text not in true_org_dict:
            continue

        true_organization_name = str(true_org_dict[report_text])
        print(f'True: {true_organization_name}, Pred: {organization_name}')

        truth = true_organization_name.lower()
        predicted = organization_name.lower()

        # An empty string is a substring of every string, so without this guard
        # a blank prediction would be scored as a match.
        if not predicted.strip():
            continue

        # equality is covered by the containment checks
        if truth in predicted or predicted in truth:
            correct_predictions += 1

    # same empty-input guard as fuzzy_accuracy
    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0

    return f"Accuracy: {accuracy * 100:.2f}%"

In [None]:
def fuzzy_accuracy(test, test_output):
    """Return the fuzzy-match accuracy as a formatted percentage string.

    Each prediction is compared with the true organization name using
    ``fuzz.partial_ratio`` on lowercased, stripped strings; a score of 80 or
    more counts as correct. Items whose text is not found in ``test`` are not
    scored but still count towards the total.

    Args:
        test: DataFrame with 'Cleaned Text' and 'True Organization' columns.
        test_output: Sequence of ``(text, organization_name)`` tuples; the
            first 13 characters of ``text`` are dropped before the lookup.

    Returns:
        A string such as ``"Accuracy: 89.59%"``; 0.00% for empty output.
    """
    lookup = test.set_index('Cleaned Text')['True Organization'].to_dict()

    n_total = len(test_output)
    n_correct = 0

    for text, organization_name in test_output:
        key = text[13:]
        if key not in lookup:
            # text is not in the dataframe: nothing to compare against
            continue

        gold = str(lookup[key])
        print(f'True: {gold}, Pred: {organization_name}')

        # compare case-insensitively and without surrounding whitespace
        score = fuzz.partial_ratio(
            organization_name.lower().strip(),
            gold.lower().strip(),
        )

        # similarity threshold for a correct prediction
        if score >= 80:
            n_correct += 1

    if n_total > 0:
        accuracy = n_correct / n_total
    else:
        accuracy = 0
    return f"Accuracy: {accuracy * 100:.2f}%"

In [None]:
# Print both scores explicitly: a notebook cell only displays the value of its
# last expression, so a bare accuracy_base(...) call on the first line would
# have its result discarded.
print(accuracy_base(test, test_output))
print(fuzzy_accuracy(test, test_output))

#### Precision & recall

In [None]:
# Binary "a name was predicted" flags, one per item and in the order of test_output.
total_predictions = len(test_output)

# create_output() stores the sentinel 'No prediction' when the API call failed.
# That is not a model answer, so it is flagged 0 just like an empty string.
# Real answers are lowercased by create_output(), so they can never equal the
# capitalised sentinel.
prediction_list = [
    1 if organization_name.strip() and organization_name != 'No prediction' else 0
    for text, organization_name in test_output
]

print("Predictions:", prediction_list)


Predictions: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [None]:
# Ground-truth flags, aligned by position with test_output: 1 if the organization
# name is present in the text, 0 otherwise.
# See the calculation of `actuals` in the current_method notebook.
actuals = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [None]:
def calculate_precision_recall(predictions, actuals):
    """Compute precision and recall from two parallel lists of 0/1 flags.

    Args:
        predictions: Predicted flags (1 = something was predicted).
        actuals: Ground-truth flags (1 = an organization is present).

    Returns:
        A ``(precision, recall)`` tuple. Either value is 0 when its
        denominator is zero. Pairs beyond the shorter list are ignored.
    """
    true_pos = 0
    false_pos = 0
    false_neg = 0

    # tally the confusion-matrix cells in a single pass over the pairs
    for truth, guess in zip(actuals, predictions):
        if truth == 1 and guess == 1:
            true_pos += 1
        elif truth == 0 and guess == 1:
            false_pos += 1
        elif truth == 1 and guess == 0:
            false_neg += 1

    predicted_positive = true_pos + false_pos
    actual_positive = true_pos + false_neg

    if predicted_positive > 0:
        precision = true_pos / predicted_positive
    else:
        precision = 0

    if actual_positive > 0:
        recall = true_pos / actual_positive
    else:
        recall = 0

    return precision, recall

In [None]:
# Precision/recall of the base ("pretr") model, from the prediction flags and the `actuals` flags.
precision_gpt, recall_gpt = calculate_precision_recall(prediction_list, actuals)

print(f"Precision GPT pretr: {precision_gpt:.2f}")
print(f"Recall GPT pretr: {recall_gpt:.2f}")

#### Document-level accuracy
For comparison between models with paired t-tests. See usage in current_method notebook.

In [None]:
# Per-document correctness flags (1 = correct, 0 = incorrect), using the same
# exact-or-substring rule as accuracy_base.
true_org_dict = test.set_index('Cleaned Text')['True Organization'].to_dict()

doc_acc = []
total_predictions = len(test_output)

# iterate over test_output and check if the text and organization name match the dataframe
for text, organization_name in test_output:
    # drop the 13-character "JAARVERSLAG: " prefix to recover the dataframe key
    report_text = text[13:]

    # NOTE: items whose text is not in the dataframe get no flag at all, so
    # doc_acc can be shorter than test_output.
    if report_text not in true_org_dict:
        continue

    true_organization_name = str(true_org_dict[report_text]).lower()
    predicted_name = organization_name.lower()

    # An empty string is a substring of every string, so a blank prediction
    # must be rejected before the containment checks (which also cover equality).
    is_match = bool(predicted_name.strip()) and (
        true_organization_name in predicted_name
        or predicted_name in true_organization_name
    )
    doc_acc.append(1 if is_match else 0)

print(doc_acc)

[1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1]


## Finetuned Model

#### Create a randomly selected training set for fine-tuning GPT

In [None]:
def create_random_training_input(n, input_file, seed=None):
    """Draw ``n`` distinct examples at random from ``input_file``.

    Args:
        n: Number of examples to draw.
        input_file: Sequence of examples to sample from.
        seed: Optional seed. When given, the same seed always yields the same
            sample and the global ``random`` state is left untouched. When
            omitted, the module-level generator is used.

    Returns:
        A new list with ``n`` elements of ``input_file``.

    Raises:
        ValueError: If ``n`` is larger than ``len(input_file)`` or negative.
    """
    if seed is None:
        return random.sample(input_file, n)
    # A private generator keeps the draw reproducible without reseeding globals.
    return random.Random(seed).sample(input_file, n)

# Sample 100 training examples for fine-tuning. random.sample raises ValueError
# if train_data holds fewer than 100 entries.
train_final = create_random_training_input(100, train_data)

#### Write the training file to disk

In [None]:
# Serialize the sampled examples as JSON Lines: one JSON object per line.
file_path = 'train_final.json'
with open(file_path, 'w') as file:
    for entry in train_final:
        file.write(json.dumps(entry) + '\n')

print(f"Data written to {file_path}")

Data written to train_final.json


### Initialize fine tuning job

In [None]:
# Upload the training file (opened in binary mode) to OpenAI with purpose "fine-tune".
with open("train_final.json", "rb") as file:
    response = client.files.create(
        file=file,
        purpose="fine-tune"
    )

In [None]:
# Keep the uploaded file's id; it is passed as training_file when the job is created.
training_file_id = response.id

In [None]:
# Start a fine-tuning job on the uploaded file with gpt-3.5-turbo as the base model.
client.fine_tuning.jobs.create(
    training_file=training_file_id,
    model="gpt-3.5-turbo"
)

FineTuningJob(id='ftjob-5ZUoTkUVXdYQ0xbDGmwtmIoT', created_at=1716382160, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-3V94kqhehiKkMJuT1WyKuNQK', result_files=[], status='validating_files', trained_tokens=None, training_file='file-ThJCuuHMne0QWYhYb7rWgLoJ', validation_file=None, user_provided_suffix=None, seed=882879571, estimated_finish=None, integrations=[])

In [None]:
# Check progress of the fine-tuning job. The job id is hard-coded here: it is the
# `id` shown in the FineTuningJob output of the create call, so it must be
# updated by hand whenever a new job is started.
client.fine_tuning.jobs.retrieve("ftjob-5ZUoTkUVXdYQ0xbDGmwtmIoT")

FineTuningJob(id='ftjob-5ZUoTkUVXdYQ0xbDGmwtmIoT', created_at=1716382160, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-3.5-turbo-0125:personal::9RftY4Sw', finished_at=1716382926, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-3V94kqhehiKkMJuT1WyKuNQK', result_files=['file-VFt3giJLLumh6608V4CNRkrN'], status='succeeded', trained_tokens=684249, training_file='file-ThJCuuHMne0QWYhYb7rWgLoJ', validation_file=None, user_provided_suffix=None, seed=882879571, estimated_finish=None, integrations=[])

In [None]:
# Placeholder: replace with the `fine_tuned_model` value reported by the finished
# fine-tuning job before running the cells below.
finetuned_id = "FINETUNEDID"

#### Create output

In [None]:
# Re-run the test set with the fine-tuned model; this overwrites the base-model test_output.
test_output = create_output(test_final, finetuned_id)

Generating output:   0%|          | 0/269 [00:00<?, ?it/s]

Generating output:  56%|█████▌    | 150/269 [01:42<01:08,  1.73it/s]

Error processing item 150: Error code: 400 - {'error': {'message': "Sorry! We've encountered an issue with repetitive patterns in your prompt. Please try again with a different prompt.", 'type': 'invalid_request_error', 'param': 'prompt', 'code': 'invalid_prompt'}}


Generating output: 100%|██████████| 269/269 [03:00<00:00,  1.49it/s]


In [None]:
# Fuzzy-match accuracy of the fine-tuned model on the test split.
fuzzy_accuracy(test, test_output)

True: stichting de bibliotheek utrecht, Pred: de bibliotheek utrecht
True: de nederlandsche bank, Pred: de nederlandsche bank
True: provincie limburg, Pred: provincie limburg
True: provincie noord-holland, Pred: provincie noord-holland
True: het fonds voor cultuurparticipatie, Pred: fonds voor cultuurparticipatie
True: hoogheemraadschap van rijnland, Pred: hoogheemraadschap van rijnland
True: kamer van koophandel, Pred: kamer van koophandel
True: omgevingsdienst noord-veluwwe, Pred: omgevingsdienst noordenveld westerkwartier
True: recreatieschap twiske-waterland, Pred: recreatieschap twiske-waterland
True: gemeente diemen, Pred: gemeente diemen
True: gemeente doesburg, Pred: gemeente doesburg
True: gemeente steenwijkerland en gemeente westerveld, Pred: gemeente steenwijkerland
True: stichting concert- en congresgebouw de doelen, Pred: de doelen
True: gemeente utrecht, Pred: gemeente utrecht
True: provincie flevoland, Pred: provincie flevoland
True: gemeente amsterdam, Pred: gemeente am

'Accuracy: 89.59%'

#### Precision & Recall

In [None]:
# Total number of predictions made
total_predictions = len(test_output)

# Binary "a name was predicted" flags, one per item and in the order of test_output.
# create_output() stores the sentinel 'No prediction' when the API call failed.
# That is not a model answer, so it is flagged 0 just like an empty string.
# Real answers are lowercased by create_output(), so they can never equal the
# capitalised sentinel.
prediction_list = [
    1 if organization_name.strip() and organization_name != 'No prediction' else 0
    for text, organization_name in test_output
]

# Display the prediction list
print("Predictions:", prediction_list)

In [None]:
# Precision/recall of the fine-tuned model, from the prediction flags and the `actuals` flags.
precision_gpt, recall_gpt = calculate_precision_recall(prediction_list, actuals)

print(f"Precision GPT finetuned: {precision_gpt:.2f}")
print(f"Recall GPT finetuned: {recall_gpt:.2f}")

## Second Dataset

In [None]:
# Load the second dataset; it is passed through the same labeling and evaluation
# functions, which read the 'Cleaned Text' and 'True Organization' columns.
seconddata = pd.read_csv('final_seconddata.csv')

#### Get labeled format

In [None]:
# Annotate every document of the second dataset with organization-name spans.
second_labeled = get_labeled_format(seconddata)

Creating training data: 100%|██████████| 458/458 [00:00<00:00, 3657.18it/s]


In [None]:
# Keep only documents in which at least one organization-name span was found.
second_labeled = [entry for entry in second_labeled if len(entry[1]['entities']) > 0]

In [None]:
# Convert the filtered second dataset into chat-message examples.
second_gpt = get_gpt_format(second_labeled)

### Try base model

In [None]:
# Run the base gpt-3.5-turbo model on the second dataset; this overwrites test_output.
test_output = create_output(second_gpt, "gpt-3.5-turbo")

Generating output:   0%|          | 0/162 [00:00<?, ?it/s]

Generating output:  57%|█████▋    | 92/162 [02:38<03:05,  2.65s/it]

Error processing item 92: Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 18466 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}


Generating output: 100%|██████████| 162/162 [04:42<00:00,  1.75s/it]


#### Evaluation

In [None]:
# Fuzzy-match accuracy of the base model on the second dataset. The denominator is
# len(test_output), so items whose request failed ('No prediction') are included.
fuzzy_accuracy(seconddata, test_output)

True: minafonds, Pred: minafonds
True: vlaamse auditautoriteit, Pred: vlaamse auditautoriteit
True: dienst van de bestuursrechtscolleges, Pred: dienst van de bestuursrechtscolleges
True: sport vlaanderen, Pred: sport vlaanderen
True: agentschap justitie en handhaving, Pred: agentschap justitie & handhaving
True: literatuur vlaanderen, Pred: literatuur vlaanderen
True: vito, Pred: vito - vlaamse instelling voor technologisch onderzoek
True: van de vlaamse adviescommissie voor volksraadplegingen, Pred: vlaamse adviescommissie voor volksraadplegingen
True: vlaamse belastingdienst, Pred: vlaamse belastingdienst
True: geschillencommissie groeipakket, Pred: geschillencommissie groeipakket
True: plantentuin meise, Pred: plantentuin meise
True: toerisme vlaanderen, Pred: toerisme vlaanderen
True: agentschap integratie en inburgering, Pred: agentschap integratie en inburgering
True: ovam, Pred: openbare vlaamse afvalstoffenmaatschappij
True: commissie van toezicht voor jeugdinstellingen, Pred: 

'Accuracy: 75.31%'

### Finetuned Model

In [None]:
# Run the fine-tuned model on the second dataset; this overwrites test_output again.
test_output = create_output(second_gpt, finetuned_id)

Generating output:  57%|█████▋    | 92/162 [00:59<00:36,  1.90it/s]

Error processing item 92: Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 18466 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}


Generating output: 100%|██████████| 162/162 [01:42<00:00,  1.58it/s]


In [None]:
# Fuzzy-match accuracy of the fine-tuned model on the second dataset.
fuzzy_accuracy(seconddata, test_output)

True: minafonds, Pred: minafonds
True: vlaamse auditautoriteit, Pred: vlaamse auditautoriteit
True: dienst van de bestuursrechtscolleges, Pred: dienst van de bestuursrechtscolleges
True: sport vlaanderen, Pred: sport vlaanderen
True: agentschap justitie en handhaving, Pred: agentschap justitie & handhaving
True: literatuur vlaanderen, Pred: literatuur vlaanderen
True: vito, Pred: vito
True: van de vlaamse adviescommissie voor volksraadplegingen, Pred: ministerie van de vlaamse gemeenschap
True: vlaamse belastingdienst, Pred: vlaamse belastingdienst
True: geschillencommissie groeipakket, Pred: geschillencommissie groeipakket
True: plantentuin meise, Pred: plantentuin meise
True: toerisme vlaanderen, Pred: toerisme vlaanderen
True: agentschap integratie en inburgering, Pred: agentschap integratie en inburgering
True: ovam, Pred: openbare vlaamse afvalstoffenmaatschappij
True: commissie van toezicht voor jeugdinstellingen, Pred: commissie van toezicht voor jeugdinstellingen
True: inter, P

'Accuracy: 75.31%'