In [None]:
import pandas as pd
from openai import OpenAI
from tqdm import tqdm
# Read the API key from a local file. The context manager closes the file handle,
# and strip() removes the trailing newline/whitespace that editors usually append,
# which would otherwise be sent as part of the key.
with open('key.txt') as key_file:
    client = OpenAI(api_key=key_file.read().strip())

## Develop samples of training data and test data

In [None]:
# Load the JSONL file that was annotated with
# `data-development-code/1-annotate-with-gpt4o-and-split-training-samples.ipynb`

# NOTE(review): presumably a language code; it is not used in the cells shown here
lang = "ca"

# Path to the annotated JSONL file (to be filled in before running)
file_path = ""

# One JSON record per line
df = pd.read_json(file_path, orient="records", lines=True)

# Report the dimensions, then preview the first rows
print(df.shape)
display(df.head())

### Create a smaller sample for annotation

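I'll create two batches: a first batch of about 200 instances and a second batch of about 100 additional instances. Both batches have to be balanced across labels (a label contributes fewer instances only if it does not have enough of them).

First batch: 12 instances per label × 17 labels -> 204 instances

Second batch: 6 instances per label × 17 labels -> 102 instances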

In [None]:
# Mapping from label name to its integer id
label_dict = {
    'disaster, accident and emergency incident': 0,
    'human interest': 1,
    'politics': 2,
    'education': 3,
    'crime, law and justice': 4,
    'economy, business and finance': 5,
    'conflict, war and peace': 6,
    'arts, culture, entertainment and media': 7,
    'labour': 8,
    'weather': 9,
    'religion': 10,
    'society': 11,
    'health': 12,
    'environment': 13,
    'lifestyle and leisure': 14,
    'science and technology': 15,
    'sport': 16,
}

# Inverse mapping: integer id -> label name
reverse_dict = {label_id: label_name for label_name, label_id in label_dict.items()}

In [45]:
# Create the first batch: up to `label_sample_size` random instances per label

label_sample_size = 12

# Fixed seed so that re-running the notebook reproduces the same sample
sampling_seed = 42

labels_list = list(label_dict.keys())

# Collect one sample per label and concatenate once at the end
label_samples = []

for label in labels_list:
    label_rows = df[df["IPTC_pred"] == label]
    # If there are fewer instances of this label than requested, take as many as there are
    n_to_take = min(label_sample_size, len(label_rows))
    if n_to_take < label_sample_size:
        print(f"Label '{label}': only {len(label_rows)} instances available (requested {label_sample_size})")
    # Labels without any instance contribute nothing to the batch
    if n_to_take > 0:
        label_samples.append(label_rows.sample(n=n_to_take, random_state=sampling_seed))

# Original row index is kept, as pd.concat is called without ignore_index
final_sample = pd.concat(label_samples)

# Shuffle rows
final_sample = final_sample.sample(frac=1, random_state=sampling_seed)

final_sample.head()

Unnamed: 0,document_id,text,genre,IPTC_pred
1261,21455,BRAINWASHED: One-third of Americans say conser...,News,politics
1569,28499,Almost one million young people are not in edu...,News,labour
1557,129304,Nocturne A storytelling show that explores the...,News,politics
1752,120192,"We focus on women’s rights in the workplace, r...",News,labour
571,177755,NEBRASKAland Associate Editor Jenny Nguyen-Whe...,News,lifestyle and leisure


In [46]:
# Check the size of the first batch as (rows, columns)
final_sample.shape

(198, 4)

In [47]:
# Check how many instances of each label ended up in the first batch
final_sample["IPTC_pred"].value_counts()

IPTC_pred
politics                                     12
labour                                       12
lifestyle and leisure                        12
environment                                  12
science and technology                       12
crime, law and justice                       12
human interest                               12
arts, culture, entertainment and media       12
disaster, accident and emergency incident    12
religion                                     12
health                                       12
economy, business and finance                12
society                                      12
education                                    12
sport                                        12
conflict, war and peace                      12
weather                                       6
Name: count, dtype: int64

In [None]:
# Write the first batch to a CSV file whose name is derived from `file_path`
first_batch_csv_path = f"{file_path}-for-annotation.csv"
final_sample.to_csv(first_batch_csv_path)

#### Second sample



In [49]:
# Get a list of ids in the first batch
first_batch_ids = final_sample["document_id"].to_list()

# Flag every document with "yes"/"no" depending on whether it is in the first batch.
# A vectorised isin() replaces the per-row `in list` lookup of a Python loop.
first_batch = [
    "yes" if in_first_batch else "no"
    for in_first_batch in df["document_id"].isin(first_batch_ids)
]

# Adds the flag as a new column on df; the second-batch cells filter on it
df["first_batch"] = first_batch

print(df["first_batch"].value_counts())

df.head(2)

first_batch
no     1802
yes     198
Name: count, dtype: int64


Unnamed: 0,document_id,text,genre,IPTC_pred,first_batch
0,89787,A fast food meal doesn’t have to be unhealthy....,News,health,no
1,116609,WE WILL NOT BE BEAT PERIOD!* *Consumer must pr...,News,education,no


In [50]:
# Keep only the documents that were not selected for the first batch
df_wth_first_batch = df.loc[df["first_batch"].eq("no")]

# Size of the remaining pool as (rows, columns)
df_wth_first_batch.shape

(1802, 5)

In [51]:
# Create the second batch: up to `label_sample_size` random instances per label,
# drawn only from documents that are not in the first batch

label_sample_size = 6

# Fixed seed so that re-running the notebook reproduces the same sample
sampling_seed = 42

labels_list = list(label_dict.keys())

# Collect one sample per label and concatenate once at the end
label_samples = []

for label in labels_list:
    label_rows = df_wth_first_batch[df_wth_first_batch["IPTC_pred"] == label]
    # If there are fewer instances of this label than requested, take as many as there are
    n_to_take = min(label_sample_size, len(label_rows))
    if n_to_take < label_sample_size:
        print(f"Label '{label}': only {len(label_rows)} instances available (requested {label_sample_size})")
    # Labels with no remaining instances are skipped
    if n_to_take > 0:
        label_samples.append(label_rows.sample(n=n_to_take, random_state=sampling_seed))

# Original row index is kept, as pd.concat is called without ignore_index
second_batch = pd.concat(label_samples)

# Shuffle rows
second_batch = second_batch.sample(frac=1, random_state=sampling_seed)

print(second_batch.shape)

second_batch.head()

(96, 5)


Unnamed: 0,document_id,text,genre,IPTC_pred,first_batch
608,77745,THE PHILIPPINE National Police is continuously...,News,politics,no
975,183886,We have all been there – you push your body to...,News,lifestyle and leisure,no
1504,165144,"Articles, stories, essays, editorials, news an...",News,environment,no
713,75322,"Monday, June 10 is National Iced Tea Day. In h...",News,"crime, law and justice",no
1480,75999,Govt Jobs in District Health Authority Sargodh...,News,labour,no


In [52]:
# Remove the first_batch column.
# Reassigning (instead of inplace=True) with errors="ignore" keeps this cell idempotent:
# re-running it after the column is already gone no longer raises a KeyError.
second_batch = second_batch.drop(columns=['first_batch'], errors="ignore")
second_batch.head(2)

Unnamed: 0,document_id,text,genre,IPTC_pred
608,77745,THE PHILIPPINE National Police is continuously...,News,politics
975,183886,We have all been there – you push your body to...,News,lifestyle and leisure


In [53]:
# Check how many instances of each label ended up in the second batch
second_batch["IPTC_pred"].value_counts()

IPTC_pred
politics                                     6
lifestyle and leisure                        6
environment                                  6
crime, law and justice                       6
labour                                       6
society                                      6
religion                                     6
education                                    6
sport                                        6
disaster, accident and emergency incident    6
human interest                               6
conflict, war and peace                      6
health                                       6
arts, culture, entertainment and media       6
science and technology                       6
economy, business and finance                6
Name: count, dtype: int64

In [None]:
# Write the second batch to a CSV file whose name is derived from `file_path`
second_batch_csv_path = f"{file_path}-for-additional-annotation.csv"
second_batch.to_csv(second_batch_csv_path)