In [1]:
import pandas as pd
from datasets import Dataset

# Load your dataset
df = pd.read_csv('balanced2.csv')

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# You can define your train-test split here if needed
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

In [2]:
from transformers import AutoTokenizer

model_checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    return tokenizer(examples['line'], padding='max_length', truncation=True)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1256 [00:00<?, ? examples/s]

Map:   0%|          | 0/314 [00:00<?, ? examples/s]

In [3]:
from sklearn.preprocessing import LabelEncoder
import pickle

# Creating label encoders for level1 and level2
label_encoder_level1 = LabelEncoder()
label_encoder_level2 = LabelEncoder()

# Encoding level1 and level2 labels
df['encoded_level1'] = label_encoder_level1.fit_transform(df['level1'])
df['encoded_level2'] = label_encoder_level2.fit_transform(df['level2'])

# Save the label encoders for later use
with open('label_encoder_level1.pkl', 'wb') as file:
    pickle.dump(label_encoder_level1, file)

with open('label_encoder_level2.pkl', 'wb') as file:
    pickle.dump(label_encoder_level2, file)


In [4]:
# Creating data frames for level1 and level2 label classes
label_classes_level1 = label_encoder_level1.classes_
df_label_classes_level1 = pd.DataFrame(label_classes_level1, columns=["Level1 Label Names"])
df_label_classes_level1.index.name = "Encoded Level1 Label"
df_label_classes_level1.reset_index(inplace=True)

label_classes_level2 = label_encoder_level2.classes_
df_label_classes_level2 = pd.DataFrame(label_classes_level2, columns=["Level2 Label Names"])
df_label_classes_level2.index.name = "Encoded Level2 Label"
df_label_classes_level2.reset_index(inplace=True)


In [5]:
df.head()

Unnamed: 0,line,level1,level2,encoded_level1,encoded_level2
0,General Maintenance Works at State House Nairobi,General public services,"Executive and legislative organs, financial an...",4,8
1,General Maintenance Works at State House Sagana,General public services,"Executive and legislative organs, financial an...",4,8
2,Refurbishment of buildings at Mombasa State House,General public services,"Executive and legislative organs, financial an...",4,8
3,Refurbishment of buildings at Nakuru State House,General public services,"Executive and legislative organs, financial an...",4,8
4,General Works at the Office of the Deputy Pres...,General public services,"Executive and legislative organs, financial an...",4,8


In [6]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

In [7]:
# Ensure the 'encoded_level1' column is included in the datasets
train_dataset = train_dataset.map(lambda e: {'encoded_level1': e['encoded_level1']})
eval_dataset = eval_dataset.map(lambda e: {'encoded_level1': e['encoded_level1']})

# Tokenization
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# Add the 'encoded_level1' as labels in the tokenized dataset
tokenized_train_dataset = tokenized_train_dataset.map(lambda e: {'labels': e['encoded_level1']})
tokenized_eval_dataset = tokenized_eval_dataset.map(lambda e: {'labels': e['encoded_level1']})

# Set the format for PyTorch
tokenized_train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_eval_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])


Map:   0%|          | 0/1256 [00:00<?, ? examples/s]

Map:   0%|          | 0/314 [00:00<?, ? examples/s]

Map:   0%|          | 0/1256 [00:00<?, ? examples/s]

Map:   0%|          | 0/314 [00:00<?, ? examples/s]

Map:   0%|          | 0/1256 [00:00<?, ? examples/s]

Map:   0%|          | 0/314 [00:00<?, ? examples/s]

In [8]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc}
def compute_metrics_level2(pred):
    labels_level2 = pred.label_ids  # These are the true level2 labels
    preds_level2 = pred.predictions.argmax(-1)  # These are the predicted level2 labels
    acc_level2 = accuracy_score(labels_level2, preds_level2)
    return {'accuracy_level2': acc_level2}


In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import os
os.environ["WANDB_DISABLED"] = "true"
import warnings
warnings.filterwarnings("ignore")

# Update num_labels to reflect the number of unique labels in level1
num_labels_level1 = len(label_encoder_level1.classes_)
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels_level1)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,  # Adjust as needed
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # Evaluation strategy
    save_strategy="epoch",  # Make save strategy match the evaluation strategy
    logging_dir='./logs',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,  # Use the compute_metrics function for level1
)

trainer.train()


2023-12-13 16:06:23.422324: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.082498,0.398089
2,No log,1.432583,0.684713
3,No log,0.779877,0.815287
4,No log,0.481315,0.869427
5,No log,0.437809,0.894904


TrainOutput(global_step=395, training_loss=1.1993534909018988, metrics={'train_runtime': 5536.4638, 'train_samples_per_second': 1.134, 'train_steps_per_second': 0.071, 'total_flos': 1652456113029120.0, 'train_loss': 1.1993534909018988, 'epoch': 5.0})

In [11]:
# Ensure the 'encoded_level1' column is included in the datasets
train_dataset2 = train_dataset.map(lambda e: {'encoded_level2': e['encoded_level2']})
eval_dataset2 = eval_dataset.map(lambda e: {'encoded_level2': e['encoded_level2']})

# Tokenization
tokenized_train_dataset2 = train_dataset2.map(tokenize_function, batched=True)
tokenized_eval_dataset2 = eval_dataset2.map(tokenize_function, batched=True)

# Add the 'encoded_level1' as labels in the tokenized dataset
tokenized_train_dataset2 = tokenized_train_dataset2.map(lambda e: {'labels': e['encoded_level2']})
tokenized_eval_dataset2 = tokenized_eval_dataset2.map(lambda e: {'labels': e['encoded_level2']})

# Set the format for PyTorch
tokenized_train_dataset2.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_eval_dataset2.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])


Map:   0%|          | 0/1256 [00:00<?, ? examples/s]

Map:   0%|          | 0/314 [00:00<?, ? examples/s]

Map:   0%|          | 0/1256 [00:00<?, ? examples/s]

Map:   0%|          | 0/314 [00:00<?, ? examples/s]

Map:   0%|          | 0/1256 [00:00<?, ? examples/s]

Map:   0%|          | 0/314 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Assuming you have already predicted level1 labels and prepared the data for level2
# Update num_labels to reflect the number of unique labels in level2
num_labels_level2 = len(label_encoder_level2.classes_)
model_level2 = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels_level2)

# You may use similar training arguments, but ensure to differentiate the output directory
training_args_level2 = TrainingArguments(
    output_dir='./results_level2',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs_level2',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy_level2',
    report_to="none",
)

# Tokenize your modified dataset for level2 and then create a Trainer instance
# Assuming tokenized_train_dataset_level2 and tokenized_eval_dataset_level2 are ready
trainer_level2 = Trainer(
    model=model_level2,
    args=training_args_level2,
    train_dataset=tokenized_train_dataset2,
    eval_dataset=tokenized_eval_dataset2,
    compute_metrics=compute_metrics_level2,  # Adjust compute_metrics if needed for level2
)

trainer_level2.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy Level2
1,No log,3.965576,0.031847
2,No log,3.698133,0.136943
3,No log,3.17941,0.257962
