# Group Project CM1001 - Named Entity Recognition (NER)

Authors:  
Tong Li, 
Arafatul Islam

## Imports

In [5]:
from transformers import AutoTokenizer 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import fastparquet
import pyarrow
import tensorflow as tf
import keras
import torch
from transformers import AutoModelForTokenClassification
import accelerate
import _soundfile
from datasets import Dataset

## Load data

This section should load the raw dataset for the task.  
Remember to use relative paths to load any files in the notebook.

In [6]:
# Always use comments in the code to document specific steps
# Read the three parquet shards (1177, lt, wiki) addressed through hf:// URIs in one pass
df1177, dflt, dfwiki = (
    pd.read_parquet(shard_uri)
    for shard_uri in (
        "hf://datasets/community-datasets/swedish_medical_ner/1177/train-00000-of-00001.parquet",
        "hf://datasets/community-datasets/swedish_medical_ner/lt/train-00000-of-00001.parquet",
        "hf://datasets/community-datasets/swedish_medical_ner/wiki/train-00000-of-00001.parquet",
    )
)

# Stack the corpora (1177, wiki, lt - in that order) under a fresh 0..n-1 index
corpora = [df1177, dfwiki, dflt]
combined_df = pd.concat(corpora, ignore_index=True)
combined_df.head()

Unnamed: 0,sid,sentence,entities
0,1177_0,Memantin ( Ebixa ) ger sällan några biverkningar.,"{'start': [9], 'end': [18], 'text': ['Ebixa'],..."
1,1177_1,Det är också lättare att dosera [ flytande med...,"{'start': [32], 'end': [52], 'text': ['flytand..."
2,1177_2,( Förstoppning ) är ett vanligt problem hos äl...,"{'start': [0], 'end': [16], 'text': ['Förstopp..."
3,1177_3,[ Medicinen ] kan också göra att man blöder lä...,"{'start': [0, 74], 'end': [13, 85], 'text': ['..."
4,1177_4,Barn har större möjligheter att samarbeta om d...,"{'start': [], 'end': [], 'text': [], 'type': []}"


## Task 1: splitting data

This section should contain the solution of task 1.

It is mandatory to maintain the headings for each task.  
OPTIONALLY, you can use one level down (###) to organize subsections of the assignments.

Use markdown cells like this one to include:
- Discussion points.
- References to specific sources of code that you might have used to solve the assignment.
- General comments and explanations about your solution.

In [7]:
# Always use comments in the code to document specific steps
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, then halve that hold-out:
# 80% train / 10% validation / 10% test. Both calls use random_state=42.
train_df, temp_df = train_test_split(combined_df, random_state=42, test_size=0.2)
val_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

# Report how many rows ended up in each split
print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")
def extract_ner_tags(sentence, entities):
    """Build one tag per character of ``sentence`` from an entity-span mapping.

    Args:
        sentence: The raw sentence text.
        entities: Mapping holding parallel sequences ``"start"`` and ``"end"``
            (character offsets, ``end`` exclusive) and the entity classes under
            either ``"label"`` or ``"type"``. The rows loaded in this notebook
            use ``"type"``; ``"label"`` is still accepted and takes precedence
            when both are present.

    Returns:
        A list with ``len(sentence)`` entries. Characters outside every entity
        are tagged ``"O"``; characters inside an entity carry the entity class
        as a string (non-string classes such as integer ids are converted with
        ``str`` so that every tag can be mapped to an id the same way).
        If ``entities`` is ``None`` or lacks the required keys, every
        character is tagged ``"O"``.
    """
    # Start with every character outside any entity
    tags = ["O"] * len(sentence)

    if entities is None:
        return tags

    # Locate the key that stores the entity class
    if "label" in entities:
        label_key = "label"
    elif "type" in entities:
        label_key = "type"
    else:
        return tags

    if "start" not in entities or "end" not in entities:
        return tags

    for start, end, label in zip(entities["start"], entities["end"], entities[label_key]):
        # Clamp the span to the sentence so malformed offsets cannot raise IndexError
        start = max(0, min(start, len(sentence) - 1))
        end = max(0, min(end, len(sentence)))
        tag = label if isinstance(label, str) else str(label)
        # Later entities overwrite earlier ones where spans overlap
        for i in range(start, end):
            tags[i] = tag

    return tags
def _with_ner_tags(split_df):
    """Return a copy of ``split_df`` with a ``ner_tags`` column added.

    Each row's tags come from ``extract_ner_tags(sentence, entities)``.
    ``DataFrame.assign`` returns a new frame, so the frame passed in is not
    modified. This avoids assigning a column in place on a frame that may be
    a slice of another frame (which can raise SettingWithCopyWarning).
    """
    tag_lists = [
        extract_ner_tags(sentence, entities)
        for sentence, entities in zip(split_df["sentence"], split_df["entities"])
    ]
    # Wrap in a Series on the frame's own index so each row receives one list
    return split_df.assign(ner_tags=pd.Series(tag_lists, index=split_df.index, dtype=object))


# Tag every split with the same helper
train_df = _with_ner_tags(train_df)
val_df = _with_ner_tags(val_df)
test_df = _with_ner_tags(test_df)

# Check the sizes (adding a column must not change the row counts)
print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")
print(train_df.columns)
print(train_df.head())

Train size: 636320
Validation size: 79540
Test size: 79540
Train size: 636320
Validation size: 79540
Test size: 79540
Index(['sid', 'sentence', 'entities', 'ner_tags'], dtype='object')
              sid                                           sentence  \
111057   lt_61410  s dock på grund av riskerna att (skada) benmär...   
548079  lt_498432  arat med klara regler för hur [läkemedel] skul...   
484227  lt_434580  mbilder(lakunära (syndrom), vanligast » pure m...   
254359  lt_204712  a GVHD vid allogen BMT. Behandlingen av (kroni...   
756819  lt_707172   insjuknande och död i (aids) kommer att öka i...   

                                                 entities  \
111057  {'start': [32], 'end': [39], 'text': ['skada']...   
548079  {'start': [30], 'end': [41], 'text': ['läkemed...   
484227  {'start': [17], 'end': [26], 'text': ['syndrom...   
254359  {'start': [40], 'end': [60], 'text': ['kronisk...   
756819  {'start': [23], 'end': [29], 'text': ['aids'],...   

               

## Task 2: Model choice and processing data

This section should contain the solution of task 2.

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Kb/bert-base-swedish-cased")

# Map every tag seen in the training split to an integer class id.
# The tags are sorted before numbering so the tag -> id mapping is the same on
# every run; numbering a bare set depends on its iteration order, which is not
# guaranteed to be stable between kernel sessions.
# key=str keeps the sort well-defined even if the tags are not all strings.
unique_tags = {tag for tags in train_df["ner_tags"] for tag in tags}
label_map = {label: i for i, label in enumerate(sorted(unique_tags, key=str))}

# tokenize_and_align_labels function
def tokenize_and_align_labels(df):
    """Tokenize ``df["sentence"]`` and derive one integer label per token.

    ``df["ner_tags"]`` holds one tag per *character* of the sentence, so a
    token's label is looked up at the character offset where the token starts
    (taken from the tokenizer's offset mapping), not at its word index.

    Labelling rules:
        * special tokens / padding (``word_id`` is ``None``) -> ``-100``
        * first sub-token of a word -> id of the tag at its first character
        * remaining sub-tokens of the same word -> ``-100``

    Uses the module-level ``tokenizer`` and ``label_map``. The tokenizer must
    be a "fast" tokenizer, since ``word_ids`` and offset mappings are only
    available there.

    Args:
        df: DataFrame with a ``sentence`` column (str) and a ``ner_tags``
            column (per-character tag sequence for that sentence).

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of
        label ids per sentence, the same length as that sentence's ``input_ids``.
    """
    tokenized_inputs = tokenizer(
        df["sentence"].tolist(),
        truncation=True,
        padding=True,
        max_length=512,
        return_offsets_mapping=True,
    )
    # The offsets are only needed for alignment; remove them so the returned
    # encoding has the same keys as before (plus "labels").
    offset_mapping = tokenized_inputs.pop("offset_mapping")
    labels = []

    for i, tags in enumerate(df["ner_tags"].tolist()):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx, (char_start, _char_end) in zip(word_ids, offset_mapping[i]):
            if word_idx is None or word_idx == previous_word_idx:
                # Special/padding token, or a continuation sub-token
                label_ids.append(-100)
            elif char_start >= len(tags):
                # Defensive: no character tag is available for this token
                label_ids.append(-100)
            else:
                tag = tags[char_start]
                # String tags go through label_map; non-string tags are kept as-is
                label_ids.append(label_map[tag] if isinstance(tag, str) else tag)

            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs
# Tokenize each split and attach its aligned labels
tokenized_train_dict, tokenized_val_dict, tokenized_test_dict = (
    tokenize_and_align_labels(split_df) for split_df in (train_df, val_df, test_df)
)

# Wrap the tokenizer outputs as Dataset objects
from datasets import Dataset
tokenized_train, tokenized_val, tokenized_test = (
    Dataset.from_dict(encoded)
    for encoded in (tokenized_train_dict, tokenized_val_dict, tokenized_test_dict)
)

## Task 4: load the pretrained model

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# The classification head needs one output per distinct tag in label_map,
# not the length of the tag list of a single example.
num_labels = len(label_map)

# Store the tag names in the model config so that a checkpoint written with
# save_pretrained records which class id corresponds to which tag.
id2label = {idx: str(tag) for tag, idx in label_map.items()}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    "Kb/bert-base-swedish-cased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

def compute_metrics(p):
    """Compute token-level accuracy and weighted precision/recall/F1.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores with the class on axis 2; ``labels`` holds the integer
            label ids, with ``-100`` marking positions to ignore.

    Returns:
        Dict with keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. Precision, recall and F1 use ``average='weighted'`` with
        ``zero_division=0``.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)
    label_ids = np.asarray(label_ids)

    # Keep only positions carrying a real label. Boolean-mask indexing flattens
    # in row-major order, i.e. the same order as iterating row by row.
    keep = label_ids != -100
    true_labels = label_ids[keep]
    true_predictions = predicted_ids[keep]

    # Calculate metrics using sklearn
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, true_predictions, average='weighted', zero_division=0
    )
    acc = accuracy_score(true_labels, true_predictions)

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }



Some weights of BertForTokenClassification were not initialized from the model checkpoint at Kb/bert-base-swedish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Task 5: Set up training arguments

In [19]:
from transformers import TrainingArguments

# Fine-tuning hyper-parameters, gathered in one mapping so they are easy to scan.
# Evaluation and checkpointing both run once per epoch; the checkpoint with the
# best "f1" is reloaded when training ends.
training_config = dict(
    output_dir="./swedish-ner-model",
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)
training_args = TrainingArguments(**training_config)


## Task 6 Train the model

In [20]:
from transformers import Trainer

# Bundle the model, hyper-parameters, data splits and metric function
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_train,
    "eval_dataset": tokenized_val,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning
trainer.train()


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## Task 7 Print Result

In [18]:
# Write the fine-tuned model and then the tokenizer into the same directory
model_path = "./swedish-ner-final-model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_path)

# Score the held-out test split
test_results = trainer.evaluate(tokenized_test)
print(f"Test results: {test_results}")

KeyboardInterrupt: 

This section should contain:
- Results.
- Summary of best model performance:
    - Name of best model file as saved in /models.
    - Relevant scores such as: accuracy, precision, recall, F1-score, etc.
- Key discussion points.

In [None]:
# Always use comments in the code to document specific steps