In [1]:
#!pip install -q jiwer

## importing libraries

In [2]:
import spacy
import torch
from datasets import load_dataset
from transformers import BertTokenizerFast, BertForTokenClassification
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import DataCollatorForTokenClassification, Trainer, TrainingArguments
from tqdm import tqdm
from jiwer import wer
import re
import random
from transformers import pipeline
import pandas as pd






## loading dataset

In [3]:
train=load_dataset('json', data_files='train.json')
train

DatasetDict({
    train: Dataset({
        features: ['tweet_id', 'text', 'location_mentions'],
        num_rows: 14392
    })
})

In [4]:
val=load_dataset('json', data_files='val.json')
val

DatasetDict({
    train: Dataset({
        features: ['tweet_id', 'text', 'location_mentions'],
        num_rows: 2056
    })
})

In [5]:
#test=load_dataset('json', data_files='/kaggle/input/lmr-dataset1/test.json')
#test

In [6]:
train_df = train['train'].to_pandas()
train_df.head(5)

Unnamed: 0,tweet_id,text,location_mentions
0,1061497252806414336,Please read below!! Another devastating fire h...,"[{'end_offset': 72, 'start_offset': 53, 'text'..."
1,1061165982855634944,Celebrities evacuate California as wildfires rage,"[{'end_offset': 31, 'start_offset': 21, 'text'..."
2,1064248785914732544,Grab one of these and HELP victims of Californ...,"[{'end_offset': 48, 'start_offset': 38, 'text'..."
3,1061160485364613121,the camp fire in Paradise California is growin...,"[{'end_offset': 25, 'start_offset': 17, 'text'..."
4,1066508972650311682,The area of documented destruction in the #Cam...,[]


In [7]:
val_df = val['train'].to_pandas()
val_df.head(5)

Unnamed: 0,tweet_id,text,location_mentions
0,1065347154078773250,If youre looking for legitimate relief organiz...,"[{'end_offset': 85, 'start_offset': 83, 'text'..."
1,1065256190727282689,"Like so many things, the destruction of Califo...","[{'end_offset': 51, 'start_offset': 40, 'text'..."
2,1066000788177211393,Officials say 563 people are still unaccounted...,[]
3,1064315981365862400,"The wait for Paradise: From young to old, evac...","[{'end_offset': 191, 'start_offset': 186, 'tex..."
4,1067241739843325952,BREAKING: Camp Fire death toll increases to 88...,[]


In [8]:
#confirming whether the empty locations mentions have no location mentioned in the text
train_df.text[0],val_df.text[0]

('Please read below!! Another devastating fire has hit Northern California, people need help, whatever you can give, or anyway you can help, please doὤF!!',
 'If youre looking for legitimate relief organizations to help those affected by the CA fires, I found this link: How to Help Those Affected by California Wildfires - Consumer Reports')

In [9]:
# remove the missing values
train_df= train_df[train_df['location_mentions'].apply(lambda x: len(x) > 0)]


In [10]:
len(train_df)

10366

In [11]:
val_df= val_df[val_df['location_mentions'].apply(lambda x: len(x) > 0)]
len(val_df)

1483

In [12]:
train_df.text[1],train_df.location_mentions[1]

('Celebrities evacuate California as wildfires rage',
 array([{'end_offset': 31, 'start_offset': 21, 'text': 'California'}],
       dtype=object))

In [13]:
## sort the locations
def sort_locations(locations):
    # Sort the list of dictionaries by the 'text' key
    return sorted(locations, key=lambda x: x['text'])

In [14]:
#train_df['location_mentions'] = train_df['location_mentions'].apply(sort_locations)
#len(train_df)

In [15]:
#val_df['location_mentions'] = val_df['location_mentions'].apply(sort_locations)
#len(val_df)

In [16]:
train_df.location_mentions[3]

array([{'end_offset': 25, 'start_offset': 17, 'text': 'Paradise'},
       {'end_offset': 36, 'start_offset': 26, 'text': 'California'}],
      dtype=object)

In [17]:
data = pd.concat([train_df, val_df], ignore_index=True, axis=0)

In [18]:
len(data)

11849

In [19]:
data.head(2)

Unnamed: 0,tweet_id,text,location_mentions
0,1061497252806414336,Please read below!! Another devastating fire h...,"[{'end_offset': 72, 'start_offset': 53, 'text'..."
1,1061165982855634944,Celebrities evacuate California as wildfires rage,"[{'end_offset': 31, 'start_offset': 21, 'text'..."


In [20]:
data1 = data.sample(frac=1).reset_index(drop=True)
data1.head(2)

Unnamed: 0,tweet_id,text,location_mentions
0,721576641936408576,car & Overpass #EARTHQUAKE in Ecuador upgraded...,"[{'end_offset': 37, 'start_offset': 30, 'text'..."
1,908449017134231553,7th grade. Learning about earthquakes with Goo...,"[{'end_offset': 101, 'start_offset': 96, 'text..."


## Text preprocessing

In [21]:
nlp=spacy.load('en_core_web_sm')

In [22]:
doc=nlp('RAW: Aerial view of flooding damage in Nebraska  via @YouTube')
doc

RAW: Aerial view of flooding damage in Nebraska  via @YouTube

In [23]:
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)

In [24]:
## convert dataframes to dict
train_data = data1.to_dict(orient='records')
val_data = val_df.to_dict(orient='records')

In [25]:
train_data[0:2]

[{'tweet_id': '721576641936408576',
  'text': 'car & Overpass #EARTHQUAKE in Ecuador upgraded to 7.8 magnituden .2016 #Manabí #ecuador #BREAKING #earthquake',
  'location_mentions': array([{'end_offset': 37, 'start_offset': 30, 'text': 'Ecuador'},
         {'end_offset': 78, 'start_offset': 72, 'text': 'Manabí'},
         {'end_offset': 87, 'start_offset': 80, 'text': 'ecuador'}],
        dtype=object)},
 {'tweet_id': '908449017134231553',
  'text': '7th grade. Learning about earthquakes with Google Cardboards and touring earthquake damage from Japan and Nepal. #rockdalepride',
  'location_mentions': array([{'end_offset': 101, 'start_offset': 96, 'text': 'Japan'},
         {'end_offset': 111, 'start_offset': 106, 'text': 'Nepal'}],
        dtype=object)}]

In [26]:
# Sample data
data = train_data[0:3]

# Extract location mentions
for tweet in data:
    text = tweet['text']
    doc = nlp(text)
    extracted_locations = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
    print(f"Text: {text}")
    print(f"Extracted Locations: {extracted_locations}")

Text: car & Overpass #EARTHQUAKE in Ecuador upgraded to 7.8 magnituden .2016 #Manabí #ecuador #BREAKING #earthquake
Extracted Locations: ['Ecuador', 'Manabí']
Text: 7th grade. Learning about earthquakes with Google Cardboards and touring earthquake damage from Japan and Nepal. #rockdalepride
Extracted Locations: ['Japan', 'Nepal']
Text: 1news reports that TSB Arena & BNZ centre on the waterfront has sustained most damage #Wellington #eqnz
Extracted Locations: ['Wellington']


some locations are not recognized as GPE entity

In [27]:
val_data[0]

{'tweet_id': '1065347154078773250',
 'text': 'If youre looking for legitimate relief organizations to help those affected by the CA fires, I found this link: How to Help Those Affected by California Wildfires - Consumer Reports',
 'location_mentions': array([{'end_offset': 85, 'start_offset': 83, 'text': 'CA'},
        {'end_offset': 152, 'start_offset': 142, 'text': 'California'}],
       dtype=object)}

In [28]:
train_ds=[]
for data in train_data:
    temp_dict={}
    temp_dict['text'] = data['text']
    temp_dict['entities'] = []
    for annotation in data['location_mentions']:
        start = annotation['start_offset']
        end = annotation['end_offset']
        label = "LOCATION"
        text = annotation['text']
        temp_dict['entities'].append((start,end,label,text))
    train_ds.append(temp_dict)

train_ds[0:3]
    
    

[{'text': 'car & Overpass #EARTHQUAKE in Ecuador upgraded to 7.8 magnituden .2016 #Manabí #ecuador #BREAKING #earthquake',
  'entities': [(30, 37, 'LOCATION', 'Ecuador'),
   (72, 78, 'LOCATION', 'Manabí'),
   (80, 87, 'LOCATION', 'ecuador')]},
 {'text': '7th grade. Learning about earthquakes with Google Cardboards and touring earthquake damage from Japan and Nepal. #rockdalepride',
  'entities': [(96, 101, 'LOCATION', 'Japan'),
   (106, 111, 'LOCATION', 'Nepal')]},
 {'text': '1news reports that TSB Arena & BNZ centre on the waterfront has sustained most damage #Wellington #eqnz',
  'entities': [(87, 97, 'LOCATION', 'Wellington')]}]

In [29]:
val_ds=[]
for data in val_data:
    temp_dict={}
    temp_dict['text'] = data['text']
    temp_dict['entities'] = []
    for annotation in data['location_mentions']:
        start = annotation['start_offset']
        end = annotation['end_offset']
        label = "LOCATION"
        text = annotation['text']
        temp_dict['entities'].append((start,end,label,text))
    val_ds.append(temp_dict)

val_ds[0:3]

[{'text': 'If youre looking for legitimate relief organizations to help those affected by the CA fires, I found this link: How to Help Those Affected by California Wildfires - Consumer Reports',
  'entities': [(83, 85, 'LOCATION', 'CA'),
   (142, 152, 'LOCATION', 'California')]},
 {'text': 'Like so many things, the destruction of Californias massive Camp fire was less natural, more man made. The Camp fire burned homes but left trees standing.',
  'entities': [(40, 51, 'LOCATION', 'Californias')]},
 {'text': 'The wait for Paradise: From young to old, evacuees displaced by the #campfire endure another restless & cold night in their cars, container trucks & tents, at the Walmart parking lot in Chico, Calif.',
  'entities': [(186, 191, 'LOCATION', 'Chico'),
   (193, 199, 'LOCATION', 'Calif.')]}]

### Tokenization

In [30]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')



In [31]:
def tokenize_and_align_labels(text, entities):
    tokenized_inputs = tokenizer(text, return_offsets_mapping=True, padding=True, truncation=True)
    labels = [0] * len(tokenized_inputs["input_ids"])  # Default to O label
    for start, end, label, text in entities:
        # Get the tokens that fall within the entity span
        for i, (offset_start, offset_end) in enumerate(tokenized_inputs["offset_mapping"]):
            if offset_start == start:
                labels[i] = 1  # B-LOCATION
            elif offset_start > start and offset_end <= end:
                labels[i] = 2  # I-LOCATION

    # Mask special tokens ([CLS], [SEP]) with -100
    labels[0] = -100  # [CLS]
    labels[-1] = -100  # [SEP]

    return {
        "input_ids": tokenized_inputs["input_ids"],
        "attention_mask": tokenized_inputs["attention_mask"],
        "labels": labels
    }



In [32]:
train_ds1 = []
for example in train_ds:
    tokenized_example = tokenize_and_align_labels(example['text'], example['entities'])
    train_ds1.append(tokenized_example)

In [33]:
train_ds[3]

{'text': 'MALDIVES DONATES USD 50,000 FOR KERALA RELIEF Token contribution in solidarity with the people of India” Maldives President’s Office. #KeralaSOS #KeralaFloods @MDVinIND @SushmaSwaraj',
 'entities': [(0, 8, 'LOCATION', 'MALDIVES'),
  (32, 38, 'LOCATION', 'KERALA'),
  (98, 103, 'LOCATION', 'India'),
  (105, 113, 'LOCATION', 'Maldives')]}

In [34]:
# train_ds1[0]

In [35]:
val_ds1 = []
for example in val_ds:
    tokenized_example = tokenize_and_align_labels(example['text'], example['entities'])
    val_ds1.append(tokenized_example)

In [36]:
#train_ds1[0]

In [37]:
# Define label mappings
id2label = {
    0: 'O',           
    1: 'B-LOCATION',  
    2: 'I-LOCATION', 
    -100: 'IGNORE'    
}

label2id = {v: k for k, v in id2label.items()}


In [38]:
len(label2id)

4

## Training the model

In [39]:
model = BertForTokenClassification.from_pretrained(
   'bert-base-cased',
    num_labels=len(label2id), 
    label2id=label2id,         
    id2label=id2label           
) 


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    flattened_labels = labels.flatten()
    flattened_preds = preds.flatten()
    valid_indices = flattened_labels != -100
    filtered_labels = flattened_labels[valid_indices]
    filtered_preds = flattened_preds[valid_indices]
    
    # Handle case where there are no valid labels after filtering
    if len(filtered_labels) == 0:
        return {
            'accuracy': 0.0,
            'f1': 0.0,
            'precision': 0.0,
            'recall': 0.0
        }
    
    precision, recall, f1, _ = precision_recall_fscore_support(filtered_labels, filtered_preds, average='weighted')
    
    # Compute accuracy
    acc = accuracy_score(filtered_labels, filtered_preds)
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [41]:
#data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer,label_pad_token_id=-100)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [42]:
training_args = TrainingArguments(
    output_dir='./model',
    num_train_epochs=3,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=70,
    evaluation_strategy="steps",
    eval_steps=70,
    save_steps=70,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    save_total_limit=2,
    learning_rate=6e-5,
    lr_scheduler_type='cosine',
    gradient_accumulation_steps=5,  
    fp16=False, 
    report_to=["none"],
    max_grad_norm=1.0,
    seed=42
    
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds1,
    eval_dataset=val_ds1,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)




In [43]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
70,0.1386,0.065173,0.978314,0.977488,0.977652,0.978314
140,0.0627,0.048312,0.982278,0.982239,0.982221,0.982278
210,0.0554,0.042034,0.98441,0.984332,0.984278,0.98441
280,0.0558,0.037319,0.987147,0.987221,0.987326,0.987147
350,0.0453,0.036026,0.986335,0.98606,0.986018,0.986335
420,0.048,0.033773,0.98794,0.987922,0.987913,0.98794
490,0.0513,0.029237,0.990582,0.990572,0.990574,0.990582
560,0.0344,0.025263,0.991394,0.991365,0.991345,0.991394
630,0.032,0.023748,0.992035,0.992077,0.992138,0.992035
700,0.0294,0.020821,0.992451,0.992494,0.992566,0.992451


TrainOutput(global_step=1422, training_loss=0.03800686841775596, metrics={'train_runtime': 20737.5684, 'train_samples_per_second': 1.714, 'train_steps_per_second': 0.069, 'total_flos': 1069917799001976.0, 'train_loss': 0.03800686841775596, 'epoch': 3.0})

In [44]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.011307409964501858, 'eval_accuracy': 0.9961309074437567, 'eval_f1': 0.9961473056876373, 'eval_precision': 0.9961753184205558, 'eval_recall': 0.9961309074437567, 'eval_runtime': 173.2398, 'eval_samples_per_second': 8.56, 'eval_steps_per_second': 1.714, 'epoch': 3.0}


## save the model

In [45]:
model.save_pretrained('./results')
tokenizer.save_pretrained('./results')

('./results\\tokenizer_config.json',
 './results\\special_tokens_map.json',
 './results\\vocab.txt',
 './results\\added_tokens.json',
 './results\\tokenizer.json')