In [1]:
import pandas as pd
from transformers import BertForTokenClassification,BertTokenizerFast,pipeline
import jiwer
import re
from tqdm import tqdm
from jiwer import wer

In [2]:
# Load the token-classification model and the fast tokenizer saved in the
# local './results' directory.
model = BertForTokenClassification.from_pretrained('./results')
tokenizer = BertTokenizerFast.from_pretrained('./results')

In [3]:
model_checkpoint='./results'

# Build the NER pipeline from the `model` and `tokenizer` objects that were
# already loaded from './results', instead of passing the checkpoint path and
# loading a second copy of the same weights from disk.
# aggregation_strategy="simple" asks the pipeline to merge word pieces into
# whole-entity spans.
token_classifier = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

token_classifier




<transformers.pipelines.token_classification.TokenClassificationPipeline at 0x1bd87cb9f70>

## Defining functions

In [4]:
def infer(sentences, token_classifier):
    """Run the classifier on every sentence and return one string per sentence.

    For each sentence the 'word' field of every classifier result is collected
    and sorted. The sorted words are then regrouped: a title-cased word closes
    the group being built and starts a new one, while any other word is
    appended (space-separated) to the current group. Groups are finally joined
    with ", ".

    Args:
        sentences: Iterable of input strings.
        token_classifier: Callable that takes one sentence and returns an
            iterable of mappings, each with a 'word' key.

    Returns:
        list[str]: One comma-separated string per input sentence (empty string
        when the classifier returned nothing).
    """
    joined_predictions = []

    for sentence in tqdm(sentences, desc="Processing sentences"):
        words = [hit['word'] for hit in token_classifier(sentence)]
        words.sort()

        groups = []
        buffer = ""
        for word in words:
            # `buffer` is empty only before the first word, so the first word
            # always lands in the else-branch.
            if buffer and word.istitle():
                groups.append(buffer.strip())
                buffer = word
            else:
                buffer = f"{buffer} {word}"
        if buffer:
            groups.append(buffer.strip())

        joined_predictions.append(", ".join(groups))

    return joined_predictions


In [5]:
def compute_wer(predicted_texts, reference_texts):
    """Return the word error rate of the predictions against the references.

    Note the argument order: this function takes predictions first, but hands
    them to ``wer`` as (references, predictions).

    Args:
        predicted_texts: Predicted string(s).
        reference_texts: Ground-truth string(s).

    Returns:
        Whatever ``wer`` returns for the pair.
    """
    return wer(reference_texts, predicted_texts)

In [6]:
def preprocess_text(text):
    """Strip everything except ASCII letters, digits, whitespace and commas.

    Missing values (``None`` or a float NaN, which is how pandas represents an
    empty CSV cell) are returned as an empty string instead of raising
    ``TypeError`` inside ``re.sub``.

    Args:
        text: Raw tweet text, or a missing-value marker.

    Returns:
        str: The filtered text. Whitespace is left untouched, so removed
        characters can leave double spaces behind.
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # '@' is not in the allowed set, so this single pass also removes it;
    # no separate '@' substitution is needed.
    return re.sub(r'[^a-zA-Z0-9\s,]', '', text)


In [7]:
def clean_text(text):
    """Tidy one predicted-location string.

    A fixed list of regex substitutions is applied strictly in order (each
    rule sees the output of the previous one). Afterwards every run of
    whitespace is collapsed to a single space and the ends are trimmed.

    Args:
        text: String to clean.

    Returns:
        str: The cleaned string (possibly empty).
    """
    # (pattern, replacement, flags); order matters.
    rules = (
        # Remove '##' together with surrounding whitespace
        # (presumably BERT word-piece markers; confirm against model output).
        (r'\s*##\s*', '', 0),
        # Remove whitespace on either side of a period.
        (r'\s*\.\s*', '.', 0),
        # Drop one of these short tokens when it opens the string.
        (r'^(a|at|be|s|u)\s+', '', re.IGNORECASE),
        # Hard-coded tokens to delete when they appear as whole words.
        (r'\b(MaheshBabu)\b', '', re.IGNORECASE),
        (r'\b(MONSTA)\b', '', re.IGNORECASE),
        (r'\b(ososscate)\b', '', re.IGNORECASE),
        # Drop every non-ASCII character.
        (r'[^\x00-\x7F]+', '', 0),
        # Delete any word that contains one of these terms.
        (r'\b\w*(Fires|Floods|Earthquakes)\w*\b', '', re.IGNORECASE),
    )
    for pattern, replacement, flags in rules:
        text = re.sub(pattern, replacement, text, flags=flags)

    return ' '.join(text.split())

In [8]:
def clean_locations(location_string):
    """Split a comma-separated string into unique, non-empty location names.

    Each fragment is stripped of surrounding whitespace. Blank fragments, which
    appear when the string has a leading, trailing or doubled comma, are
    skipped so they cannot show up as empty "locations" (and as stray spaces
    once the list is joined). Repeats are dropped, keeping first-seen order.

    Args:
        location_string: Comma-separated location names.

    Returns:
        list[str]: Unique names in order of first appearance; an empty list
        when the input holds no non-blank fragment.
    """
    stripped = (part.strip() for part in location_string.split(','))
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(name for name in stripped if name))

In [9]:
def flatten_and_combine(locations_list):
    """Join each inner list of strings into one space-separated string.

    Args:
        locations_list: Iterable of lists of strings.

    Returns:
        list[str]: One joined string per inner list, in the same order
        (an empty inner list yields an empty string).
    """
    return [" ".join(parts) for parts in locations_list]

## Inference and prediction

In [10]:
# Load the labelled training set and preview the first two rows.
data=pd.read_csv('Train_1.csv')
data.head(2)

Unnamed: 0,tweet_id,text,location
0,ID_1001136212718088192,,EllicottCity
1,ID_1001136696589631488,"Flash floods struck a Maryland city on Sunday,...",Maryland


In [11]:
# Keep only rows that have both a tweet text and a reference location.
data1=data.dropna(subset=['text','location'])

In [12]:
# Raw tweet texts of the retained training rows, as a plain Python list.
text1=data1.text.to_list()

In [13]:
# Peek at the first two raw tweets.
text1[:2]

['Flash floods struck a Maryland city on Sunday, washing out streets and tossing cars like bath toys.',
 'State of emergency declared for Maryland flooding:  via @YouTube']

In [14]:
# Apply the character filter to every training tweet before inference.
text2 = [preprocess_text(text) for text in text1]

In [15]:
# Peek at the same two tweets after preprocessing.
text2[:2]

['Flash floods struck a Maryland city on Sunday, washing out streets and tossing cars like bath toys',
 'State of emergency declared for Maryland flooding  via YouTube']

In [16]:
# Run the NER pipeline over the preprocessed training tweets (one call per
# tweet; the recorded run took about 33 minutes for 11,849 tweets).
text3 = infer(text2,token_classifier)

Processing sentences: 100%|███████████████████████████████████████████████████████████████████| 11849/11849 [32:36<00:00,  6.06it/s]


In [17]:
# Post-process the raw training predictions with the regex clean-up rules.
clean_pred_texts = [clean_text(text) for text in text3]

In [18]:
#clean_pred_texts[:10]

In [19]:
# Split each cleaned prediction on commas and drop repeated locations.
cleaned_locations = [clean_locations(loc) for loc in clean_pred_texts]

In [20]:
#cleaned_locations[:3]

In [21]:
# Re-join each tweet's unique locations into one space-separated string.
clean_preds = flatten_and_combine(cleaned_locations)

In [22]:
#clean_preds[5:]

In [23]:
# Reference locations for the same rows (and in the same order) as text1.
true_texts=data1.location.to_list()

In [24]:
#true_texts[5:]

In [25]:
# Word error rate of the cleaned training predictions against the references.
compute_wer(clean_preds,true_texts)
# Presumably WER values noted from earlier runs: 0.10355546766857821, 0.08552501483092742

0.11696658097686376

In [26]:
# Load the unlabelled test set and pull out the tweet texts.
# NOTE(review): unlike the training path, no dropna is applied here, so a
# missing `text` value would reach preprocess_text as NaN — confirm the test
# file has none.
test_csv=pd.read_csv('Test.csv')
test_text=test_csv.text.to_list()

In [27]:
# Apply the same character filter used for the training tweets.
test_text2 = [preprocess_text(text) for text in test_text]

In [28]:
# Run the NER pipeline over the preprocessed test tweets.
pred_texts = infer(test_text2, token_classifier)

Processing sentences: 100%|█████████████████████████████████████████████████████████████████████| 2942/2942 [08:01<00:00,  6.11it/s]


In [29]:
# Peek at the first three raw test predictions.
pred_texts[:3]

['New England, New Orleans', 'MARYLAND', 'Ellicott City, Maryland']

In [30]:
# Clean every raw test prediction; a prediction that ends up empty is replaced
# by the literal string 'nan'.
clean_pred_texts = [clean_text(text) or 'nan' for text in pred_texts]

In [31]:
# Split each cleaned test prediction on commas and drop repeated locations.
cleaned_locations = [clean_locations(loc) for loc in clean_pred_texts]

In [32]:
# Re-join each test tweet's unique locations into one space-separated string.
clean_preds = flatten_and_combine(cleaned_locations)

In [33]:
# Sanity check: one prediction per test tweet (2942 sentences were processed).
len(clean_preds)

2942

In [34]:
# Peek at the first five final test predictions.
clean_preds[:5]

['New England New Orleans',
 'MARYLAND',
 'Ellicott City Maryland',
 'Ellicott City Maryland Md',
 'Ellicott City Maryland']

In [35]:
# Assemble the submission table: tweet ids from the test file paired with the
# predictions. clean_preds was built from test_csv.text in order with no rows
# removed, so the two columns line up row for row.
results_df = pd.DataFrame({
    'tweet_id': test_csv['tweet_id'],
    'location': clean_preds
})

results_df.tail(5)

Unnamed: 0,tweet_id,location
2937,ID_915017703055749120,Mexico
2938,ID_915026957758328832,Las Vegas Mexico
2939,ID_915253441726889984,Calgary Mexico City
2940,ID_915971980859400192,Chiapas Mexicos Oaxaca
2941,ID_916099144116191232,Mexico


In [36]:
# Write the submission to disk without the DataFrame index column.
file_path = 'test_submission3.csv'
results_df.to_csv(file_path, index=False)