In [87]:
!pip list | grep "torch\|tensorflow\|transformers\|opencv\|scikit-learn"


opencv-contrib-python              4.10.0.84
opencv-python                      4.10.0.84
opencv-python-headless             4.10.0.84
pytorch-ignite                     0.5.1
pytorch-lightning                  2.5.0.post0
scikit-learn                       1.2.2
scikit-learn-intelex               2025.2.0
sentence-transformers              3.3.1
tensorflow                         2.17.1
tensorflow-cloud                   0.1.5
tensorflow-datasets                4.9.7
tensorflow_decision_forests        1.10.0
tensorflow-hub                     0.16.1
tensorflow-io                      0.37.1
tensorflow-io-gcs-filesystem       0.37.1
tensorflow-metadata                1.13.1
tensorflow-probability             0.24.0
tensorflow-text                    2.17.0
torch                              2.5.1+cu121
torchaudio                         2.5.1+cu121
torchinfo                          1.8.0
torchmetrics                       1.6.1
torchsummary                       1.5.1
torchtune       

In [88]:
!apt-get update
!apt-get install -y fonts-noto


Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease         
Hit:3 https://r2u.stat.illinois.edu/ubuntu jammy InRelease                                          
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease                                              
Hit:5 http://security.ubuntu.com/ubuntu jammy-security InRelease                                    
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease                   
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/u

In [89]:
!pip install torch torchvision transformers opencv-python scikit-learn googletrans==4.0.0rc1 datasets seqeval evaluate



In [90]:
import pandas as pd

# Load the CSV file
url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTyh006zrmSWoKAwqF92kZ4lvNRKsZL5NYIkdSEK3wtrWA8yMKLT6K3h1k1nyyHjG9ntZvc9tTcbUif/pub?output=csv'
df = pd.read_csv(url,encoding="utf-8")
df.head()


Unnamed: 0,id,caption,label,level,area,Unnamed: 5
0,1,গোমতী নদীর পানি উত্তর পাশে পালপাড়া অংশে বাঁধ থ...,flood,1.0,,
1,2,গোমতী নদীর পানি উত্তর পাশে পালপাড়া অংশে বাঁধ থ...,flood,1.0,,
2,3,কুমিল্লা,flood,1.0,কুমিল্লা,
3,4,আখাউড়া উপজেলা ও কসবা উপজেলা'র বিভিন্ন এলাকায় ই...,flood,1.0,আখাউড়া,
4,5,ফেনীর মুহুরী নদীতে পানির মাত্রা গত ৪০ বছরের ইত...,flood,1.0,ফেনী,


In [91]:
import random
import numpy as np
import pandas as pd
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments
)
from datasets import Dataset as HFDataset
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [92]:
# Drop unnecessary columns if they exist
columns_to_drop = [col for col in [' id','level','label', 'Unnamed: 5'] if col in df.columns]
df = df.drop(columns_to_drop, axis=1)

# Shuffle the dataframe
df = shuffle(df, random_state=42)

In [93]:
# Verify column names
print(df.columns)



Index(['caption', 'area'], dtype='object')


In [94]:
# Remove rows with blank or NaN 'area'
df = df[df["area"].notna() & (df["area"].str.strip() != "")]

In [95]:
import unicodedata

def normalize(word):
    # Keep only Bangla characters and basic punctuation (preserve diacritics)
    word = word.strip().lower()
    word = re.sub(r"[^\u0980-\u09FF]", "", word)  # keep only Bangla unicode range
    return unicodedata.normalize("NFC", word) 

def create_word_label_pairs(row):
    caption_words = [normalize(w) for w in row["caption"].split()]
    label_words = [normalize(a) for a in str(row["area"]).split(",")]
    
    pairs = []
    for word in caption_words:
        label = 1 if any(label_word in word for label_word in label_words) else 0
        pairs.append((word, label))
    return pairs

# Load your dataset
# Example: df = pd.read_csv("your_data.csv")
# Assuming you already have df with 'caption' and 'area'

all_word_label_pairs = []
for _, row in df.iterrows():
    all_word_label_pairs.extend(create_word_label_pairs(row))

# Create a DataFrame for model training
word_df = pd.DataFrame(all_word_label_pairs, columns=["word", "label"])

# --------------------------
# STEP 2: Feature extraction using character-level TF-IDF
# --------------------------

vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(2, 4))
X = vectorizer.fit_transform(word_df["word"])
y = word_df["label"]


In [96]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)


In [97]:
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred, digits=3))

Classification Report:
              precision    recall  f1-score   support

           0      0.966     0.993     0.979      3600
           1      0.910     0.656     0.762       369

    accuracy                          0.962      3969
   macro avg      0.938     0.825     0.871      3969
weighted avg      0.961     0.962     0.959      3969



In [98]:
def extract_locations_from_caption(caption):
    words = caption.strip().split()
    norm_words = [normalize(w) for w in words]
    X_words = vectorizer.transform(norm_words)
    preds = model.predict(X_words)
    locations = [word for word, pred in zip(words, preds) if pred == 1]
    return locations

# Step 1: Build a known location vocabulary from your dataset
known_places = set([normalize(name) for sublist in df['area'].dropna().apply(lambda x: str(x).split(',')) for name in sublist])

# Step 2: Improved suffix stripper
def strip_bangla_suffix(word):
    suffixes = ['ের', 'তে', 'য়ে', 'য়', 'র', 'ে', 'কে', 'এর', 'এরই']
    for suffix in sorted(suffixes, key=lambda x: -len(x)):
        if word.endswith(suffix):
            root = word[:-len(suffix)]
            if root in known_places:
                return root
            else:
                return word  # fallback to original form

    return word


In [99]:
import random

# --------------------------
# STEP 7: Evaluate on 5 random samples (unseen-style)
# --------------------------

# Sample 5 random captions from the original df
sample_rows = df.sample(n=5, random_state=42)

for i, row in sample_rows.iterrows():
    caption = row["caption"]
    true_locations = [strip_bangla_suffix(normalize(a)) for a in str(row["area"]).split(",")]
    
    predicted_locations = extract_locations_from_caption(caption)
    normalized_preds = [strip_bangla_suffix(normalize(p)) for p in predicted_locations]


    print(f"\n🔹 Caption {i+1}: {caption}")
    print(f"✅ True Locations: {true_locations}")
    print(f"🧠 Predicted Locations: {normalized_preds}")



🔹 Caption 3441: মিয়ানমারের প্রাচীন নগরী মান্দালয় ধ্বংসস্তূপে পরিণত হয়েছে
✅ True Locations: ['মিয়ানমার']
🧠 Predicted Locations: ['মিয়ানমার']

🔹 Caption 3305: জাপানে শক্তিশালী ভূমিকম্পের সর্বশেষ
✅ True Locations: ['জাপান']
🧠 Predicted Locations: ['জাপান']

🔹 Caption 3060: ঘূর্ণিঝড় ইয়াসের প্রভাবে থেমে থেমে ঝিরিঝিরি বৃষ্টি হচ্ছে সুন্দরবনসংলগ্ন উপকূলে। জোয়ারে কপোতাক্ষ নদের পানি ফুঁসছে। দশহালিয়া, কয়রা উপজেলা, খুলনা
✅ True Locations: ['খুলনা']
🧠 Predicted Locations: ['খুলনা']

🔹 Caption 84: এখন পর্যন্ত আপনাদের পাঠানো সাহায্যে বন্যা ত্রাণ ডিস্ট্রিবিউশন আপডেট: ৪ লক্ষ ৭১ হাজার ৫০০ টাকা (৪,৭১,৫০০ টাকা)
২৫/৮/২০২৪
১৯. লক্ষীপুর এর বন্যা পরিস্থিতি ক্রমাগত অবনতির জন্য মোঃ মেহেদী হাসান ছাব্বী র টিমের মাধ্যমে 
✅ True Locations: ['লক্ষীপুর']
🧠 Predicted Locations: ['লক্ষীপুর']

🔹 Caption 769: রাজশাহীতে লাইট চলে গেছে… সবাই দৌড়াদৌড়ি করছে।
✅ True Locations: ['রাজশাহী']
🧠 Predicted Locations: ['রাজশাহী']
