In [None]:
# Mount Google Drive so the checkpoint and dataset paths under /content/drive resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory of the fine-tuned checkpoint; the tokenizer, config and model weights are all loaded from here.
model_path = '/content/drive/MyDrive/Models/crf-masked/xlmroberta-multitask-tokenclf-withCRF-nomisc'

In [None]:
import pandas as pd
import random
import numpy as np
import torch

# Single seed shared by every random number generator used in this notebook.
seed_value = 64

# Apply the same seed to each library's generator in turn.
for seed_fn in (
    random.seed,                 # Python's built-in RNG
    np.random.seed,              # NumPy's global RNG
    torch.manual_seed,           # PyTorch
    torch.cuda.manual_seed,      # If using CUDA
    torch.cuda.manual_seed_all,  # If using multiple GPUs
):
    seed_fn(seed_value)

In [None]:
# Label vocabularies. A tag's id is its position in the list, so the ORDER IS
# LOAD-BEARING: the ids must stay identical to the ones used when the
# checkpoint was saved. Do not reorder or insert in the middle.
pos_label2id = {
    tag: tag_id
    for tag_id, tag in enumerate([
        'CC',   # Coordinating conjunction
        'CD',   # Cardinal number
        'OD',   # Ordinal number
        'DT',   # Determiner
        'FW',   # Foreign word
        'IN',   # Preposition
        'JJ',   # Adjective
        'MD',   # Modal
        'NEG',  # Negation
        'NN',   # Noun
        'NNP',  # Proper noun
        'NND',  # Classifier
        'PR',   # Demonstrative pronoun
        'PRP',  # Personal pronoun
        'RB',   # Adverb
        'RP',   # Particle
        'SC',   # Subordinating conjunction
        'SYM',  # Symbol
        'UH',   # Interjection
        'VB',   # Verb
        'WH',   # Question
        'Z',    # Punctuation
    ])
}

ner_label2id = {
    tag: tag_id
    for tag_id, tag in enumerate([
        'O',         # Outside (non-entity)
        'B-LOC',     # Beginning of a location entity
        'I-LOC',     # Inside a location entity
        'B-PERSON',  # Beginning of a person entity
        'I-PERSON',  # Inside a person entity
        'B-ORG',     # Beginning of an organization entity
        'I-ORG',     # Inside an organization entity
        'B-EVENT',   # Beginning of an event entity
        'I-EVENT',   # Inside an event entity
        'B-GPE',     # Beginning of a geopolitical entity
        'I-GPE',     # Inside a geopolitical entity
        'PAD',       # Padding token
    ])
}

In [None]:
# Reverse lookups (id -> tag) used to turn predicted ids back into label strings.
pos_id2label = dict(zip(pos_label2id.values(), pos_label2id.keys()))
ner_id2label = dict(zip(ner_label2id.values(), ner_label2id.keys()))

# 2. Data

In [None]:
import ast


def _parse_cell(value):
    """Evaluate a string cell as a Python literal; return non-string cells unchanged."""
    return ast.literal_eval(value) if isinstance(value, str) else value


full_dataset = pd.read_csv('/content/drive/MyDrive/Datasets/Skripsi/jakarta/jakarta_normalized.csv')

# For a quick smoke test, work on a small slice instead:
# full_dataset = full_dataset[920:950]

# The CSV stores Python literals as text, so every string cell in every column
# is parsed back into the object it represents.
for column_name in full_dataset.columns:
    full_dataset[column_name] = full_dataset[column_name].apply(_parse_cell)

In [None]:
# full_dataset = pd.DataFrame([
#     { 'tokens': ['hari', 'ini', 'sty', 'bergema', 'di', 'sydney', 'football', 'stadium', 'tanggal', '25', 'akan', 'bergema', 'di', 'gbk', 'sepertinya', '.'] },
#     { 'tokens': ['potongan', 'video', 'terakhir', 'paus', 'fransiskus', 'saat', 'menyapa', 'masyarakat', 'di', 'perayaan', 'paskah', 'kemarin'] },
#     { 'tokens': ['salah', 'satu', 'hotel', 'di', 'jogja', 'yang', 'bernuansa', 'eropa', 'klasik', 'dengan', 'berbagai', 'lukisan', 'dan', 'furniture', 'khas', 'eropa', 'menurutku', 'ini', 'bisa', 'jadi', 'salah', 'satu', 'rekomendasi', 'hotel', 'kalau', 'kalian', 'liburan', 'ke', 'jogja', '.', 'untuk', 'harga', 'menurutku', 'affordable', 'dan', 'lokasi', 'strategis', 'dari', 'stasiun', 'tugu', 'malioboro', 'dan'] },
#     { 'tokens': ['di', 'jalur', 'pantura', 'jalan', '.', 'raya', 'tegal', '-', 'pemalang', 'masih', 'banyak', 'di', 'lintasi', 'truk-truk', 'besar', 'dan', 'rata', 'rata', 'kendaraan', 'yang', 'berjalan', 'lambat', 'ada', 'di', 'lajur', 'kanan', 'sehingga', 'kita', 'terpaksa', 'menyalip', 'dari', 'kiri', '.', 'kondisi', 'jalan', 'mulus', '.', 'persembahan', '@khongguan_id'] },
#     { 'tokens': ['makanya', 'demo', 'di', 'era', 'sekarang', 'ini', 'harusnya', 'udah', 'bergeser', 'ke', 'menduduki', 'jalanan', 'utama', 'yang', 'dimana', 'bisa', 'berpengaruh', 'dengan', 'perekonomian', 'negara', '.', 'demo', 'di', 'hongkong', '(', '2019', ')', 'udah', 'seperti', 'itu', 'mereka', 'menduduki', 'semua', 'jalanan', 'utama', 'dan', 'menutup', 'akses', 'stasiun', '.', 'nyata', 'berpengaruh', 'ekonomi', 'negara', 'lumpuh', '.'] },
#     { 'tokens': ['percepatan', 'waktu', 'tempuh', 'ini', 'berkat', 'peningkatan', 'prasarana', 'perkeretaapian', 'dan', 'pengurangan', 'stasiun', 'perhentian', '.', 'kini', 'ka', 'argo', 'bromo', 'anggrek', 'hanya', 'berhenti', 'di', 'stasiun', 'cirebon', 'dan', 'semarang', 'tawang', 'bank', 'jateng', 'saja', 'guys', '!', '️', 'jadi', 'siapa', 'yang', 'mudiknya', 'nanti', 'naik', 'sang', 'raja', 'utara', 'ini', '?', 'coba'] },
#     { 'tokens': ['sebelum', 'subuh', 'kakinya', 'sudah', 'menapaki', 'stasiun', 'mengejar', 'krl', 'pertama', '.', 'bocah', 'lelaki', 'berdarah', 'aceh', 'dan', 'batak', 'ini', 'melangkah', 'gagah', 'melawan', 'kantuk', 'demi', 'bisa', 'sekolah', '.', 'kak', 'rini', 'uwanya', 'sering', 'menemaninya', 'karena', 'ibunya', 'sakit', '.'] },
#     { 'tokens': ['@masbro_back', 'kejadiannya', 'disini', 'jalan', '.', 'm', '.', 't', '.', 'haryono', 'seberang', 'stasiun', 'lrt', 'ciliwung', '.', 'video', 'sepertinya', 'diambil', 'dr', 'gedung', 'waskita', 'rajawali', 'tower'] },
#     { 'tokens': ['komisi', 'vi', 'dpr', 'republik indonesia', 'menegaskan', 'bahwa', 'bumn', 'sektor', 'transportasi', 'darat', 'akan', 'menghadapi', 'tantangan', 'besar', 'akibat', 'lonjakan', 'jumlah', 'pemudik', 'saat', 'kunjungan', 'ke', 'stasiun', 'kereta', 'cepat', 'tegalluar', 'summarecon', 'kabupaten', 'bandung', 'jawa', 'barat', 'jumat', '(', '21/3', '/', '2025', ')', '.'] },
#   ]
# )

In [None]:
# Check whether the number of tokens matches the number of POS and NER labels in each row

# for index, row in full_dataset.iterrows():
#   if len(row['tokens']) != len(row['pos_labels']) or len(row['tokens']) != len(row['ner_labels']):
#     print(index, len(row['tokens']), len(row['ner_labels']), len(row['pos_labels']), sep='\t')

# 3. Tokenizer

In [None]:
from transformers import XLMRobertaTokenizerFast

# Load the fast tokenizer stored alongside the fine-tuned checkpoint. Later cells
# read per-token word ids from its encodings to map subword predictions back to words.
tokenizer_crf = XLMRobertaTokenizerFast.from_pretrained(model_path)

In [None]:
# Both encodings are produced from the same pre-split sentences with the same
# options; only the container type of the result differs.
token_lists = full_dataset['tokens'].tolist()   # list of list of words
tokenize_kwargs = dict(
    is_split_into_words=True,
    return_offsets_mapping=True,
    padding=True,
    truncation=True,
)

# 1) Plain-Python encoding, kept for its per-example word-id information.
raw_enc = tokenizer_crf(token_lists, return_tensors=None, **tokenize_kwargs)

# 2) PyTorch-tensor encoding, used as the model inputs.
tensor_enc = tokenizer_crf(token_lists, return_tensors="pt", **tokenize_kwargs)

In [None]:
!pip install pytorch-crf

Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl.metadata (2.4 kB)
Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [None]:
import torch
import torch.nn as nn
from transformers import XLMRobertaModel, XLMRobertaPreTrainedModel

In [None]:
from torchcrf import CRF
import torch.nn as nn

class XLMRobertaForMultiTaskTokenClassificationWithCRF(XLMRobertaPreTrainedModel):
    """XLM-RoBERTa encoder with two token-level heads.

    * POS head: a linear layer over the encoder states, trained with
      cross-entropy (positions labelled -100 are ignored).
    * NER head: a linear layer producing emission scores that are scored and
      decoded by a CRF.

    Both heads share the same dropout-regularised encoder output.
    """

    # Weight applied to the CRF negative log-likelihood when it is added to the POS loss.
    NER_LOSS_WEIGHT = 0.3

    def __init__(self, config, num_pos_labels, num_ner_labels):
        """Build the encoder, the two linear heads and the CRF.

        Args:
            config: model configuration (provides ``hidden_size`` and
                ``hidden_dropout_prob``).
            num_pos_labels: number of POS classes (> 0).
            num_ner_labels: number of NER tags handled by the CRF (> 0).
        """
        super().__init__(config)
        assert num_pos_labels > 0 and num_ner_labels > 0
        self.num_pos_labels = num_pos_labels
        self.num_ner_labels = num_ner_labels

        self.roberta = XLMRobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.classifier_pos = nn.Linear(config.hidden_size, num_pos_labels)
        self.classifier_ner = nn.Linear(config.hidden_size, num_ner_labels)

        self.crf = CRF(num_tags=num_ner_labels, batch_first=True)
        self.init_weights()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                labels_pos=None,
                labels_ner=None):
        """Run the encoder and both heads.

        Labels are optional: each loss term is only computed when its labels
        are supplied, so the model can be called for pure inference with just
        ``input_ids`` and ``attention_mask``.

        Args:
            input_ids: token ids for the batch.
            attention_mask: 1 for real tokens, 0 for padding; also used as the
                CRF mask. ``None`` lets the CRF treat every position as valid.
            labels_pos: optional POS targets; -100 marks ignored positions.
            labels_ner: optional NER targets for the CRF. Per the original
                note these are expected to contain no -100 values —
                verify against the training data pipeline.

        Returns:
            dict with
              * ``loss``: POS cross-entropy plus ``NER_LOSS_WEIGHT`` times the
                CRF negative log-likelihood, using whichever labels were
                given; ``None`` when no labels were given.
              * ``logits_pos``: POS logits per token.
              * ``emissions_ner``: NER emission scores per token.
              * ``predictions_ner``: CRF-decoded tag-id sequences, one list
                per example covering only the unmasked positions.
        """
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        seq_out = self.dropout(outputs.last_hidden_state)

        logits_pos = self.classifier_pos(seq_out)
        emissions_ner = self.classifier_ner(seq_out)
        mask = attention_mask.bool() if attention_mask is not None else None

        loss = None

        # POS loss: only when POS targets are available.
        if labels_pos is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(
                logits_pos.view(-1, self.num_pos_labels),
                labels_pos.view(-1)
            )

        # NER loss: negative CRF log-likelihood, only when NER targets are available.
        if labels_ner is not None:
            log_likelihood = self.crf(
                emissions_ner,
                labels_ner,
                mask=mask,
                reduction='mean'
            )
            ner_loss = -log_likelihood * self.NER_LOSS_WEIGHT
            loss = ner_loss if loss is None else loss + ner_loss

        # Best tag sequence per example; always returned, with or without labels.
        ner_preds = self.crf.decode(emissions_ner, mask=mask)

        return {
            "loss": loss,
            "logits_pos": logits_pos,
            "emissions_ner": emissions_ner,
            "predictions_ner": ner_preds,
        }


In [None]:
# from torchcrf import CRF
# import torch.nn as nn

# class XLMRobertaForMultiTaskTokenClassificationWithCRF(XLMRobertaPreTrainedModel):
#     def __init__(self, config, num_pos_labels, num_ner_labels):
#         super().__init__(config)
#         assert num_pos_labels > 0 and num_ner_labels > 0
#         self.num_pos_labels = num_pos_labels
#         self.num_ner_labels = num_ner_labels

#         self.roberta = XLMRobertaModel(config)
#         self.dropout = nn.Dropout(config.hidden_dropout_prob)

#         self.classifier_pos = nn.Linear(config.hidden_size, num_pos_labels)
#         self.classifier_ner = nn.Linear(config.hidden_size, num_ner_labels)

#         self.crf = CRF(num_tags=num_ner_labels, batch_first=True)
#         self.init_weights()

#     def forward(self,
#                 input_ids=None,
#                 attention_mask=None,
#                 labels_pos=None,
#                 labels_ner=None):
#         outputs = self.roberta(input_ids, attention_mask=attention_mask)
#         seq_out = self.dropout(outputs.last_hidden_state)

#         # POS
#         logits_pos = self.classifier_pos(seq_out)
#         loss = 0.0
#         if labels_pos is not None:
#             loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
#             loss += loss_fct(
#                 logits_pos.view(-1, self.num_pos_labels),
#                 labels_pos.view(-1)
#             )
#             # print(loss)

#         # NER with CRF
#         emissions_ner = self.classifier_ner(seq_out)

#         if labels_ner is not None:
#             mask = attention_mask.bool()
#             # labels_ner already has no -100
#             log_likelihood = self.crf(
#                 emissions_ner,
#                 labels_ner,
#                 mask=mask,
#                 reduction='mean'
#             )
#             # print(-log_likelihood)
#             ner_alpha = 0.3
#             loss += -log_likelihood * ner_alpha




#         # Decode for inference
#         ner_preds = None
#         if labels_ner is None:
#             mask = attention_mask.bool()
#             ner_preds = self.crf.decode(emissions_ner, mask=mask)

#         return {
#             "loss": loss,
#             "logits_pos": logits_pos,
#             "emissions_ner": emissions_ner,
#             "predictions_ner": ner_preds,
#         }


In [None]:
from transformers import XLMRobertaConfig

# Restore the configuration saved with the fine-tuned checkpoint.
config = XLMRobertaConfig.from_pretrained(model_path)

# Head sizes are derived from the label maps and must equal the values used at save time.
num_pos_labels = len(pos_label2id)
num_ner_labels = len(ner_label2id)

# Rebuild the custom multitask model and load its weights from the checkpoint.
model = XLMRobertaForMultiTaskTokenClassificationWithCRF.from_pretrained(
    model_path,
    config=config,
    num_pos_labels=num_pos_labels,
    num_ner_labels=num_ner_labels,
)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model.to(device)

Some weights of the model checkpoint at /content/drive/MyDrive/Models/crf-masked/xlmroberta-multitask-tokenclf-withCRF-nomisc were not used when initializing XLMRobertaForMultiTaskTokenClassificationWithCRF: ['crf.transition_mask']
- This IS expected if you are initializing XLMRobertaForMultiTaskTokenClassificationWithCRF from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMultiTaskTokenClassificationWithCRF from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


XLMRobertaForMultiTaskTokenClassificationWithCRF(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, 

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
class UnlabeledCRFDataset(Dataset):
    """Inference-only dataset pairing model inputs with word-alignment data.

    Each item bundles the tensors the model consumes with the original words
    and the token -> word index mapping needed to collapse subword predictions
    back to one prediction per word.
    """

    def __init__(self, sentences, raw_enc, tensor_enc):
        """
        sentences   : List[List[str]]   original sentences, already split into words
        raw_enc     : BatchEncoding produced without tensors (source of word ids)
        tensor_enc  : BatchEncoding produced with return_tensors="pt"
        """
        self.sentences = sentences
        self.input_ids = tensor_enc["input_ids"]
        self.attention_mask = tensor_enc["attention_mask"]
        # Resolve the word ids once, up front: one List[Optional[int]] per
        # example, taken from the per-example encoding objects.
        # (Equivalent alternative: raw_enc.word_ids(batch_index=i) for each i.)
        self.word_ids = [encoding.word_ids for encoding in raw_enc.encodings]

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the inputs, words and word ids of example ``idx`` as a dict."""
        item = {}
        item["input_ids"] = self.input_ids[idx]
        item["attention_mask"] = self.attention_mask[idx]
        item["words"] = self.sentences[idx]      # List[str]
        item["word_ids"] = self.word_ids[idx]    # List[Optional[int]]
        return item

In [None]:
import torch
from torch.utils.data import DataLoader

def collate_crf(batch):
    """Merge dataset items into one batch.

    The already-padded ``input_ids`` / ``attention_mask`` tensors are stacked
    along a new leading dimension; ``words`` and ``word_ids`` stay as plain
    Python lists (one entry per example).
    """
    def _column(key):
        return [example[key] for example in batch]

    return {
        "input_ids":      torch.stack(_column("input_ids"), dim=0),
        "attention_mask": torch.stack(_column("attention_mask"), dim=0),
        "words":          _column("words"),
        "word_ids":       _column("word_ids"),
    }


In [None]:
# Wrap the encodings in a dataset and serve them in order, 16 sentences at a time.
dataset = UnlabeledCRFDataset(
    sentences=full_dataset["tokens"].tolist(),
    raw_enc=raw_enc,
    tensor_enc=tensor_enc,
)
loader = DataLoader(
    dataset=dataset,
    batch_size=16,
    shuffle=False,
    collate_fn=collate_crf,
)

In [None]:
import torch
from torch.utils.data import DataLoader

def predict_crf_only_tokens(
    model,
    loader: DataLoader,
    device: torch.device,
    pos_id2label: dict,
    ner_id2label: dict,
    pad_ner_id: int = None
):
    """Predict one POS tag and one NER tag per word for every sentence in ``loader``.

    Token-level predictions are collapsed to word level by keeping the
    prediction of each word's first subword token.

    Args:
        model: called as ``model(input_ids=..., attention_mask=...)`` and
            expected to return a dict with ``logits_pos`` and
            ``emissions_ner``; ``predictions_ner`` is used when present,
            otherwise ``model.crf.decode`` is called.
        loader: iterable of batches with ``input_ids``, ``attention_mask``,
            ``words`` (List[List[str]]) and ``word_ids``
            (List[List[Optional[int]]]).
        device: device the input tensors are moved to.
        pos_id2label: POS id -> label string.
        ner_id2label: NER id -> label string.
        pad_ner_id: when not ``None``, words whose NER prediction equals this
            id are dropped from the output.

    Returns:
        ``(sentences, pos_tags, ner_tags)``: three parallel lists with one
        entry per sentence; within a sentence the three inner lists always
        have equal length. Words that received no prediction (no subword
        token, or cut off by truncation) are omitted so the lists stay aligned.
    """
    model.eval()
    all_sentences, all_pos_preds, all_ner_preds = [], [], []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            pos_preds_token = outputs["logits_pos"].argmax(-1).cpu().tolist()

            # Reuse the paths decoded inside forward() when the model provides
            # them; otherwise run the CRF decoder here.
            ner_preds_token = outputs.get("predictions_ner")
            if ner_preds_token is None:
                ner_preds_token = model.crf.decode(
                    outputs["emissions_ner"], mask=attention_mask.bool()
                )

            # Number of real (unpadded) tokens of EACH example. The previous
            # code used the first example's length for the whole batch, which
            # truncated every longer sentence in that batch.
            valid_lens = attention_mask.sum(dim=1).tolist()

            for words, wids, p_tok, n_tok, valid_len in zip(
                batch["words"], batch["word_ids"],
                pos_preds_token, ner_preds_token, valid_lens
            ):
                kept_words, pos_tags, ner_tags = [], [], []
                last_wid = None
                for wid, pid, nid in zip(wids[:valid_len], p_tok[:valid_len], n_tok):
                    # Skip special tokens and non-initial subwords.
                    if wid is None or wid == last_wid:
                        continue
                    last_wid = wid
                    # Optionally drop words the model tagged as padding.
                    if pad_ner_id is not None and nid == pad_ner_id:
                        continue
                    # Look the word up by its id rather than by position, so a
                    # word without tokens cannot shift the alignment.
                    kept_words.append(words[wid])
                    pos_tags.append(pos_id2label[pid])
                    ner_tags.append(ner_id2label[nid])

                all_sentences.append(kept_words)
                all_pos_preds.append(pos_tags)
                all_ner_preds.append(ner_tags)

    return all_sentences, all_pos_preds, all_ner_preds


In [None]:
# Words tagged with the PAD label are filtered out of the results;
# pass pad_ner_id=None instead to keep them.
pad_label_id = ner_label2id.get("PAD")

sentences, pos_tags, ner_tags = predict_crf_only_tokens(
    model=model,
    loader=loader,
    device=device,
    pos_id2label=pos_id2label,
    ner_id2label=ner_id2label,
    pad_ner_id=pad_label_id,
)

In [None]:
# Print every sentence with its word-level POS and NER predictions.
# The loop variables deliberately avoid the name `np`: the loop runs at module
# scope, so using it would rebind the NumPy alias for all later cells.
for sent, pos_seq, ner_seq in zip(sentences, pos_tags, ner_tags):
    print("Sentence:", " ".join(sent))
    print("POS Tags:", pos_seq)
    print("NER Tags:", ner_seq)
    print()

Sentence: bismillah hari ini kita berjuang lagi ya semoga dilancarkan dan dimudahkan dalam setiap urusan aamiin open pre order lokasi cipayung jakarta timur order h - 1 yang mau order silahkan dm aku fast respon langsung ke no whatsapp
POS Tags: ['UH', 'NN', 'PR', 'PRP', 'VB', 'RB', 'UH', 'RB', 'VB', 'CC', 'VB', 'IN', 'DT', 'NN', 'UH', 'JJ', 'RB', 'NN', 'NN', 'NNP', 'NNP', 'NNP', 'VB', 'NN', 'Z', 'CD', 'SC', 'MD', 'VB', 'RB', 'VB', 'PRP', 'JJ', 'VB', 'RB', 'IN', 'NN', 'NNP']
NER Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GPE', 'B-GPE', 'I-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG']

Sentence: peringatan dini jabodetabek durasi : senin 31 januari 2022 pukul 05.45 wib s . di pukul 08.00 wib
POS Tags: ['NN', 'JJ', 'NNP', 'NN', 'Z', 'NNP', 'CD', 'NNP', 'CD', 'NN', 'CD', 'NN', 'UH', 'Z', 'IN', 'NN', 'CD', 'NN']
NER Tags: ['O', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',

# Phrase Builder

In [None]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
import ast

In [None]:
# One row per sentence: its words plus the predicted POS and NER tag sequences.
mapping_set = pd.DataFrame(
    dict(
        tokens=sentences,
        pos_labels=pos_tags,
        ner_labels=ner_tags,
    )
)

In [None]:
# for col in full_dataset.columns:
#     full_dataset[col] = full_dataset[col].apply(
#         lambda x: ast.literal_eval(x) if isinstance(x, str) else x
#     )

In [None]:
def activity_phrase(sentence, pos_tags):
    """Extract activity phrases (verb runs, optionally closed by a noun) from a tagged sentence.

    A phrase starts at a word whose tag begins with ``VB`` and grows while
    further verbs follow. It is emitted when
      * a noun (tag beginning with ``NN``) follows: the noun is appended and
        the phrase is closed; or
      * any other tag follows, or the sentence ends, and the phrase already
        contains more than one word.
    A single verb that is not followed by a noun is discarded, and nouns that
    do not follow a verb are ignored.

    Args:
        sentence: list of words.
        pos_tags: list of POS tags, parallel to ``sentence``.

    Returns:
        List of phrases (space-joined words) in order of appearance.
    """
    activity_phrases = []
    pending = []  # words of the phrase currently being built

    def _flush_if_multiword():
        # Same word-count rule as mid-sentence: whitespace-split length > 1.
        text = " ".join(pending).strip()
        if len(text.split()) > 1:
            activity_phrases.append(text)

    for word, tag in zip(sentence, pos_tags):
        if tag.startswith('VB'):  # Verb: start or extend the phrase
            pending.append(word)
        elif tag.startswith('NN'):  # Noun: closes a phrase only if a verb came first
            if pending:
                pending.append(word)
                activity_phrases.append(" ".join(pending).strip())
                pending = []
        else:  # Any other tag ends the current phrase
            _flush_if_multiword()
            pending = []

    # A multi-verb phrase at the very end of the sentence used to be dropped
    # even though the same phrase mid-sentence was kept; flush it here too.
    _flush_if_multiword()
    return activity_phrases

In [None]:
def location_phrase(sentence, ner_tags):
    """Collect location phrases from an NER-tagged sentence.

    A phrase opens only at a ``B-LOC`` word. While a phrase is open, words
    tagged ``I-LOC``, ``B-GPE`` or ``I-GPE`` are appended to it; any other tag
    closes it (a ``B-LOC`` immediately opens the next one). GPE or ``I-LOC``
    words that do not follow an open phrase are ignored.

    Args:
        sentence: list of words.
        ner_tags: list of NER tags, parallel to ``sentence``.

    Returns:
        List of phrases (space-joined words) in order of appearance.
    """
    continuation_tags = ('I-LOC', 'B-GPE', 'I-GPE')
    found = []
    current = []  # words of the phrase that is currently open

    for token, label in zip(sentence, ner_tags):
        if not current:
            # Nothing open: only a B-LOC can start a phrase.
            if label == 'B-LOC':
                current = [token]
            continue

        if label in continuation_tags:
            current.append(token)
            continue

        # Any other tag ends the open phrase; B-LOC starts a fresh one.
        found.append(" ".join(current).strip())
        current = [token] if label == 'B-LOC' else []

    if current:
        found.append(" ".join(current).strip())
    return found

In [None]:
# Pair every location phrase found in a sentence with that sentence's activity phrases.
premapped = []
for tokens, pos_labels, ner_labels in zip(
    mapping_set['tokens'], mapping_set['pos_labels'], mapping_set['ner_labels']
):
    activities = activity_phrase(tokens, pos_labels)
    premapped.extend(
        {'address': place, 'activities': activities}
        for place in location_phrase(tokens, ner_labels)
    )

In [None]:
# Result table schema: one row per geocoded address with its coordinates,
# how many times it was mentioned, and the activities seen with it.
premapped_dataset = pd.DataFrame(columns=['address', 'lat', 'long', 'count', 'activities'])

In [None]:
# Sanity check: number of rows currently in the result table.
print(len(premapped_dataset))

0


In [None]:
import os

# Google Maps API key. Read from the GOOGLE_MAPS_API_KEY environment variable so
# the secret is never saved in the notebook; stays '' (no key) when it is unset.
api_key = os.environ.get('GOOGLE_MAPS_API_KEY', '')

In [None]:
from geopy.geocoders import GoogleV3, Nominatim

# Use Google only when a key is actually configured, otherwise fall back to
# Nominatim. The choice is made explicitly instead of waiting for the GoogleV3
# constructor to fail.
# NOTE(review): geopy appears to only warn (not raise) when the key is missing,
# in which case the old try/except never reached the fallback — confirm.
geolocator = None
if api_key:
    try:
        geolocator = GoogleV3(api_key=api_key, user_agent="Twittager-ID", domain="maps.google.co.id")
    except Exception:
        geolocator = None  # construction failed: use the fallback below
if geolocator is None:
    geolocator = Nominatim(user_agent="Twittager-ID")


In [None]:
# Geocode every extracted location phrase and aggregate the results by the
# canonical address returned by the geocoder.
#
# Rows are accumulated in a dict and the DataFrame is built once at the end,
# rather than growing the frame with pd.concat inside the loop. Identical
# phrases are looked up only once.
geocode_cache = {}   # raw phrase -> geocoder result (None when nothing was found)
aggregated = {}      # canonical address -> row under construction

for ent in premapped:
    raw = ent['address']
    if raw not in geocode_cache:
        try:
            geocode_cache[raw] = geolocator.geocode(raw)
        except Exception:
            continue  # skip any lookup errors (not cached, so a later mention retries)

    loc = geocode_cache[raw]
    if loc is None:
        continue

    full_addr = loc.address
    record = aggregated.get(full_addr)
    if record is None:
        # First time this canonical address is seen.
        aggregated[full_addr] = {
            'address':    full_addr,
            'lat':        loc.latitude,
            'long':       loc.longitude,
            'count':      1,
            'activities': set(ent['activities']),
        }
    else:
        # Seen before: increment the count and merge the de-duplicated activities.
        record['count'] += 1
        record['activities'] |= set(ent['activities'])

premapped_dataset = pd.DataFrame(
    [
        {**record, 'activities': sorted(record['activities'])}
        for record in aggregated.values()
    ],
    columns=['address', 'lat', 'long', 'count', 'activities'],
)

  premapped_dataset = pd.concat(


In [None]:
# Dump the aggregated table as tab-separated lines: address, lat, long, activities, count.
for _, entry in premapped_dataset.iterrows():
    print(
        entry['address'],
        entry['lat'],
        entry['long'],
        entry['activities'],
        entry['count'],
        sep='\t',
    )

Bandar Udara Syamsudin Noor, Jalan Akses Terminal Baru Bandara Syamsudin Noor, Syamsudin Noor, Banjarbaru, Kalimantan Selatan, Kalimantan, 70724, Indonesia	-3.4402862	114.7565059	[]	2
Dukuh Atas BNI, Jalan Tanjung Karang, RW 02, Kebon Melati, Tanah Abang, Jakarta Pusat, Daerah Khusus ibukota Jakarta, Jawa, 10230, Indonesia	-6.2007957	106.822788	['menabrak pembatas', 'mengalami kecelakaan']	3
Gedung Extension UI, Jalan Kenari II, RW 05, Kenari, Senen, Jakarta Pusat, Daerah Khusus ibukota Jakarta, Jawa, 10430, Indonesia	-6.194382	106.846197	[]	1
Danramil 1705-02/Enarotali, Enarotali, Paniai, Papua Tengah, Western New Guinea, Indonesia	-3.9270191	136.3782692	['laksanakan pengamanan']	1
Northern Alberta Jubilee Auditorium, 11455, 87 Avenue NW, University of Alberta, Greater Strathcona, Edmonton, Alberta, T6G 2T2, Canada	53.521789	-113.5287755	[]	4
Jalan Industri, RW 03, Susukan, Ciracas, Jakarta Timur, Daerah Khusus ibukota Jakarta, Jawa, 13830, Indonesia	-6.3117428	106.8701262	[]	1
Jalan 

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from io import BytesIO
import base64

In [None]:
def make_wc_image(idx, data):
    """Render a row's activity phrases as a word cloud and return it as an HTML ``<img>`` tag.

    Args:
        idx: row index; unused, kept so existing callers keep working.
        data: mapping (e.g. a DataFrame row) with an ``'activities'`` entry
            holding an iterable of phrase strings.

    Returns:
        An ``<img>`` tag embedding the word cloud as a base64-encoded PNG, or
        an empty string when there are no activities to draw.
    """
    phrase_freq = Counter(data['activities'])

    if not phrase_freq:
        # A word cloud needs at least one word; the previous fallback
        # (generating from a blank string) had none to draw, so return
        # no image at all.
        return ''

    wc = WordCloud(
        width=2000, height=1000,
        background_color='white',
        collocations=False
    ).generate_from_frequencies(phrase_freq)

    # Serialise the rendered image to PNG in memory and inline it as a data URI.
    buf = BytesIO()
    wc.to_image().save(buf, format="PNG")
    b64 = base64.b64encode(buf.getvalue()).decode()
    # embed-able HTML <img> tag
    return f'<img src="data:image/png;base64,{b64}" style="max-width:100%;">'

In [None]:
from html import escape

import folium
from folium.plugins import MarkerCluster

# 1. Center the map on Jakarta
m = folium.Map(location=[-6.193667, 106.823024], zoom_start=13)

# 2. Optional: cluster markers if you have many
cluster = MarkerCluster().add_to(m)

# 3. One circle marker per geocoded address
for idx, row in premapped_dataset.iterrows():
    lat, lon = row["lat"], row["long"]
    radius   = max(5, row["count"]**0.5)   # grows with the mention count, never below 5
    # The address comes from the geocoder; escape it before embedding it in popup HTML.
    addr     = escape(str(row["address"]))
    has_acts = bool(row["activities"])

    if has_acts:
        # build HTML: header + word-cloud image
        wc_img    = make_wc_image(idx, row)
        popup_html = f"""
            <div style="width:200px">
              <h4 style="margin:0 0 .5em 0;">{addr}</h4>
              {wc_img}
            </div>
        """
    else:
        # just the address in a simple popup
        popup_html = f"<b>{addr}</b>"

    # always allow interaction, but style markers without activities in gray
    marker = folium.CircleMarker(
        location=(lat, lon),
        radius=radius,
        color   = "blue" if has_acts else "gray",
        fill    = True,
        fill_color   = "blue" if has_acts else "lightgray",
        fill_opacity = 0.6 if has_acts else 0.4,
        popup   = folium.Popup(popup_html, min_width=2000),
        interactive=True
    )

    marker.add_to(cluster)


m.save("production_map.html")
