In [1]:
import pandas as pd
import numpy as np

def extract_burmese_sentences(input_file, output_file="output_sentences.xlsx"):
    """Split every text cell of a CSV file into Burmese sentences and save them.

    Cells are visited row by row, left to right, and each cell is broken into
    lines on newline characters. A line that contains the Burmese full stop
    "။" is cut at every "။" and each non-empty piece is emitted with "။"
    appended again; a line without "။" is emitted unchanged. Surrounding
    whitespace is stripped and empty pieces are dropped.

    Parameters:
        input_file (str): Path to the input CSV file (parsed with
            ``pd.read_csv``).
        output_file (str): Path of the file to write. The result is always
            written as CSV (UTF-8 with BOM) with a single ``Sentence`` column,
            whatever the extension is, including the ``.xlsx`` default.

    Returns:
        None. A confirmation line is printed once the file has been written.
    """
    # Missing cells become "" so they are removed by the empty-line check
    # below instead of turning into the text "nan" after the str conversion.
    df = pd.read_csv(input_file).fillna("")

    sentences = []
    # Walking the rows directly keeps the original row-major order and also
    # works for a file that has a header but no data rows.
    for row in df.astype(str).itertuples(index=False, name=None):
        for cell in row:
            for raw_line in cell.split('\n'):
                line = raw_line.strip()
                if not line:
                    continue
                if "။" not in line:
                    # No sentence terminator: keep the line as-is.
                    sentences.append(line)
                    continue
                for part in line.split("။"):
                    clean_part = part.strip()
                    if clean_part:
                        # split() removed the terminator; put it back.
                        sentences.append(clean_part + "။")

    # Final cleanup: drop any entry that is just the text "nan".
    sentences = [s for s in sentences if s and s.lower() != 'nan']

    result_df = pd.DataFrame(sentences, columns=["Sentence"])
    result_df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"✅ Cleaned sentences saved to: {output_file}")


In [2]:
# Environment check: print the interpreter, TensorFlow and NumPy versions.
import sys
print(sys.version)

# NOTE(review): the recorded output of this cell shows repeated
# "'MessageFactory' object has no attribute 'GetPrototype'" errors next to the
# TensorFlow version - presumably a protobuf/TensorFlow version mismatch;
# confirm the environment before relying on TensorFlow.
import tensorflow as tf
print(tf.__version__)

import numpy as np
print(np.__version__)

3.11.13 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 13:03:15) [MSC v.1929 64 bit (AMD64)]


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'


2.15.0
1.26.4


In [2]:
# Split the cells of ner_final_cleaned.csv into sentences and write them to ner_final_cleaned_1.csv.
extract_burmese_sentences("ner_final_cleaned.csv", output_file="ner_final_cleaned_1.csv")

✅ Cleaned sentences saved to: ner_final_cleaned_1.csv


In [5]:
import re

def is_unwanted(line):
    """Tell whether a line is noise that should be dropped from the corpus.

    The value is converted with ``str()`` and all whitespace and zero-width
    spaces are removed. The line is unwanted when the remaining text either

    * starts with ``https://www.bbc.com/burmese/``, or
    * after also removing ``,``, ``-`` and ``/`` consists only of digits,
      optionally with a single decimal point between two digit groups.

    ``\\d`` matches any Unicode decimal digit, so Burmese numerals count as
    digits as well.

    Parameters:
        line: Value to test (any type; converted with ``str()``).

    Returns:
        bool: ``True`` if the line should be discarded, ``False`` otherwise.
    """
    # \s already covers \r, \n, \t, U+2028 and U+00A0; U+200B (zero-width
    # space) is not whitespace, so it is listed explicitly.
    compact = re.sub(r"[\s\u200b]+", "", str(line))

    if compact.startswith("https://www.bbc.com/burmese/"):
        return True

    # Ignore number formatting characters such as "5,386,538" or "2020-01-01".
    digits_only = re.sub(r"[,\-/]", "", compact)

    # Return a real bool rather than a Match object / None.
    return re.fullmatch(r"\d+(?:\.\d+)?", digits_only) is not None

In [None]:

def clean_lines(input_file, output_file="cleaned_file.xlsx"):
    """Remove unwanted lines from an Excel sheet and save the remaining ones.

    Lines are collected from the sheet, filtered with ``is_unwanted`` and
    written to ``output_file`` as a single ``Line`` column.

    * With more than one column, every cell is visited row by row, left to
      right, split on newline characters, stripped, and kept if non-empty.
    * With exactly one column, each cell is stripped and kept whole if
      non-empty (embedded newlines are not split).
    * A sheet without rows or without columns yields an empty result.

    Parameters:
        input_file (str): Path to the Excel file to read.
        output_file (str): Path of the Excel file to write.

    Returns:
        None. A confirmation line is printed once the file has been written.
    """
    # Missing cells become "" so they are filtered out as empty lines.
    df = pd.read_excel(input_file).fillna("")

    if df.shape[1] > 1:
        lines = []
        # Iterating over the rows keeps the row-major order and also works
        # when the sheet has a header but no data rows.
        for row in df.astype(str).itertuples(index=False, name=None):
            for cell in row:
                for piece in cell.split('\n'):
                    piece = piece.strip()
                    if piece:
                        lines.append(piece)
    elif df.shape[1] == 1:
        stripped = df.iloc[:, 0].astype(str).str.strip().tolist()
        lines = [line for line in stripped if line]
    else:
        # Sheet without any column: nothing to clean.
        lines = []

    # Filter out unwanted lines
    cleaned_lines = [line for line in lines if not is_unwanted(line)]

    # Save to Excel
    cleaned_df = pd.DataFrame(cleaned_lines, columns=["Line"])
    cleaned_df.to_excel(output_file, index=False)
    print(f"✅ Cleaned file saved to: {output_file}")


In [None]:
# Filter mdn_cele_details.xlsx and write the kept lines to mdn_cele_details_cleaned.xlsx.
# NOTE(review): the recorded output below this cell names bbc_trading_cleaned.xlsx,
# so it presumably comes from an earlier run with other arguments - re-run to refresh it.
clean_lines("mdn_cele_details.xlsx", "mdn_cele_details_cleaned.xlsx")

✅ Cleaned file saved to: bbc_trading_cleaned.xlsx


In [11]:
# Environment check: print the interpreter, TensorFlow and NumPy versions.
# The myTokenize imports are currently disabled.
# from myTokenize import SyllableTokenizer
# from myTokenize import WordTokenizer
import sys
print(sys.version)
import pandas as pd

# NOTE(review): the recorded output of this cell shows repeated
# "'MessageFactory' object has no attribute 'GetPrototype'" errors -
# presumably a protobuf/TensorFlow version mismatch; confirm the environment.
import tensorflow as tf
print(tf.__version__)

import numpy as np
print(np.__version__)

# Disabled myTokenize version check:
# from myTokenize as mt
# print(mt.__version__)

3.11.13 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 13:03:15) [MSC v.1929 64 bit (AMD64)]


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'


2.15.0
1.26.4


In [2]:
import re
import pandas as pd

def add_spaces_around_numbers(input_csv, output_csv, text_column='text'):
    """Separate digit runs from the text glued to them in one CSV column.

    A digit run (Burmese and/or Western digits) is wrapped in spaces when it
    has no whitespace directly before it and no whitespace directly after it
    (the start and end of the string count as "no whitespace"). Afterwards
    every run of whitespace is collapsed to one space and the text is
    stripped. Example: ``"abc123def"`` becomes ``"abc 123 def"``.

    NOTE(review): a digit run that already has whitespace on one side only
    (e.g. ``"a 12b"``) is left untouched by the pattern - confirm that this
    is intended.

    Missing values in the column are left missing; they are not converted to
    the text "nan".

    Parameters:
        input_csv (str): Path to the input CSV file (read as UTF-8, BOM allowed).
        output_csv (str): Path of the CSV file to write (UTF-8 with BOM).
        text_column (str): Name of the column to process. Default is 'text'.

    Returns:
        None. A confirmation line is printed once the file has been written.

    Raises:
        KeyError: If ``text_column`` is not a column of the input file.
    """
    burmese_digits = "၀၁၂၃၄၅၆၇၈၉"
    western_digits = "0123456789"
    digit_class = burmese_digits + western_digits

    # The look-behind / look-ahead reject a digit on either side, so a match
    # always covers a whole digit run; they also reject whitespace, so runs
    # that touch whitespace are skipped.
    digit_pattern = re.compile(
        rf'(?<![\s{digit_class}])([{digit_class}]+)(?![\s{digit_class}])'
    )

    def process_text(text):
        spaced = digit_pattern.sub(r' \1 ', text)
        # Normalize multiple spaces
        return re.sub(r'\s+', ' ', spaced).strip()

    df = pd.read_csv(input_csv, encoding='utf-8-sig')
    # na_action='ignore' keeps missing cells missing instead of writing the
    # literal text "nan" into the output file.
    df[text_column] = df[text_column].map(
        lambda value: process_text(str(value)), na_action='ignore'
    )
    df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f"✅ Saved to {output_csv}")


In [3]:
# Process the 'tokens' column of tokenized_rawnews.csv and write the result to cleaned_raw_news.csv.
add_spaces_around_numbers(
    input_csv='tokenized_rawnews.csv',
    output_csv='cleaned_raw_news.csv',
    text_column='tokens'  # <-- Change if your column name is different
)

✅ Saved to cleaned_raw_news.csv


In [10]:
import pandas as pd
import os

# Report the longest text and the five shortest texts of one CSV column.
# Alternative input location, currently disabled:
#input_file = os.path.abspath(os.path.join('datasets','tokenized_ner.csv'))
input_file = 'ner_final_cleaned_3.csv'  # Adjust as needed
text_column = 'Sentence'  # Change if your column name is different
# Load your file (adjust file name and format as needed)
df = pd.read_csv(input_file)  # or pd.read_excel('your_file.xlsx')

# Make sure your text column is in string format
# NOTE(review): astype(str) turns missing values into the text "nan"
# (3 characters), so empty rows can appear among the shortest texts - confirm
# whether they should be dropped first.
df[text_column] = df[text_column].astype(str)

# Add a new column with text length (number of characters as counted by str.len)
df['text_length'] = df[text_column].str.len()

# Find the longest text (idxmax returns the first row holding the maximum)
longest_text = df.loc[df['text_length'].idxmax()]
print("Longest Text:")
print(longest_text[text_column])
print("Length:", longest_text['text_length'])

# Find the 5 shortest texts
shortest_texts = df.nsmallest(5, 'text_length')
print("\n5 Shortest Texts:")
for i, row in shortest_texts.iterrows():
    print(f"- ({row['text_length']} chars) {row[text_column]}")

Longest Text:
မေမြို့ကအဆောက်အဦ ၃၁ လုံး National Heritage စာရင်းဝင်တယ်ဆိုတာကို Maymyo GuideBook fb page မှာ မြင်မိပါတယ်၊ ကျမတို့မိသားစုနေခဲ့တဲ့ အဖေ့ရုံးကဝန်ထမ်းအိမ်ယာ "ငွေသော်တာ"ကလည်း အမှတ်စဉ် ၂၇ မှာ ပါတယ်ဆိုတော့ ပိုပြီးဝမ်းသာရ လွမ်းရပါတယ်၊ National Heritage စာရင်းဝင်တဲ့ အဆောက်အဦများနဲ့ပါတ်သက်ပြီး မှတ်မိသမျှ ပြောပါရစေ၊ ကျမတို့ငယ်ငယ်က "နန်းမြိုင်"က ဆောက်လုပ်ရေးဧည့်ရိပ်သာပါ၊ ဟိုတယ်မဟုတ်ပါဘူး၊ "ယုဇနမြိုင် သဇင်မြိုင် ချယ်ရီမြိုင်" ဆိုတာထက် အစိုးရဧည့်ဂေဟာအမှတ် ၁/၂/၃ လို့ပဲ စိတ်ထဲစွဲနေတာက အိမ်တော်လမ်းကအဆောက်အဦတွေလို့ ထင်ပါတယ်၊ နန်းမြိုင်နဲ့ ဧည့်ဂေဟာ ၁/၂/၃ ကို အဖေနဲ့ ပါသွားဖူးပါတယ်၊ "ဇီဝက"ကတော့ ခေတ်အဆက်ဆက် ဆေးရုံအုပ်ကြီးအိမ်ပါ၊ ကျမတို့မေမြို့မှာရှိစဉ်က ဆေးရုံအုပ်ကြီး(ကလေးအထူးကုဆရာဝန်ကြီး)အိမ်မှာ ကျမတို့မောင်နှမ အဖျားအနာရှိရင် သွားပြရပါတယ်၊ အမေ မန္တလေးပြောင်းသွားပြီး အဖေက ကလေး၂ယောက်နဲ့ ကျန်ခဲ့ချိန်မှာ ကလေးတွေနေမကောင်းလို့ အမေ့ဆီကိုဖုန်းဆက်ပြောရင် အမေက ဆရာကြီးအိမ်သွားပြပါလို့ပဲ ပြောပါတယ်၊ "ဇီဝက" ကို ကျော်သွားပြီး ကလပ်လမ်းထဲချိုးဝင်လိုက်ရင် "ငွေသော်တာ" ရှိပါတယ်၊ အဲဒီကလပ်လမ်းပေါ်မှာပဲ ငွေသော်တာကို

In [5]:
import pandas as pd

def clean_text_column(input_file, output_file, column_name='text'):
    """
    Trim one text column of a CSV file and keep only its first occurrences.

    The column is converted to strings, leading/trailing whitespace is
    removed, and rows whose trimmed value was already seen are dropped. The
    frame shape is printed before and after, and the result is written as
    CSV (UTF-8 with BOM).

    Parameters:
        input_file (str): Path to input CSV file (read as UTF-8).
        output_file (str): Path to save the cleaned CSV file.
        column_name (str): Column name to clean. Default is 'text'.
    """
    frame = pd.read_csv(input_file, encoding='utf-8')
    print(f"Original shape: {frame.shape}")

    # Trim first so that values differing only in outer whitespace collapse
    # into one entry when duplicates are removed.
    trimmed = frame[column_name].astype(str).str.strip()
    frame[column_name] = trimmed
    unique_rows = frame.drop_duplicates(subset=[column_name])
    print(f"Cleaned shape: {unique_rows.shape}")

    unique_rows.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"Cleaned CSV saved to: {output_file}")


In [6]:
# Strip and de-duplicate the 'Sentence' column of ner_final_cleaned_1.csv into ner_final_cleaned_2.csv.
# NOTE(review): this rebinds the notebook-level name `input_file`, which another
# cell of this notebook also assigns - run order decides which value is current.
input_file ='ner_final_cleaned_1.csv'
clean_text_column(input_file, output_file='ner_final_cleaned_2.csv', column_name='Sentence')

Original shape: (72656, 1)
Cleaned shape: (71756, 1)
Cleaned CSV saved to: ner_final_cleaned_2.csv


In [7]:
import pandas as pd
import re

def clean_text_noise(input_file, output_file, text_column='text'):
    """
    Remove HTML-entity noise, bracketed fragments and curly quotes from one
    text column of a CSV file, drop duplicate texts, and save the result.

    Missing values are left missing (they are not turned into the text
    "nan"); every other value is converted with ``str()`` before cleaning.
    The frame shape is printed before and after cleaning.

    Parameters:
    - input_file (str): Path to the input CSV file
    - output_file (str): Path to save the cleaned CSV file
    - text_column (str): Name of the text column to clean (default = 'text')

    Raises:
    - ValueError: If ``text_column`` is not a column of the input file.
    """

    def clean_text(text):
        # Keep missing values as they are; everything else is cleaned as text.
        if pd.isnull(text):
            return text
        text = str(text)
        # Remove numeric entities such as &#1234; or &#1234 (semicolon optional)
        text = re.sub(r'&#\d{1,5};?', '', text)
        # Remove remaining entity-like tokens such as &quot; or &#;
        # NOTE(review): the ';' is optional, so this also removes plain text
        # like the "&T" in "AT&T" - confirm that this is acceptable.
        text = re.sub(r'&[#a-zA-Z0-9]+;?', '', text)
        # Remove things like [...] or anything inside [ ] (shortest match)
        text = re.sub(r'\[.*?\]', '', text)
        # Remove curly double quotes and the ellipsis character
        text = text.replace('”', '').replace('“', '').replace('…', '')
        return text.strip()

    # Read file
    df = pd.read_csv(input_file, encoding='utf-8-sig')

    # Show shape before
    print("Before cleaning:", df.shape)

    if text_column not in df.columns:
        raise ValueError(f"'{text_column}' column not found in the CSV.")

    # clean_text already strips its result, so no extra strip pass is needed.
    df[text_column] = df[text_column].apply(clean_text)

    # Drop rows whose cleaned text duplicates an earlier row
    df = df.drop_duplicates(subset=[text_column])

    # Show shape after
    print("After cleaning:", df.shape)

    # Save cleaned file
    df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"Cleaned CSV saved to {output_file}")


In [8]:
# Clean the 'Sentence' column of ner_final_cleaned_2.csv and write the result to ner_final_cleaned_3.csv.
clean_text_noise('ner_final_cleaned_2.csv', 'ner_final_cleaned_3.csv', text_column='Sentence')

Before cleaning: (71740, 1)
After cleaning: (71734, 1)
Cleaned CSV saved to ner_final_cleaned_3.csv


In [31]:
import json
from typing import List

def jsonl_to_bio(jsonl_path: str, output_path: str):
    """Convert span annotations in a JSONL file to character-level BIO tags.

    Each non-blank input line is a JSON object with a ``text`` string and an
    optional ``entities`` list whose items carry ``start_offset``,
    ``end_offset`` and ``label``. For every record one ``"<char> <tag>"``
    line is written per non-whitespace character, followed by an empty line
    as sentence boundary. Records whose text is empty or whitespace-only are
    skipped.

    Tagging works on single characters as Python indexes them (Unicode code
    points), so combining marks become separate tokens.

    Parameters:
        jsonl_path (str): Path to the input JSONL file (UTF-8).
        output_path (str): Path of the tag file to write (UTF-8).

    Returns:
        None.
    """
    def label_tokens(text, entities):
        tags = ['O'] * len(text)
        for ent in entities:
            label = ent['label']
            # Clamp the span to the text so that out-of-range offsets cannot
            # raise IndexError.
            start = max(ent['start_offset'], 0)
            end = min(ent['end_offset'], len(text))
            seen_first = False
            for i in range(start, end):
                if text[i].strip() == "":
                    # Whitespace is never written out, so it must not consume
                    # the B- tag of a span that starts with a space.
                    continue
                tags[i] = f"I-{label}" if seen_first else f"B-{label}"
                seen_first = True
        return tags

    with open(jsonl_path, encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            if not line.strip():
                continue  # tolerate blank lines between records
            example = json.loads(line)
            text = example['text']
            if not text.strip():
                continue

            # A record without annotations is tagged entirely as 'O'.
            char_tags = label_tokens(text, example.get('entities', []))

            for char, tag in zip(text, char_tags):
                if char.strip() == "":
                    continue  # skip whitespace
                outfile.write(f"{char} {tag}\n")
            outfile.write("\n")  # sentence boundary

In [54]:
import json

def jsonl_to_bioes_token_spans(jsonl_path, output_path, verbose=True):
    """Convert span annotations in a JSONL file to token-level BIOES tags.

    Each non-blank input line is a JSON object with a ``text`` string and an
    optional ``entities`` list whose items carry ``start_offset``,
    ``end_offset`` and ``label`` (``labels`` is used when ``label`` is
    missing or empty). The text is cut at the entity boundaries and every
    piece is split on whitespace, so each written token is free of spaces:

    * tokens outside any entity get ``O``;
    * an entity with one token gets ``S-<label>``;
    * an entity with several tokens gets ``B-``, ``I-`` ... ``E-<label>``.

    One ``"<token> <tag>"`` line is written per token, followed by an empty
    line after every record. An entity boundary that falls inside a
    whitespace-delimited word splits that word into two tokens. Where
    entities overlap, the part already covered by an earlier entity is not
    emitted again.

    Parameters:
        jsonl_path (str): Path to the input JSONL file (UTF-8).
        output_path (str): Path of the tag file to write (UTF-8).
        verbose (bool): Print every entity text and label while converting.
            Default is True.

    Returns:
        None.
    """
    def bioes_tags(count, label):
        # Tags for an entity made of `count` tokens.
        if count == 0:
            return []
        if count == 1:
            return [f'S-{label}']
        return [f'B-{label}'] + [f'I-{label}'] * (count - 2) + [f'E-{label}']

    def get_spans(text, entities):
        spans = []
        last = 0
        for ent in sorted(entities, key=lambda x: x['start_offset']):
            # Never go back before text that has already been emitted.
            start = max(ent['start_offset'], last)
            end = ent['end_offset']
            label = ent.get('label') or ent.get('labels')
            if end <= start:
                continue  # empty span, or fully covered by an earlier entity

            # Non-entity text before this entity
            spans.extend((tok, 'O') for tok in text[last:start].split())

            ent_text = text[start:end]
            if verbose:
                print(f"Entity: {ent_text}, Label: {label}")
            tokens = ent_text.split()
            spans.extend(zip(tokens, bioes_tags(len(tokens), label)))
            last = end

        # Remaining non-entity text
        if last < len(text):
            spans.extend((tok, 'O') for tok in text[last:].split())
        return spans

    with open(jsonl_path, encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            if not line.strip():
                continue  # tolerate blank lines between records
            example = json.loads(line)
            text = example['text']
            entities = example.get('entities', [])
            for token, tag in get_spans(text, entities):
                outfile.write(f"{token} {tag}\n")
            outfile.write("\n")  # sentence boundary

# Usage: convert the annotations in all.jsonl and write the tagged output to output_bioes.conll.
jsonl_to_bioes_token_spans("all.jsonl", "output_bioes.conll")

Entity: text, Label: ORG
Entity: ဒေ, Label: Title
Entity: ါ်အောင်ဆန, Label: PER
Entity: ဒေ, Label: Title
Entity: ် အောင်ဆန, Label: PER
Entity: ဒေ, Label: Title
Entity: ါ်အောင်ဆန်, Label: PER
Entity: ကျောင်းသား လက်န, Label: ORG
Entity:  တပ်မမှူး ဗို, Label: ORG
Entity: ျောင်, Label: ORG
Entity: ိမ, Label: ORG
Entity: ှိ သူ, Label: ORG
Entity: ့ လူ ချင်း , Label: Title
Entity: က် ချိန, Label: ORG
Entity: ြို့ , Label: ORG
Entity: အမျိုးသား ညီညွတ် ရ, Label: ORG
Entity: အစိ, Label: ORG
Entity: ေသ ရဲ့ အင်, Label: Title
Entity: ာင်း, Label: ORG
Entity:  ပေ ့ါ, Label: ORG
Entity:  အတွင်း, Label: ORG
Entity: ှာ တ, Label: ORG
Entity: ABS, Label: ORG


In [51]:
# NOTE(review): no definition of `jsonl_to_bioes` is visible in this notebook
# (only `jsonl_to_bio` and `jsonl_to_bioes_token_spans`), so on a fresh kernel
# this call presumably raises NameError - confirm and update or remove the cell.
jsonl_to_bioes("all.jsonl", "output_bioes.conll")

In [1]:
import spacy

def check_ner_entities(text, model_name="en_core_web_sm"):
    """Run a spaCy pipeline over ``text`` and list the entities it reports.

    Parameters:
        text (str): Text to analyse.
        model_name (str): Name of the spaCy pipeline to load. Default is
            "en_core_web_sm".

    Returns:
        list[dict]: One dict per entity, in the order given by ``doc.ents``,
        with the keys ``text``, ``label``, ``start_char`` and ``end_char``.
    """
    # The pipeline is loaded on every call
    # (install it first if necessary: python -m spacy download en_core_web_sm).
    pipeline = spacy.load(model_name)

    return [
        {
            "text": span.text,
            "label": span.label_,
            "start_char": span.start_char,
            "end_char": span.end_char,
        }
        for span in pipeline(text).ents
    ]

# Example usage: run the default English pipeline on one sentence and print
# every entity with its label and character offsets.
text = "Apple is looking at buying U.K. startup for $1 billion in 2025."
result = check_ner_entities(text)

for ent in result:
    print(f"Entity: {ent['text']}, Label: {ent['label']}, Start: {ent['start_char']}, End: {ent['end_char']}")

Entity: Apple, Label: ORG, Start: 0, End: 5
Entity: U.K., Label: GPE, Start: 27, End: 31
Entity: $1 billion, Label: MONEY, Start: 44, End: 54
Entity: 2025, Label: DATE, Start: 58, End: 62


In [None]:
# Slice a Burmese sentence by index to see where character offsets fall.
# Python indexes strings by Unicode code point; the recorded output shows the
# second slice starting on a combining mark, i.e. in the middle of a syllable.
text= 'ဒေါ်အောင်ဆန်းစုကြည်သည် မြန်မာနိုင်ငံ၏ ပထမဆုံး အမျိုးသမီး ဥက္ကဌ ဖြစ်သည်။' 
print(text[0:4])
print(text[3:7])

ဒေါ်
်အော


: 

In [3]:
# Download and install the en_core_web_sm spaCy pipeline.
# The recorded output advises restarting the kernel afterwards so that the
# newly installed package can be loaded.
import spacy.cli
spacy.cli.download("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
