# 📚 Import Libraries

In [1]:
# -------------------------------
# Import Required Libraries
# -------------------------------
# - pandas: data handling
# - sentencepiece: tokenizer training
# - zipfile, os: file handling
# - collections.Counter: word frequency analysis
# - IPython.display (display, Markdown): rendering Markdown tables in the notebook

import pandas as pd
import sentencepiece as spm
import zipfile
import os
from collections import Counter
from IPython.display import display, Markdown


# 🗂️ Load and View Datasets

## 🔍 Insight of Dataset

✨ The dataset was prepared with the help of ChatGPT, Gemini, and Grok for Bangla-English text generation and translation.

In [2]:
#=================================
# Training split
#=================================
# Read the training CSV from the Kaggle input mount, report its dimensions,
# and leave a five-row preview as the cell's displayed value.
train_csv_path = "/kaggle/input/bangla-english-custom-dataset/final-datasets/train-data.csv"
traindataset = pd.read_csv(train_csv_path)

print(f"Shape of traindataset: {traindataset.shape}")
print("============================About Training Dataset===========================")

# head() defaults to the first 5 rows
traindataset.head()

Shape of traindataset: (4008, 2)


Unnamed: 0,Text,Target
0,Apni kemon achhen ajke? Ami chinta korchilam a...,How are you today? I was thinking whether you ...
1,Apni recently kemon feel korchen? Ami chinta k...,How have you been feeling lately? I was thinki...
2,Apni ki ajke kichu emotional feel korchen? Ami...,Are you feeling emotional today? I was thinkin...
3,Apni ki recently kono boro decision niyechhen?...,Have you recently made any big decision? I was...
4,Apni ki recently kichu kichu stress feel korch...,Have you recently felt some stress? I was thin...


## About Dataset

In [4]:
# Print one example row per writing style present in the training data.
# NOTE(review): the first two examples both read row 300, so the "Bangla words
# written in Bangla" style is never actually shown — confirm the intended row
# index for the second example.

# Style 1: Bangla + English, everything romanized
romanized_example = traindataset.iloc[300]
print(f"Bangla English mixed written in English: \n {romanized_example}\n")

# Style 2: Bangla words in Bangla script, English words in Latin script
mixed_script_example = traindataset.iloc[300]
print(f"Bangla English mixed Bangla words are written Bangla and English words are written in English: \n {mixed_script_example}\n")

# Style 3: Bangla script only
bangla_only_example = traindataset.iloc[1600]
print(f"Purely written in Bangla:\n {bangla_only_example}")

Bangla English mixed written in English: 
 Text      Apni ki ajke parcel pick-up request korben? Am...
Target    Will you request a parcel pick-up today? I was...
Name: 300, dtype: object

Bangla English mixed Bangla words are written Bangla and English words are written in English: 
 Text      Apni ki ajke parcel pick-up request korben? Am...
Target    Will you request a parcel pick-up today? I was...
Name: 300, dtype: object

Purely written in Bangla:
 Text      দয়া করে আজকের ডেডলাইন মনে রাখুন। আমরা সময়মতো...
Target    Please remember today’s deadline. We want to f...
Name: 1600, dtype: object


In [5]:
#=================================
# Validation split
#=================================
# Read the validation CSV from the Kaggle input mount, report its dimensions,
# and leave a five-row preview as the cell's displayed value.
val_csv_path = "/kaggle/input/bangla-english-custom-dataset/final-datasets/val-data.csv"
valdataset = pd.read_csv(val_csv_path)

print(f"Shape of valdataset: {valdataset.shape}")
print("============================About Validation Dataset===========================")

# head() defaults to the first 5 rows
valdataset.head()

Shape of valdataset: (945, 2)


Unnamed: 0,Text,Target
0,Apnar kotha shune amar khub shanti lage. Kichu...,Listening to you gives me peace. I was a bit s...
1,Apni ajker jonno ki plan korechhen? Jodi time ...,What plans do you have for today? If you have ...
2,Ajke office e khub pressure chhilo. Apni ki kh...,There was a lot of pressure at the office toda...
3,Ajke onek rush chhilo rastay. Apni ki safe e b...,The roads were very rushed today. Did you retu...
4,Apnar shathe kotha bole amar mon ektu halka ho...,Talking with you makes my mind lighter. I try ...


In [6]:
#=================================
# Unseen / test split
#=================================
# Read the held-out test CSV from the Kaggle input mount, report its
# dimensions, and leave a five-row preview as the cell's displayed value.
test_csv_path = "/kaggle/input/bangla-english-custom-dataset/final-datasets/test-data.csv"
unseendataset = pd.read_csv(test_csv_path)

print(f"Shape of unseendataset: {unseendataset.shape}")
print("============================About Unseen Dataset===========================")

# head() defaults to the first 5 rows
unseendataset.head()

Shape of unseendataset: (167, 2)


Unnamed: 0,Text,Target
0,শুভ নববর্ষ! পহেলা বৈশাখে আপনি কি নতুন জামা পরে...,Happy New Year! Rocking a new outfit for Pohel...
1,জন্মদিনের শুভেচ্ছা! আপনি কি জন্মদিনে কেক কাটবে...,Happy Birthday! Cutting a cake for your birthd...
2,শুভ রবীন্দ্র জয়ন্তী! আপনি কি রবীন্দ্রনাথের গা...,Happy Rabindra Jayanti! Singing Tagore’s songs...
3,পহেলা ফাল্গুনের শুভেচ্ছা! আপনি কি ফুলের জামা প...,Happy Pohela Falgun! Wearing a floral outfit? ...
4,ভালোবাসা দিবসের শুভকামনা! আপনি কি প্রিয়জনের স...,Happy Valentine’s Day! Planning a dinner with ...


# 🔄 Merging Train and Val Dataset

In [7]:
#====================================================
# Combine Train + Validation Rows for Vocab Extraction
#====================================================

# Stack the validation rows underneath the training rows (row-wise concat is
# the default) and renumber the index from 0.
frames_to_merge = [traindataset, valdataset]
finaldataset = pd.concat(frames_to_merge, ignore_index=True)

# Draw as many rows as the frame holds, i.e. a seeded shuffle of the whole
# merged dataset with nothing left out.
row_count = len(finaldataset)
subset = finaldataset.sample(n=row_count, random_state=42)

print(f"✅ Shape of Dataset after merging: {finaldataset.shape}")
print("============== Final Dataset After Merging ==================")

# head() defaults to the first 5 rows
finaldataset.head()


✅ Shape of Dataset after merging: (4953, 2)


Unnamed: 0,Text,Target
0,Apni kemon achhen ajke? Ami chinta korchilam a...,How are you today? I was thinking whether you ...
1,Apni recently kemon feel korchen? Ami chinta k...,How have you been feeling lately? I was thinki...
2,Apni ki ajke kichu emotional feel korchen? Ami...,Are you feeling emotional today? I was thinkin...
3,Apni ki recently kono boro decision niyechhen?...,Have you recently made any big decision? I was...
4,Apni ki recently kichu kichu stress feel korch...,Have you recently felt some stress? I was thin...


# 📝 Generate Corpus

In [8]:
# Build the tokenizer training corpus from the shuffled train+val rows.
# The source ("Text") and target ("Target") columns are stacked into a single
# Series so the tokenizer is trained on both sides of every pair.
corpus = pd.concat([subset["Text"], subset["Target"]], axis=0)

# Write the corpus as plain UTF-8 text, exactly one sentence per line.
with open("/kaggle/working/corpus.txt", "w", encoding="utf-8") as f:
    # dropna(): an empty CSV cell is loaded as NaN, and str(NaN) would put the
    # literal word "nan" into the training corpus.
    for line in corpus.dropna():
        # Collapse every whitespace run (including embedded newlines) to a
        # single space so one dataset row can never spill over several lines.
        sentence = " ".join(str(line).split())
        if sentence:  # skip rows that were blank after cleaning
            f.write(sentence + "\n")

In [9]:
# -------------------------------
# Analyze Corpus: Word Count & Frequency
# -------------------------------
# - Reads the merged corpus file
# - Counts total words and unique words
# - Displays top 30 most frequent words

# -------------------------------
# Read the corpus
# -------------------------------
# Use the same absolute path the corpus was written to, so this cell does not
# depend on the kernel's current working directory.
with open("/kaggle/working/corpus.txt", "r", encoding="utf-8") as f:
    text = f.read()

# -------------------------------
# Split text into words (basic whitespace tokenization)
# -------------------------------
# Punctuation stays attached to its word, so e.g. "now!" and "now" are
# counted as different words.
words = text.split()

# -------------------------------
# Count total and unique words
# -------------------------------
# One Counter pass provides both the unique-word count and the frequencies.
word_counts = Counter(words)
total_words = len(words)
unique_words = len(word_counts)

print(f"Total words in corpus: {total_words}")
print(f"Unique words in corpus: {unique_words}")

# -------------------------------
# Compute word frequencies
# -------------------------------
# List of (word, count) pairs, most frequent first
word_freq = word_counts.most_common(30)

# -------------------------------
# Display top 30 frequent words
# -------------------------------
print("\nTop 30 words:\n", word_freq)


Total words in corpus: 251684
Unique words in corpus: 17675

Top 30 words:
 [('I', 6125), ('the', 5804), ('a', 4241), ('to', 2615), ('আমি', 2219), ('you', 2162), ('is', 2009), ('was', 1980), ('and', 1859), ('very', 1661), ('করেছি।', 1446), ('The', 1337), ('properly', 1335), ('with', 1238), ('or', 1234), ('new', 1180), ('Apni', 1164), ('এবং', 1149), ('for', 1111), ('ki', 1097), ('my', 1031), ('will', 943), ('Friends', 941), ('কি', 913), ('in', 908), ('now!', 891), ('এখন', 835), ('খুব', 829), ('of', 829), ('Ami', 822)]


# ⏳ SentencePiece Tokenizer Training for the Bangla & English Mixed Corpus

In [22]:
# -------------------------------
# Train a SentencePiece tokenizer
# -------------------------------
# Input : corpus.txt (one sentence per line, Bangla + romanized Bangla + English)
# Output: custom_spm.model and custom_spm.vocab, written to the current
#         working directory under the "custom_spm" prefix.

# Path to the merged corpus
corpus_path = "/kaggle/working/corpus.txt"

# All trainer settings collected in one place so they are easy to review.
trainer_options = dict(
    input=corpus_path,
    model_prefix="custom_spm",       # prefix of the generated .model / .vocab files
    # Number of pieces in the final vocabulary.
    # NOTE(review): presumably the largest size the trainer accepted for this
    # corpus — confirm before changing it.
    vocab_size=9338,
    character_coverage=0.9995,       # cover 99.95% of the characters seen in the corpus
    model_type="unigram",            # unigram language-model segmentation (not BPE)
    byte_fallback=True,              # characters outside the vocab fall back to UTF-8 byte pieces
    split_by_whitespace=True,        # pieces never span a whitespace boundary
    shuffle_input_sentence=True,     # shuffle sentences when sampling the training input
    input_sentence_size=100000,      # upper limit on sentences loaded for training
    max_sentence_length=2048,        # maximum sentence length accepted by the trainer
    num_threads=4,                   # CPU threads used for training (adjust if needed)
)

# Train the SentencePiece tokenizer
spm.SentencePieceTrainer.train(**trainer_options)




sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: /kaggle/working/corpus.txt
  input_format: 
  model_prefix: custom_spm
  model_type: UNIGRAM
  vocab_size: 9338
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 100000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 2048
  num_threads: 4
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 1
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differentia

**⚠️ Note:** Sometimes the cell shows **`*`** for a long time — if that happens, go to Run → Restart & Clear Cell Outputs and re-run.
📄 After that, **`custom_spm.model`** and **`custom_spm.vocab`** will appear in your working directory.

**Training the SentencePiece tokenizer takes roughly 5–10 minutes.**

In [23]:
# Status banner. These messages are printed unconditionally; the model file is
# only actually opened by the load below.
for status_line in (
    "✅ SentencePiece tokenizer training complete!",
    "Generated files: custom_spm.model, custom_spm.vocab",
):
    print(status_line)

# -------------------------------
# Load the trained tokenizer
# -------------------------------
# `sp` is the processor the following cells use for encoding.
sp = spm.SentencePieceProcessor(model_file="custom_spm.model")

✅ SentencePiece tokenizer training complete!
Generated files: custom_spm.model, custom_spm.vocab


# ✅ Result

In [24]:

# Probe sentences covering each writing style: Bangla script with an English
# word, plain English, mixed script, and fully romanized Bangla.
samples = [
    "আমি আজকে school এ যাচ্ছি",
    "This dataset is really useful for NLP.",
    "বাংলা এবং English একসাথে আছে।",
    "Ami ajke apnar sathe meet korbo"
]

# Encode each sentence twice — once as subword strings, once as vocabulary
# ids — and print both views next to the input.
for sample_text in samples:
    tokens = sp.encode(sample_text, out_type=str)
    ids = sp.encode(sample_text, out_type=int)
    print("\nInput:", sample_text)
    print("Tokens:", tokens)
    print("IDs:", ids)



Input: আমি আজকে school এ যাচ্ছি
Tokens: ['▁আমি', '▁আজকে', '▁school', '▁এ', '▁যাচ্ছি']
IDs: [274, 318, 1431, 295, 2488]

Input: This dataset is really useful for NLP.
Tokens: ['▁', 'This', '▁data', 'set', '▁is', '▁real', 'ly', '▁useful', '▁for', '▁N', 'L', 'P', '.']
IDs: [262, 424, 1998, 6083, 278, 1024, 494, 2096, 297, 2555, 9330, 3269, 259]

Input: বাংলা এবং English একসাথে আছে।
Tokens: ['▁বা', 'ং', 'লা', '▁এব', 'ং', '▁Eng', 'lish', '▁এক', 'সাথে', '▁আছে', '।']
IDs: [1169, 292, 2313, 294, 292, 6500, 4854, 393, 546, 497, 260]

Input: Ami ajke apnar sathe meet korbo
Tokens: ['▁Am', 'i', '▁ajke', '▁apna', 'r', '▁sath', 'e', '▁meet', '▁korbo']
IDs: [307, 273, 469, 876, 341, 783, 279, 1439, 1011]


In [25]:
# Sentences to show in the summary table
sentences = [
    "আমি আজকে school এ যাচ্ছি",
    "This dataset is really useful for NLP.",
    "বাংলা এবং English একসাথে আছে।"
]

# Collect the table line by line: header, separator, then one row per
# sentence holding the input, its subword tokens, and its vocabulary ids.
table_lines = ["| Input | Tokens | IDs |", "|-------|--------|----|"]
for sentence in sentences:
    tokens = sp.encode(sentence, out_type=str)
    ids = sp.encode(sentence, out_type=int)
    table_lines.append(f"| {sentence} | {tokens} | {ids} |")

# Every line, including the last one, is terminated with a newline
md_table = "\n".join(table_lines) + "\n"

# Render the table as Markdown in the cell output
display(Markdown(md_table))

| Input | Tokens | IDs |
|-------|--------|----|
| আমি আজকে school এ যাচ্ছি | ['▁আমি', '▁আজকে', '▁school', '▁এ', '▁যাচ্ছি'] | [274, 318, 1431, 295, 2488] |
| This dataset is really useful for NLP. | ['▁', 'This', '▁data', 'set', '▁is', '▁real', 'ly', '▁useful', '▁for', '▁N', 'L', 'P', '.'] | [262, 424, 1998, 6083, 278, 1024, 494, 2096, 297, 2555, 9330, 3269, 259] |
| বাংলা এবং English একসাথে আছে। | ['▁বা', 'ং', 'লা', '▁এব', 'ং', '▁Eng', 'lish', '▁এক', 'সাথে', '▁আছে', '।'] | [1169, 292, 2313, 294, 292, 6500, 4854, 393, 546, 497, 260] |


# 📂 Zip the relevant files for download

In [26]:
# -------------------------------
# Zip Corpus and Tokenizer Files
# -------------------------------
# - Include corpus.txt, custom_spm.model, custom_spm.vocab
# - Save as tokenizer_corpus.zip (DEFLATE-compressed)

files_to_zip = ["corpus.txt", "custom_spm.model", "custom_spm.vocab"]
zip_filename = "tokenizer_corpus.zip"

# Fail before creating the archive if an input is missing, so a partial ZIP
# is never left behind.
missing_files = [path for path in files_to_zip if not os.path.isfile(path)]
if missing_files:
    raise FileNotFoundError(f"Cannot create {zip_filename}; missing file(s): {missing_files}")

# ZipFile stores members uncompressed by default; request DEFLATE explicitly
# so the text corpus and vocab actually shrink for download.
with zipfile.ZipFile(zip_filename, "w", compression=zipfile.ZIP_DEFLATED) as zipf:
    for file in files_to_zip:
        # arcname drops any directory part so members sit at the archive root
        zipf.write(file, arcname=os.path.basename(file))

print(f"ZIP file created: {zip_filename}")


ZIP file created: tokenizer_corpus.zip
