In [3]:
import pandas as pd

# Input and output file paths
train_file = "train.tsv"  # Replace with actual path
dev_file = "dev.tsv"      # Replace with actual path
output_file = "tokenizer_corpus.txt"

# Read TSV files
df_train = pd.read_csv(train_file, sep="\t")
df_dev = pd.read_csv(dev_file, sep="\t")

# Extract only the 'text' column and combine them
all_texts = pd.concat([df_train["text"], df_dev["text"]])

# Save to a single file (one sentence per line)
all_texts.to_csv(output_file, index=False, header=False, quoting=3, escapechar="\\")  # Escape special chars

print(f"Saved extracted texts from train & dev to {output_file}")


Saved extracted texts from train & dev to tokenizer_corpus.txt


In [8]:
import pandas as pd
### LANGUAGE TOKEN INCLUDED
def extract_language_code(id_str):
    """Extract language code from ID (first part before underscore)"""
    return id_str.split('_')[0]

def process_text(text, lang_code):
    return f"[LANG={lang_code}] {text}"  

def create_corpus(input_files, output_file):
    """Process TSV files and create training corpus"""
    corpus = []
    
    for file in input_files:
        df = pd.read_csv(file, sep='\t')
        
        for _, row in df.iterrows():
            lang_code = extract_language_code(row['id'])
            processed_text = process_text(row['text'], lang_code)
            
            # Add to corpus if text is not empty
            if processed_text.strip():
                corpus.append(processed_text)
    
    # Write to output file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(corpus))

if __name__ == "__main__":
    # Configuration
    INPUT_FILES = ['train.tsv', 'dev.tsv']
    OUTPUT_FILE = 'language_tokenizer_corpus.txt'
    
    # Create the corpus
    create_corpus(INPUT_FILES, OUTPUT_FILE)
    print(f"Created corpus with language tokens in {OUTPUT_FILE}")

Created corpus with language tokens in language_tokenizer_corpus.txt


In [15]:
###TESTING TOKENIZER
from tokenizers import Tokenizer
import torch

# Load the tokenizer from the JSON file
tokenizer = Tokenizer.from_file("lang_bpe_tokenizer.json")

# Define sample test cases
test_cases = [
    "[LANG=eng] Hello world",
    "[LANG=fin] Moi maailma",
    "[LANG=ger] Hallo Welt",
    "Hello world",  # No language token
    "[LANG=eng] A 17% cut in SSD prices",  # Complex sentence
    "eng Hello world",  # "eng" as a standalone word
    "[LANG=invalid] Test",  # Invalid language token
    "",  # Empty string
]

# Function to test and display tokenization results
def test_tokenization(text):
    print(f"\nInput: '{text}'")
    
    # Encode the text to get token IDs
    encoding = tokenizer.encode(text)
    token_ids = encoding.ids
    tokens = encoding.tokens
    
    # Decode back to text
    decoded_text = tokenizer.decode(token_ids, skip_special_tokens=False)
    
    # Print results
    print(f"Token IDs: {token_ids}")
    print(f"Tokens: {tokens}")
    print(f"Decoded Text: '{decoded_text}'")
    
    # Add [CLS] and [SEP] for model input simulation
    cls_id = tokenizer.token_to_id("[CLS]")
    sep_id = tokenizer.token_to_id("[SEP]")
    input_ids = [cls_id] + token_ids + [sep_id]
    print(f"Simulated Input IDs with [CLS] and [SEP]: {input_ids}")
    print(f"Simulated Decoded Input: '{tokenizer.decode(input_ids, skip_special_tokens=False)}'")

# Run tests for each sample
for test_case in test_cases:
    test_tokenization(test_case)

# Additional test: Check if tokenizer handles unknown tokens correctly
unknown_text = "This is an unknownword123"
print(f"\nInput: '{unknown_text}'")
encoding = tokenizer.encode(unknown_text)
token_ids = encoding.ids
tokens = encoding.tokens
decoded_text = tokenizer.decode(token_ids, skip_special_tokens=False)
print(f"Token IDs: {token_ids}")
print(f"Tokens: {tokens}")
print(f"Decoded Text: '{decoded_text}'")

# Check specific token IDs for language tokens
print("\nVerifying language token IDs:")
for lang in ["eng", "fin", "ger"]:
    token_id = tokenizer.token_to_id(f"[LANG={lang}]")
    print(f"ID for [LANG={lang}]: {token_id}")

# Check if "eng" as a subword is distinct
eng_id = tokenizer.token_to_id("eng")
print(f"ID for 'eng' (subword): {eng_id}")


Input: '[LANG=eng] Hello world'
Token IDs: [5, 9513, 806]
Tokens: ['[LANG=eng]', 'ĠHello', 'Ġworld']
Decoded Text: '[LANG=eng] ĠHello Ġworld'
Simulated Input IDs with [CLS] and [SEP]: [1, 5, 9513, 806, 2]
Simulated Decoded Input: '[CLS] [LANG=eng] ĠHello Ġworld [SEP]'

Input: '[LANG=fin] Moi maailma'
Token IDs: [6, 5078, 80, 447, 612, 3556]
Tokens: ['[LANG=fin]', 'ĠMo', 'i', 'Ġma', 'ail', 'ma']
Decoded Text: '[LANG=fin] ĠMo i Ġma ail ma'
Simulated Input IDs with [CLS] and [SEP]: [1, 6, 5078, 80, 447, 612, 3556, 2]
Simulated Decoded Input: '[CLS] [LANG=fin] ĠMo i Ġma ail ma [SEP]'

Input: '[LANG=ger] Hallo Welt'
Token IDs: [7, 6642, 86, 15974]
Tokens: ['[LANG=ger]', 'ĠHall', 'o', 'ĠWelt']
Decoded Text: '[LANG=ger] ĠHall o ĠWelt'
Simulated Input IDs with [CLS] and [SEP]: [1, 7, 6642, 86, 15974, 2]
Simulated Decoded Input: '[CLS] [LANG=ger] ĠHall o ĠWelt [SEP]'

Input: 'Hello world'
Token IDs: [9513, 806]
Tokens: ['ĠHello', 'Ġworld']
Decoded Text: 'ĠHello Ġworld'
Simulated Input IDs with

In [26]:
import pandas as pd

def merge_tsv_files(train_file, dev_file, output_file):
    # Read the train and dev files as dataframes
    train_df = pd.read_csv(
    train_file,
    sep='\t',
    engine='python',
    quoting=3,        # csv.QUOTE_NONE
    on_bad_lines='warn',
    # If there's no header row, use header=None
    # If you do have column names, use header=0
    header=0,
    encoding='utf-8',)
    
    dev_df = pd.read_csv(
        dev_file,
        sep='\t',
        engine='python',
        quoting=3,        # csv.QUOTE_NONE
        on_bad_lines='warn',
        # If there's no header row, use header=None
        # If you do have column names, use header=0
        header=0,
        encoding='utf-8',)
        
    # Concatenate the two dataframes
    combined_df = pd.concat([train_df, dev_df], ignore_index=True)
    
    # Write the combined dataframe to a new TSV file
    combined_df.to_csv(output_file, sep='\t', index=False)
    print(f"Combined file saved to {output_file}")

if __name__ == "__main__":
    # Specify your file names here
    train_tsv = "train.tsv"
    dev_tsv = "dev.tsv"
    output_tsv = "train+dev.tsv"
    
    merge_tsv_files(train_tsv, dev_tsv, output_tsv)


Combined file saved to train+dev.tsv


In [11]:
import pandas as pd

train_df = pd.read_csv(
    "train.tsv",
    sep='\t',
    engine='python',
    quoting=3,        # csv.QUOTE_NONE
    on_bad_lines='warn',
    # If there's no header row, use header=None
    # If you do have column names, use header=0
    header=0,
    encoding='utf-8',
)
train_df = pd.read_csv(
    "dev.tsv.tsv",
    sep='\t',
    engine='python',
    quoting=3,        # csv.QUOTE_NONE
    on_bad_lines='warn',
    # If there's no header row, use header=None
    # If you do have column names, use header=0
    header=0,
    encoding='utf-8',
)

dev_df   = pd.read_csv("dev.tsv", sep='\t')

print("Train shape:", train_df.shape)
print("Dev shape:",   dev_df.shape)


Train shape: (99000, 3)
Dev shape: (13178, 3)


In [30]:
import pandas as pd

def count_languages_in_tsv(filepath):
    # Read the TSV
    df = pd.read_csv(filepath, sep='\t')
    
    # Initialize counters
    eng_count = 0
    ger_count = 0
    fin_count = 0
    
    # Iterate over the rows
    for _, row in df.iterrows():
        # Check the 'id' column
        row_id = str(row['id'])  # Make sure it's a string
        if row_id.startswith('eng_'):
            eng_count += 1
        elif row_id.startswith('ger_'):
            ger_count += 1
        elif row_id.startswith('fin_'):
            fin_count += 1
    
    # Print or return the results
    print(f"English rows: {eng_count}")
    print(f"German rows:  {ger_count}")
    print(f"Finnish rows: {fin_count}")

# Example usage:
if __name__ == "__main__":
    tsv_file = "new_train.tsv"
    count_languages_in_tsv(tsv_file)


English rows: 110000
German rows:  2000
Finnish rows: 200


In [32]:
import pandas as pd

def split_multilingual_data(input_file, output_train, output_dev, train_frac=0.7, random_state=42):
    # Read the TSV into a DataFrame
    df = pd.read_csv(input_file, sep='\t')
    
    # Separate rows by language
    df_eng = df[df['id'].str.startswith('eng_')]
    df_ger = df[df['id'].str.startswith('ger_')]
    df_fin = df[df['id'].str.startswith('fin_')]
    
    # Shuffle & split each language
    # English
    eng_train = df_eng.sample(frac=train_frac, random_state=random_state)
    eng_dev   = df_eng.drop(eng_train.index)
    
    # German
    ger_train = df_ger.sample(frac=train_frac, random_state=random_state)
    ger_dev   = df_ger.drop(ger_train.index)
    
    # Finnish
    fin_train = df_fin.sample(frac=train_frac, random_state=random_state)
    fin_dev   = df_fin.drop(fin_train.index)
    
    # Combine splits across languages
    train_df = pd.concat([eng_train, ger_train, fin_train], ignore_index=True)
    dev_df   = pd.concat([eng_dev,   ger_dev,   fin_dev],   ignore_index=True)
    
    # Shuffle again (optional, to mix languages in the final output)
    train_df = train_df.sample(frac=1.0, random_state=random_state).reset_index(drop=True)
    dev_df   = dev_df.sample(frac=1.0, random_state=random_state).reset_index(drop=True)
    
    # Save to new TSV files
    train_df.to_csv(output_train, sep='\t', index=False)
    dev_df.to_csv(output_dev, sep='\t', index=False)
    
    # Print some stats
    print(f"Saved {len(train_df)} rows to {output_train}")
    print(f"Saved {len(dev_df)} rows to {output_dev}")

if __name__ == "__main__":
    input_tsv = "new_combined.tsv"       # Your combined file
    output_train_tsv = "new_train.tsv" # 70% split
    output_dev_tsv   = "new_dev.tsv"   # 30% split
    
    split_multilingual_data(input_tsv, output_train_tsv, output_dev_tsv, train_frac=0.7)


Saved 78540 rows to new_train.tsv
Saved 33660 rows to new_dev.tsv


In [41]:
import pandas as pd

def count_toxic_by_language(filepath):
    # Read the TSV into a DataFrame
    df = pd.read_csv(filepath, sep='\t')
    
    # Separate by language using 'id' prefix
    df_eng = df[df['id'].str.startswith('eng_')]
    df_ger = df[df['id'].str.startswith('ger_')]
    df_fin = df[df['id'].str.startswith('fin_')]
    
    # Function to count 0/1 labels
    def label_counts(sub_df):
        # Count how many label=0 and how many label=1
        toxic_count = len(sub_df[sub_df['label'] == 1])
        non_toxic_count = len(sub_df[sub_df['label'] == 0])
        return toxic_count, non_toxic_count
    
    # Count for each language
    eng_toxic, eng_non_toxic = label_counts(df_eng)
    ger_toxic, ger_non_toxic = label_counts(df_ger)
    fin_toxic, fin_non_toxic = label_counts(df_fin)
    
    # Print results
    print("English:")
    print(f"  Toxic (label=1):     {eng_toxic}")
    print(f"  Non-toxic (label=0): {eng_non_toxic}")
    print()
    print("German:")
    print(f"  Toxic (label=1):     {ger_toxic}")
    print(f"  Non-toxic (label=0): {ger_non_toxic}")
    print()
    print("Finnish:")
    print(f"  Toxic (label=1):     {fin_toxic}")
    print(f"  Non-toxic (label=0): {fin_non_toxic}")

if __name__ == "__main__":
    tsv_file = "new_combined.tsv"
    count_toxic_by_language(tsv_file)



English:
  Toxic (label=1):     40349
  Non-toxic (label=0): 69651

German:
  Toxic (label=1):     500
  Non-toxic (label=0): 1500

Finnish:
  Toxic (label=1):     150
  Non-toxic (label=0): 50


In [44]:
import pandas as pd

def count_toxic_by_language(filepath):
    # Read the TSV into a DataFrame
    df = pd.read_csv(filepath, sep='\t')
    
    # Separate by language using 'id' prefix
    df_eng = df[df['id'].str.startswith('eng_')]
    df_ger = df[df['id'].str.startswith('ger_')]
    df_fin = df[df['id'].str.startswith('fin_')]
    
    # Function to count 0/1 labels
    def label_counts(sub_df):
        # Count how many label=0 and how many label=1
        toxic_count = len(sub_df[sub_df['label'] == 1])
        non_toxic_count = len(sub_df[sub_df['label'] == 0])
        return toxic_count, non_toxic_count
    
    # Count for each language
    eng_toxic, eng_non_toxic = label_counts(df_eng)
    ger_toxic, ger_non_toxic = label_counts(df_ger)
    fin_toxic, fin_non_toxic = label_counts(df_fin)
    
    # Print results
    print("English:")
    print(f"  Toxic (label=1):     {eng_toxic}")
    print(f"  Non-toxic (label=0): {eng_non_toxic}")
    print()
    print("German:")
    print(f"  Toxic (label=1):     {ger_toxic}")
    print(f"  Non-toxic (label=0): {ger_non_toxic}")
    print()
    print("Finnish:")
    print(f"  Toxic (label=1):     {fin_toxic}")
    print(f"  Non-toxic (label=0): {fin_non_toxic}")

if __name__ == "__main__":
    tsv_file = "new_train.tsv"
    count_toxic_by_language(tsv_file)



English:
  Toxic (label=1):     28220
  Non-toxic (label=0): 48780

German:
  Toxic (label=1):     341
  Non-toxic (label=0): 1059

Finnish:
  Toxic (label=1):     105
  Non-toxic (label=0): 35
