In [31]:
import numpy as np
import pandas as pd
import sentencepiece as spm
import chardet
import re
import sys
import torch
import torch.nn as nn


In [32]:
with open("dataset/hate_speech.tsv", "rb") as f:
    result = chardet.detect(f.read(100000))  # Read first 100KB
    print(result["encoding"])

utf-8


In [33]:
column_names = ["Codemixed", "HateOrNot"]  # Replace with actual column names
hs_df = pd.read_csv("dataset/hate_speech.tsv", encoding=result["encoding"], sep="\t", header=None, names=column_names)

hs_df.tail()

Unnamed: 0,Codemixed,HateOrNot
4574,ye attankwadi Indian agent hai jo terrorism ph...,no
4575,bola na terrorism ko support karna band karoge...,no
4576,lagta hai aap ne movie dekhi hai which is writ...,no
4577,tum log terrorism ko support karna band kardo ...,no
4578,mujhe pehele se hi pata tha so Sallu fans ke b...,yes


In [34]:
with open("dataset/Hinglish_Profanity_List.csv", "rb") as file:
    raw_data = file.read()
    result = chardet.detect(raw_data)
    print(result["encoding"])

Windows-1252


In [35]:
pf_df = pd.read_csv("dataset/Hinglish_Profanity_List.csv", encoding=result['encoding'])

pf_df.tail()

Unnamed: 0,badir,idiot,1
203,vahiyaat,disgusting,4
204,jihadi,terrorist,4
205,atankvadi,terrorist,4
206,atankwadi,terrorist,4
207,aatanki,terorist,4


In [36]:
def clean_codemixed_text(text):
    text = re.sub(r"http\S+|www\S+", "", text)         # remove links
    text = re.sub(r"@\S+", "", text)                   # remove @ and the word after it
    text = re.sub(r"#", "", text)                      # remove just the '#' symbol
    text = re.sub(r"\s+", " ", text).strip()           # remove extra whitespace
    return text

# Apply to 'Codemixed' column
hs_df["Codemixed"] = hs_df["Codemixed"].astype(str).apply(clean_codemixed_text)

In [37]:
hs_df["Codemixed"][-6:-1]

4573    pehle confirm karo ke Mohammad ne sach mein 8 ...
4574    ye attankwadi Indian agent hai jo terrorism ph...
4575    bola na terrorism ko support karna band karoge...
4576    lagta hai aap ne movie dekhi hai which is writ...
4577    tum log terrorism ko support karna band kardo ...
Name: Codemixed, dtype: object

In [38]:
hs_df['Codemixed'].to_csv('code_mixed.txt', index=False, header=False)

In [39]:
spm.SentencePieceTrainer.train('--input=code_mixed.txt --model_prefix=m --vocab_size=2000')

sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=code_mixed.txt --model_prefix=m --vocab_size=2000
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: code_mixed.txt
  input_format: 
  model_prefix: m
  model_type: UNIGRAM
  vocab_size: 2000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  b

In [40]:
sp = spm.SentencePieceProcessor()
sp.load('m.model')


True

In [41]:
pieces = sp.encode_as_pieces("Tu kitna gandu hai")
ids = sp.encode_as_ids("Tu kitna gandu hai")

print("Pieces:", pieces)
print("IDs:", ids)


Pieces: ['▁Tu', '▁kitna', '▁gand', 'u', '▁hai']
IDs: [597, 690, 555, 39, 3]


In [42]:
token_ids = torch.tensor(ids)  # shape: [seq_len]

# Settings
vocab_size = 8000         # Set this to your SentencePiece vocab size
embedding_dim = 768       # You can use 300, 512, 768 etc.
max_seq_len = 512         # Maximum length of your sequences

# 1. Token Embedding layer
token_embedding = nn.Embedding(vocab_size, embedding_dim)
token_embed_output = token_embedding(token_ids)  # shape: [seq_len, embedding_dim]

# 2. Positional Embedding layer
pos_embedding = nn.Embedding(max_seq_len, embedding_dim)

# Create position IDs: [0, 1, 2, ..., seq_len-1]
positions = torch.arange(0, len(token_ids)).unsqueeze(0)  # shape: [1, seq_len]
position_embed_output = pos_embedding(positions).squeeze(0)  # shape: [seq_len, embedding_dim]

# 3. Add token embeddings and positional embeddings
final_embedding = token_embed_output + position_embed_output  # shape: [seq_len, embedding_dim]

# Now `final_embedding` is ready to feed into RNN/HAN etc.
print(final_embedding)
