In [9]:
import pandas as pd
import re
import string
import numpy as np
import sentencepiece as spm
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from typing import List


In [10]:
def remove_single_characters(tokens: List[str]) -> List[str]:
    """Removes single-character tokens."""
    return [token for token in tokens if len(token) > 1]

In [11]:
def clean_text(text: str) -> str:
    """Cleans text by removing unwanted symbols, URLs, HTML, and numbers."""
    text = re.sub(r'\[.*?\]', '', text)  # Remove text inside brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    text = re.sub(r'\n', ' ', text)  # Remove newlines
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words containing numbers
    return text

In [12]:
def lower_case_everything(t: str) -> str:
    return t.lower()

def replace_all_caps(tokens: List[str]) -> List[str]:
    return [f'xxup {t.lower()}' if t.isupper() else t for t in tokens]

def deal_caps(tokens: List[str]) -> List[str]:
    return [f'xxmaj {t}' if t.istitle() else t for t in tokens]

def handle_all_caps(t: str) -> str:
    tokens = t.split()
    tokens = replace_all_caps(tokens)
    return ' '.join(tokens)

def handle_upper_case_first_letter(t: str) -> str:
    tokens = t.split()
    tokens = deal_caps(tokens)
    return ' '.join(tokens)

In [13]:
custom_pre_rules = [lower_case_everything, handle_all_caps, handle_upper_case_first_letter]

In [14]:
def preprocess_text(text: str) -> str:
    """Applies all preprocessing rules."""
    text = clean_text(str(text))
    for rule in custom_pre_rules:
        text = rule(text)
    return text

In [15]:
class CodeMixedTanglishTokenizer:
    def __init__(self, model_path: str):
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(model_path)

    def __call__(self, items: List[str]) -> List[List[str]]:  
        return [self.sp.EncodeAsPieces(t) for t in items]

    def tokenizer(self, items: List[str]) -> List[List[str]]:
        return [self.sp.EncodeAsPieces(t) for t in items]

In [16]:
tokenizer = CodeMixedTanglishTokenizer("../Tokenizer/Tanglish/taen_spm.model")

In [17]:
from tensorflow.keras.models import load_model
bilstm_model = load_model("./Classification/bilstm_model1.h5")



In [11]:
df = pd.read_csv("Dataset/Main/test.csv")

In [2]:
h = "punda loosu koothi kuthi thevidiya pundai oombu umbu watha ootha otha thayoli chi ommala mental kevalama nak oomba koodhi thevudiya police gommala kiss thevdiya payale thevidya lusu karumam that pombala kevalam vanitha podi mooditu fuck etha paithiyam eva"

In [4]:
tan_texts = h.split(' ')

In [5]:
tan_texts[0:5]

['punda', 'loosu', 'koothi', 'kuthi', 'thevidiya']

In [6]:
if "umbu" in tan_texts:
    print(1)

1


In [18]:
hate = []
c = 0
for t in range(len(tan_texts)):
    c += 1
    print(c)
    test_text = [tan_texts[t]]
    cleaned = [preprocess_text(text) for text in test_text]
    tokenized = tokenizer.tokenizer(cleaned)
    encoded = [tokenizer.sp.PieceToId(piece) for text in tokenized for piece in text]
    padded = pad_sequences([encoded],maxlen=70,padding="post")

    predictions = bilstm_model.predict(padded)
    predicted_labels = np.argmax(predictions,axis=1)

    if(predicted_labels == 1):
        hate.append(tan_texts[t])


1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 431ms/step
2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
4
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
6
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
7
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
8
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
9
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
11
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
12
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
13
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
14
[1m1/1[0m [32m━━━━━━━━━━━━━

In [19]:
t = ' '.join(hate)

In [20]:
t

'punda loosu koothi kuthi thevidiya pundai oombu umbu watha ootha otha chi ommala mental kevalama nak oomba koodhi thevudiya police gommala kiss payale thevidya lusu karumam that pombala kevalam podi mooditu fuck etha paithiyam'