## **Pre Processing**

In [113]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd

In [114]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [115]:
#telugu stop words
telugu_stopwords = set([
    "అందుకే", "ఎప్పుడు", "ఎక్కడ", "ఎవరూ", "ఎవరు", "ఎవరిది", "ఎలా", 
    "ఏ", "ఏది", "ఏడు", "కాదు", "కూడా", "తప్పుడు", "తరువాత", 
    "తర్వాత", "మరి", "మాత్రమే", "ముందు", "వీరు", "వెంటనే", "ఇంకా",
    "ఇది", "అది", "ఎప్పుడు", "నేను", "మీ", "మీరు", "అలాగే", "ముందుగా"
])


In [116]:
# Function for text preprocessing
def preprocess_text(text):
    # Remove punctuations and special characters
    text = re.sub(r'[^\w\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in telugu_stopwords]
    
    return ' '.join(tokens)


In [117]:
df = pd.read_excel('/kaggle/input/dataset1/TELUGU_METADATA.xlsx')


In [118]:
# Binary classification based on 'AUDIO FILE NAME'
df['BINARY_LABEL'] = df['AUDIO FILE NAME'].apply(lambda x: 'H' if x.startswith('H') else 'NH')

# Print counts for each class
print("Binary Label Counts:")
print(df['BINARY_LABEL'].value_counts())


Binary Label Counts:
BINARY_LABEL
H     392
NH    209
Name: count, dtype: int64


In [119]:
# Apply preprocessing
df['processed_text'] = df['TRANSCRIPTION'].apply(preprocess_text)

print(df[['TRANSCRIPTION', 'processed_text']].head())

                                       TRANSCRIPTION  \
0               ఎస్సీలుగా పుట్టాలని ఎవరు కోరుకుంటారు   
1  ఎవరు మాత్రం SC కులంలో పుట్టాలని కోరుకుంటారు అం...   
2        ఎవరు మాత్రం SC కులంలో పుట్టాలని కోరుకుంటారు   
3  ఎవరు మాత్రం SC కులంలో పుట్టాలని కోరుకుంటారు డబ...   
4  అందరూ రాజుల కులంలో పుడితే రాజ్యాన్ని ఎలచ్చనుకు...   

                                      processed_text  
0                              ఎససలగ పటటలన ఎవర కరకటర  
1  ఎవర మతర sc కలల పటటలన కరకటర అదర సపనన వరగలలన పటట...  
2                         ఎవర మతర sc కలల పటటలన కరకటర  
3  ఎవర మతర sc కలల పటటలన కరకటర డబబల లకపత అదర సపనన ...  
4                     అదర రజల కలల పడత రజయనన ఎలచచనకటర  


##  **NGRAM**

In [120]:
import random
from sklearn.feature_extraction.text import CountVectorizer

# Define the vectorizer for unigrams, bigrams, and trigrams
vectorizer = CountVectorizer(ngram_range=(1, 3))
X = vectorizer.fit_transform(df['processed_text'])

# Get the n-gram features (vocabulary)
n_gram_features = vectorizer.get_feature_names_out()

# Get the total number of unique n-grams
total_unique_ngrams = len(n_gram_features)

# Print the total count of unique n-grams (vocabulary size)
print(f"Total unique n-grams (vocabulary size): {total_unique_ngrams}")

# Select 10 random n-grams from the vocabulary
random_ngrams = random.sample(list(n_gram_features), 10)

# Print the 10 random n-grams
print("Random sample of 10 n-grams:")
for ngram in random_ngrams:
    print(ngram)


Total unique n-grams (vocabulary size): 28832
Random sample of 10 n-grams:
టవ సనమల నషధ
పరమల మరచన
తవరగ చపతననవ
కన వనయ గపపద
నటసత గరలఫరడ దగతడ
సతర ఉనన
అనక నన
మర పటచకర
మద మణల వచచన
కటరకట పయన ఉట


## **Bag of Words**

In [121]:
# Check if there are any empty processed_text rows
empty_rows = df['processed_text'].apply(lambda x: len(x.strip()) == 0).sum()

print(f"Number of empty rows after preprocessing: {empty_rows}")

# Check the first few rows of processed_text to verify proper preprocessing
print(df['processed_text'].head())

# If everything looks good, use CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

bow_vectorizer = CountVectorizer()

# Fit the vectorizer and transform the text data
bow_matrix = bow_vectorizer.fit_transform(df['processed_text'])

# If the matrix still has zeros, print the sum of each row
print(bow_matrix.toarray())

# Display any row sums that are 0 (meaning no words were found in that document)
row_sums = bow_matrix.toarray().sum(axis=1)
print(f"Rows with zero sum (no words found): {sum(row_sums == 0)}")


Number of empty rows after preprocessing: 0
0                                ఎససలగ పటటలన ఎవర కరకటర
1    ఎవర మతర sc కలల పటటలన కరకటర అదర సపనన వరగలలన పటట...
2                           ఎవర మతర sc కలల పటటలన కరకటర
3    ఎవర మతర sc కలల పటటలన కరకటర డబబల లకపత అదర సపనన ...
4                       అదర రజల కలల పడత రజయనన ఎలచచనకటర
Name: processed_text, dtype: object
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Rows with zero sum (no words found): 0


## **Word Tokenization**

In [122]:
# Tokenization using NLTK
df['tokenized_text'] = df['processed_text'].apply(word_tokenize)

print(df[['processed_text', 'tokenized_text']].head())


                                      processed_text  \
0                              ఎససలగ పటటలన ఎవర కరకటర   
1  ఎవర మతర sc కలల పటటలన కరకటర అదర సపనన వరగలలన పటట...   
2                         ఎవర మతర sc కలల పటటలన కరకటర   
3  ఎవర మతర sc కలల పటటలన కరకటర డబబల లకపత అదర సపనన ...   
4                     అదర రజల కలల పడత రజయనన ఎలచచనకటర   

                                      tokenized_text  
0                         [ఎససలగ, పటటలన, ఎవర, కరకటర]  
1  [ఎవర, మతర, sc, కలల, పటటలన, కరకటర, అదర, సపనన, వ...  
2                  [ఎవర, మతర, sc, కలల, పటటలన, కరకటర]  
3  [ఎవర, మతర, sc, కలల, పటటలన, కరకటర, డబబల, లకపత, ...  
4              [అదర, రజల, కలల, పడత, రజయనన, ఎలచచనకటర]  


## **FOR BINARY CLASSIFICATION(M-BERT)**

In [123]:
# Import necessary libraries
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [124]:
from transformers import BertForSequenceClassification

# Define the model with 5 labels (Non-hate + 4 hate categories)
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=5)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [125]:
class HateSpeechDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len, binary=False):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.binary = binary  # Flag to indicate binary or multi-class

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenization
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Label: For binary, 'NH' as 0, 'H' as 1. Multi-class uses label mapping
        label_value = torch.tensor(1 if label == 'H' else 0, dtype=torch.long) if self.binary else torch.tensor(label, dtype=torch.long)

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': label_value
        }


In [126]:
# Set up device for GPU if available, else use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [127]:
# Binary classification model setup with two labels (Hate vs. Non-Hate)
binary_model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
binary_model = binary_model.to(device)
binary_optimizer = torch.optim.AdamW(binary_model.parameters(), lr=1e-5)

# Split dataset for binary classification
binary_train_texts, binary_test_texts, binary_train_labels, binary_test_labels = train_test_split(df['TRANSCRIPTION'], df['BINARY_LABEL'], test_size=0.2)

# Binary Dataset and DataLoader
binary_train_dataset = HateSpeechDataset(binary_train_texts.tolist(), binary_train_labels.tolist(), tokenizer, max_len=128, binary=True)
binary_test_dataset = HateSpeechDataset(binary_test_texts.tolist(), binary_test_labels.tolist(), tokenizer, max_len=128, binary=True)

binary_train_loader = DataLoader(binary_train_dataset, batch_size=16, shuffle=True)
binary_test_loader = DataLoader(binary_test_dataset, batch_size=16)

# Binary Classification Training Loop
for epoch in range(25):  # You can adjust the number of epochs
    binary_model.train()
    for batch in binary_train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        binary_optimizer.zero_grad()
        outputs = binary_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        binary_optimizer.step()

    print(f"Binary Classification Epoch {epoch + 1} finished")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Binary Classification Epoch 1 finished
Binary Classification Epoch 2 finished
Binary Classification Epoch 3 finished
Binary Classification Epoch 4 finished
Binary Classification Epoch 5 finished
Binary Classification Epoch 6 finished
Binary Classification Epoch 7 finished
Binary Classification Epoch 8 finished
Binary Classification Epoch 9 finished
Binary Classification Epoch 10 finished
Binary Classification Epoch 11 finished
Binary Classification Epoch 12 finished
Binary Classification Epoch 13 finished
Binary Classification Epoch 14 finished
Binary Classification Epoch 15 finished
Binary Classification Epoch 16 finished
Binary Classification Epoch 17 finished
Binary Classification Epoch 18 finished
Binary Classification Epoch 19 finished
Binary Classification Epoch 20 finished
Binary Classification Epoch 21 finished
Binary Classification Epoch 22 finished
Binary Classification Epoch 23 finished
Binary Classification Epoch 24 finished
Binary Classification Epoch 25 finished


In [128]:
# Testing Binary Classification Model
binary_test_preds = []
binary_test_labels_eval = []

binary_model.eval()
with torch.no_grad():
    for batch in binary_test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = binary_model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        binary_test_preds.extend(preds)
        binary_test_labels_eval.extend(labels.cpu().numpy())

# Evaluate binary accuracy
binary_accuracy = accuracy_score(binary_test_labels_eval, binary_test_preds)
print(f"Binary Classification Accuracy: {binary_accuracy}")
print(classification_report(binary_test_labels_eval, binary_test_preds, target_names=['Non-Hate', 'Hate']))


Binary Classification Accuracy: 0.8512396694214877
              precision    recall  f1-score   support

    Non-Hate       0.72      0.80      0.76        35
        Hate       0.91      0.87      0.89        86

    accuracy                           0.85       121
   macro avg       0.82      0.84      0.82       121
weighted avg       0.86      0.85      0.85       121



## **MULTI-CLASS(4HATE AND 1 NON-HATE) M-BERT**

In [129]:
# No need for binary label anymore. Use 'SHORT LABLE' as is.
# Clean the 'SHORT LABLE' column for binary classification
df['SHORT LABLE'] = df['SHORT LABLE'].str.strip()  # Remove extra spaces

# Binary classification: 'H' (Hate) for labels ['C', 'G', 'P', 'R'], 'N' (Non-hate) otherwise
df['BINARY_LABEL'] = df['SHORT LABLE'].apply(lambda x: 'H' if x in ['C', 'G', 'P', 'R'] else 'N')


In [130]:
non_hate_count = df[df['SHORT LABLE'] == 'N'].shape[0]  # Count non-hate
hate_count = df[df['SHORT LABLE'].isin(['C', 'G', 'P', 'R'])].shape[0]  # Count hate categories

# Print the counts
print(f"Number of Non-hate entries: {non_hate_count}")
print(f"Number of Hate entries (C, G, P, R): {hate_count}")

Number of Non-hate entries: 208
Number of Hate entries (C, G, P, R): 393


In [131]:
# Count each hate category separately
char_assassination_count = df[df['SHORT LABLE'] == 'C'].shape[0]  # Character Assassination
gender_count = df[df['SHORT LABLE'] == 'G'].shape[0]  # Gender/Sex based
political_count = df[df['SHORT LABLE'] == 'P'].shape[0]  # Political
religion_count = df[df['SHORT LABLE'] == 'R'].shape[0]  # Religion

print(f"Number of Character Assassination entries (C): {char_assassination_count}")
print(f"Number of Gender/Sex based entries (G): {gender_count}")
print(f"Number of Political entries (P): {political_count}")
print(f"Number of Religion entries (R): {religion_count}")


Number of Character Assassination entries (C): 132
Number of Gender/Sex based entries (G): 111
Number of Political entries (P): 68
Number of Religion entries (R): 82


In [132]:
class HateSpeechDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenization
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)  # Label is now an integer for 5 classes
        }


In [133]:
from transformers import BertForSequenceClassification

# Define the model with 5 labels (Non-hate + 4 hate categories)
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=5)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [134]:
# Set up optimizer and device
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)


In [135]:
# Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(df['TRANSCRIPTION'], df['SHORT LABLE'], test_size=0.2)

# Define label mapping for all 5 categories
label_mapping = {
    'N': 0,  # Non-hate
    'C': 1,  # Character Assassination
    'G': 2,  # Gender/Sex based
    'P': 3,  # Political
    'R': 4   # Religion
}

# Convert labels to integers for training
train_labels = train_labels.map(label_mapping)
test_labels = test_labels.map(label_mapping)

# Prepare train and test datasets using the HateSpeechDataset class
train_dataset = HateSpeechDataset(train_texts.tolist(), train_labels.tolist(), tokenizer, max_len=128)
test_dataset = HateSpeechDataset(test_texts.tolist(), test_labels.tolist(), tokenizer, max_len=128)


In [136]:
# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


In [137]:
# Training loop
for epoch in range(25):  # Train for 3 epochs
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1} finished")


Epoch 1 finished
Epoch 2 finished
Epoch 3 finished
Epoch 4 finished
Epoch 5 finished
Epoch 6 finished
Epoch 7 finished
Epoch 8 finished
Epoch 9 finished
Epoch 10 finished
Epoch 11 finished
Epoch 12 finished
Epoch 13 finished
Epoch 14 finished
Epoch 15 finished
Epoch 16 finished
Epoch 17 finished
Epoch 18 finished
Epoch 19 finished
Epoch 20 finished
Epoch 21 finished
Epoch 22 finished
Epoch 23 finished
Epoch 24 finished
Epoch 25 finished


In [138]:
from sklearn.metrics import accuracy_score, classification_report

# Reverse label mapping: convert integers back to original labels
label_mapping_reverse = {0: 'N', 1: 'C', 2: 'G', 3: 'P', 4: 'R'}

# Testing loop: Generate test_preds and test_labels_eval
test_preds = []
test_labels_eval = []

model.eval()
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        test_preds.extend(preds)
        test_labels_eval.extend(labels.cpu().numpy())

# Convert integer labels back to original string labels for test data
test_preds = [label_mapping_reverse[pred] for pred in test_preds]
test_labels_eval = [label_mapping_reverse[label] for label in test_labels_eval]

# Check unique classes in the true labels and predictions
print(f"Unique classes in test_labels_eval: {set(test_labels_eval)}")
print(f"Unique classes in test_preds: {set(test_preds)}")

# Define the expected labels (all 5 classes)
expected_labels = ['N', 'C', 'G', 'P', 'R']

# Evaluate accuracy on testing data
test_accuracy = accuracy_score(test_labels_eval, test_preds)
print(f'Test Data Accuracy: {test_accuracy}')

# Generate classification report and handle missing classes
print(classification_report(test_labels_eval, test_preds, labels=expected_labels, target_names=expected_labels))

Unique classes in test_labels_eval: {'G', 'N', 'C', 'P', 'R'}
Unique classes in test_preds: {'G', 'N', 'C', 'P', 'R'}
Test Data Accuracy: 0.7603305785123967
              precision    recall  f1-score   support

           N       0.74      0.84      0.79        38
           C       0.81      0.76      0.79        29
           G       0.67      0.64      0.65        22
           P       0.86      0.67      0.75        18
           R       0.75      0.86      0.80        14

    accuracy                           0.76       121
   macro avg       0.77      0.75      0.76       121
weighted avg       0.76      0.76      0.76       121



### **FOR BINARY CLASSIFICATION(XLM-Roberta)**

In [139]:
# Import necessary libraries
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [140]:
# Load your dataset from an Excel file
df = pd.read_excel('/kaggle/input/dataset1/TELUGU_METADATA.xlsx')

# Binary classification based on 'AUDIO FILE NAME'
df['BINARY_LABEL'] = df['AUDIO FILE NAME'].apply(lambda x: 'H' if x.startswith('H') else 'NH')

# Print counts for each class
print("Binary Label Counts:")
print(df['BINARY_LABEL'].value_counts())

Binary Label Counts:
BINARY_LABEL
H     392
NH    209
Name: count, dtype: int64


In [141]:
from transformers import XLMRobertaTokenizer, BertForSequenceClassification

# Define the model with 5 labels (Non-hate + 4 hate categories)
model = BertForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=5)
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

You are using a model of type xlm-roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.l

In [142]:
class HateSpeechDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len, binary=False):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.binary = binary  # Flag to indicate binary or multi-class

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenization
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Label: For binary, 'NH' as 0, 'H' as 1. Multi-class uses label mapping
        label_value = torch.tensor(1 if label == 'H' else 0, dtype=torch.long) if self.binary else torch.tensor(label, dtype=torch.long)

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': label_value
        }


In [143]:
# Set up device for GPU if available, else use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [144]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

# Define the tokenizer and model for binary classification (Hate vs. Non-Hate)
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
binary_model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=2)
binary_model = binary_model.to(device)

# Set up the optimizer
binary_optimizer = torch.optim.AdamW(binary_model.parameters(), lr=1e-5)

# Split dataset for binary classification
binary_train_texts, binary_test_texts, binary_train_labels, binary_test_labels = train_test_split(
    df['TRANSCRIPTION'], df['BINARY_LABEL'], test_size=0.2
)

# Binary Dataset and DataLoader
binary_train_dataset = HateSpeechDataset(binary_train_texts.tolist(), binary_train_labels.tolist(), tokenizer, max_len=128, binary=True)
binary_test_dataset = HateSpeechDataset(binary_test_texts.tolist(), binary_test_labels.tolist(), tokenizer, max_len=128, binary=True)

binary_train_loader = DataLoader(binary_train_dataset, batch_size=16, shuffle=True)
binary_test_loader = DataLoader(binary_test_dataset, batch_size=16)



Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [145]:
# Binary Classification Training Loop
for epoch in range(25):  # You can adjust the number of epochs
    binary_model.train()
    for batch in binary_train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        binary_optimizer.zero_grad()
        outputs = binary_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        binary_optimizer.step()

    print(f"Epoch {epoch + 1}")

Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25


In [146]:
# Testing Binary Classification Model
binary_test_preds = []
binary_test_labels_eval = []

binary_model.eval()
with torch.no_grad():
    for batch in binary_test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = binary_model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        binary_test_preds.extend(preds)
        binary_test_labels_eval.extend(labels.cpu().numpy())

# Evaluate binary accuracy
binary_accuracy = accuracy_score(binary_test_labels_eval, binary_test_preds)
print(f"Binary Classification Accuracy: {binary_accuracy}")
print(classification_report(binary_test_labels_eval, binary_test_preds, target_names=['Non-Hate', 'Hate']))


Binary Classification Accuracy: 0.9008264462809917
              precision    recall  f1-score   support

    Non-Hate       0.84      0.88      0.86        41
        Hate       0.94      0.91      0.92        80

    accuracy                           0.90       121
   macro avg       0.89      0.90      0.89       121
weighted avg       0.90      0.90      0.90       121



## **MULTI-CLASS(4HATE AND 1 NON-HATE) XLM-RoBERT**

In [147]:
# No need for binary label anymore. Use 'SHORT LABLE' as is.
# Clean the 'SHORT LABLE' column for binary classification
df['SHORT LABLE'] = df['SHORT LABLE'].str.strip()  # Remove extra spaces

# Binary classification: 'H' (Hate) for labels ['C', 'G', 'P', 'R'], 'N' (Non-hate) otherwise
df['BINARY_LABEL'] = df['SHORT LABLE'].apply(lambda x: 'H' if x in ['C', 'G', 'P', 'R'] else 'N')


In [148]:
non_hate_count = df[df['SHORT LABLE'] == 'N'].shape[0]  # Count non-hate
hate_count = df[df['SHORT LABLE'].isin(['C', 'G', 'P', 'R'])].shape[0]  # Count hate categories

# Print the counts
print(f"Number of Non-hate entries: {non_hate_count}")
print(f"Number of Hate entries (C, G, P, R): {hate_count}")

Number of Non-hate entries: 208
Number of Hate entries (C, G, P, R): 393


In [149]:
# Count each hate category separately
char_assassination_count = df[df['SHORT LABLE'] == 'C'].shape[0]  # Character Assassination
gender_count = df[df['SHORT LABLE'] == 'G'].shape[0]  # Gender/Sex based
political_count = df[df['SHORT LABLE'] == 'P'].shape[0]  # Political
religion_count = df[df['SHORT LABLE'] == 'R'].shape[0]  # Religion

print(f"Number of Character Assassination entries (C): {char_assassination_count}")
print(f"Number of Gender/Sex based entries (G): {gender_count}")
print(f"Number of Political entries (P): {political_count}")
print(f"Number of Religion entries (R): {religion_count}")


Number of Character Assassination entries (C): 132
Number of Gender/Sex based entries (G): 111
Number of Political entries (P): 68
Number of Religion entries (R): 82


In [150]:
class HateSpeechDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenization
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)  # Label is now an integer for 5 classes
        }


In [151]:
from transformers import BertForSequenceClassification

# Define the model with 5 labels (Non-hate + 4 hate categories)
model = BertForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=5)
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')


You are using a model of type xlm-roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.l

In [152]:
# Set up optimizer and device
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)


In [153]:
# Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(df['TRANSCRIPTION'], df['SHORT LABLE'], test_size=0.2)

# Define label mapping for all 5 categories
label_mapping = {
    'N': 0,  # Non-hate
    'C': 1,  # Character Assassination
    'G': 2,  # Gender/Sex based
    'P': 3,  # Political
    'R': 4   # Religion
}

# Convert labels to integers for training
train_labels = train_labels.map(label_mapping)
test_labels = test_labels.map(label_mapping)

# Prepare train and test datasets using the HateSpeechDataset class
train_dataset = HateSpeechDataset(train_texts.tolist(), train_labels.tolist(), tokenizer, max_len=128)
test_dataset = HateSpeechDataset(test_texts.tolist(), test_labels.tolist(), tokenizer, max_len=128)


In [154]:
# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)


In [155]:
# Training loop
for epoch in range(25):  
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1} finished")


Epoch 1 finished
Epoch 2 finished
Epoch 3 finished
Epoch 4 finished
Epoch 5 finished
Epoch 6 finished
Epoch 7 finished
Epoch 8 finished
Epoch 9 finished
Epoch 10 finished
Epoch 11 finished
Epoch 12 finished
Epoch 13 finished
Epoch 14 finished
Epoch 15 finished
Epoch 16 finished
Epoch 17 finished
Epoch 18 finished
Epoch 19 finished
Epoch 20 finished
Epoch 21 finished
Epoch 22 finished
Epoch 23 finished
Epoch 24 finished
Epoch 25 finished


In [156]:
from sklearn.metrics import accuracy_score, classification_report

# Reverse label mapping: convert integers back to original labels
label_mapping_reverse = {0: 'N', 1: 'C', 2: 'G', 3: 'P', 4: 'R'}

# Testing loop: Generate test_preds and test_labels_eval
test_preds = []
test_labels_eval = []

model.eval()
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        test_preds.extend(preds)
        test_labels_eval.extend(labels.cpu().numpy())

# Convert integer labels back to original string labels for test data
test_preds = [label_mapping_reverse[pred] for pred in test_preds]
test_labels_eval = [label_mapping_reverse[label] for label in test_labels_eval]

# Check unique classes in the true labels and predictions
print(f"Unique classes in test_labels_eval: {set(test_labels_eval)}")
print(f"Unique classes in test_preds: {set(test_preds)}")

# Define the expected labels (all 5 classes)
expected_labels = ['N', 'C', 'G', 'P', 'R']

# Evaluate accuracy on testing data
test_accuracy = accuracy_score(test_labels_eval, test_preds)
print(f'Test Data Accuracy: {test_accuracy}')

# Generate classification report and handle missing classes
print(classification_report(test_labels_eval, test_preds, labels=expected_labels, target_names=expected_labels))

Unique classes in test_labels_eval: {'G', 'N', 'C', 'P', 'R'}
Unique classes in test_preds: {'G', 'N', 'C', 'P', 'R'}
Test Data Accuracy: 0.6446280991735537
              precision    recall  f1-score   support

           N       0.64      0.85      0.73        46
           C       0.71      0.81      0.76        21
           G       0.62      0.53      0.57        19
           P       0.53      0.57      0.55        14
           R       0.80      0.19      0.31        21

    accuracy                           0.64       121
   macro avg       0.66      0.59      0.58       121
weighted avg       0.66      0.64      0.62       121



### ****BINARY CLASSIFICATION(XGBOOST with BERT)****

In [157]:
# Import necessary libraries
import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report


In [158]:
# Load your dataset from an Excel file
df = pd.read_excel('/kaggle/input/dataset1/TELUGU_METADATA.xlsx')  # Adjust path as necessary

# Create binary labels from AUDIO FILE NAME (Hate vs. Non-hate)
df['BINARY_LABEL'] = df['AUDIO FILE NAME'].apply(lambda x: 1 if x.startswith('H_') else 0)

# Split dataset for binary classification
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['TRANSCRIPTION'], df['BINARY_LABEL'], test_size=0.2, random_state=42)


In [159]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')


In [160]:
# Function to extract BERT embeddings for each sentence
def extract_bert_embeddings(texts, tokenizer, model, max_len=128):
    model.eval()
    embeddings = []
    with torch.no_grad():
        for text in texts:
            # Tokenize and encode the text for BERT
            inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=max_len)
            outputs = model(**inputs)
            # CLS token representation as the sentence embedding
            embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
    return np.concatenate(embeddings, axis=0)

# Extract BERT embeddings for train and test sets
train_embeddings = extract_bert_embeddings(train_texts.tolist(), tokenizer, bert_model)
test_embeddings = extract_bert_embeddings(test_texts.tolist(), tokenizer, bert_model)


In [161]:
# Initialize XGBoost classifier for binary classification
xgb_model = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss')

# Train XGBoost on the BERT embeddings
xgb_model.fit(train_embeddings, train_labels)


In [162]:
# Predict on the test embeddings
test_preds = xgb_model.predict(test_embeddings)

# Evaluate the model performance
accuracy = accuracy_score(test_labels, test_preds)
report = classification_report(test_labels, test_preds, target_names=["Non-Hate", "Hate"])

# Print accuracy and classification report
print(f"Test Accuracy: {accuracy}")
print(report)


Test Accuracy: 0.7933884297520661
              precision    recall  f1-score   support

    Non-Hate       0.74      0.62      0.68        42
        Hate       0.81      0.89      0.85        79

    accuracy                           0.79       121
   macro avg       0.78      0.75      0.76       121
weighted avg       0.79      0.79      0.79       121



## **MULTI-CLASS(4HATE AND 1 NON-HATE) XLM-RoBERT**

In [163]:
# Import necessary libraries
import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report


In [164]:
# Define multiclass labels based on 'SHORT LABLE' column
label_mapping = {'N': 0, 'C': 1, 'G': 2, 'P': 3, 'R': 4}  # Modify as needed for your classes
df['MULTI_LABEL'] = df['SHORT LABLE'].map(label_mapping)

# Split dataset for multiclass classification
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['TRANSCRIPTION'], df['MULTI_LABEL'], test_size=0.2, random_state=42)

In [165]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')


In [166]:
# Function to extract BERT embeddings for each sentence
def extract_bert_embeddings(texts, tokenizer, model, max_len=128):
    model.eval()
    embeddings = []
    with torch.no_grad():
        for text in texts:
            # Tokenize and encode the text for BERT
            inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=max_len)
            outputs = model(**inputs)
            # CLS token representation as the sentence embedding
            embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
    return np.concatenate(embeddings, axis=0)

# Extract BERT embeddings for train and test sets
train_embeddings = extract_bert_embeddings(train_texts.tolist(), tokenizer, bert_model)
test_embeddings = extract_bert_embeddings(test_texts.tolist(), tokenizer, bert_model)


In [167]:
# Initialize XGBoost classifier for multiclass classification
xgb_model = XGBClassifier(n_estimators=500, use_label_encoder=False, eval_metric='mlogloss', objective='multi:softmax', num_class=len(label_mapping))

# Train XGBoost on the BERT embeddings
xgb_model.fit(train_embeddings, train_labels)


In [168]:
# Predict on the test embeddings
test_preds = xgb_model.predict(test_embeddings)

# Evaluate the model performance
accuracy = accuracy_score(test_labels, test_preds)
report = classification_report(test_labels, test_preds, target_names=list(label_mapping.keys()))

# Print accuracy and classification report
print(f"Test Accuracy: {accuracy}")
print(report)


Test Accuracy: 0.5537190082644629
              precision    recall  f1-score   support

           N       0.55      0.74      0.63        42
           C       0.54      0.64      0.58        22
           G       0.44      0.35      0.39        23
           P       0.75      0.40      0.52        15
           R       0.62      0.42      0.50        19

    accuracy                           0.55       121
   macro avg       0.58      0.51      0.53       121
weighted avg       0.56      0.55      0.54       121



In [193]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load dataset and set up binary labels
df = pd.read_excel('/kaggle/input/dataset1/TELUGU_METADATA.xlsx')
df['BINARY_LABEL'] = df['AUDIO FILE NAME'].apply(lambda x: 'H' if x.startswith('H') else 'NH')

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['TRANSCRIPTION']).toarray()
y = df['BINARY_LABEL'].map({'NH': 0, 'H': 1})

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# SVM Model Training
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Prediction and evaluation
y_pred = svm_model.predict(X_test)
print(f"SVM Binary Classification Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred, target_names=['Non-Hate', 'Hate']))

# Transformer-based binary classification using XLM-Roberta
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification




SVM Binary Classification Accuracy: 0.768595041322314
              precision    recall  f1-score   support

    Non-Hate       0.67      0.60      0.63        40
        Hate       0.81      0.85      0.83        81

    accuracy                           0.77       121
   macro avg       0.74      0.73      0.73       121
weighted avg       0.76      0.77      0.77       121



In [187]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Assuming df is your DataFrame with columns "TRANSCRIPTION" and "SHORT LABLE"
# Define label mapping for all 5 categories
label_mapping = {'N': 0, 'C': 1, 'G': 2, 'P': 3, 'R': 4}
df['SHORT LABLE'] = df['SHORT LABLE'].map(label_mapping)

# Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['TRANSCRIPTION'], df['SHORT LABLE'], test_size=0.2, random_state=42)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_texts)

# SVM Classifier
svm_model = SVC(kernel='linear', C=1)
svm_model.fit(X_train_tfidf, train_labels)

# Predict and Evaluate
tfidf_svm_preds = svm_model.predict(X_test_tfidf)
tfidf_svm_accuracy = accuracy_score(test_labels, tfidf_svm_preds)
print(f"TF-IDF + SVM Accuracy: {tfidf_svm_accuracy}")
print(classification_report(test_labels, tfidf_svm_preds, target_names=label_mapping.keys()))


TF-IDF + SVM Accuracy: 0.5454545454545454
              precision    recall  f1-score   support

           N       0.50      0.88      0.64        42
           C       0.55      0.55      0.55        22
           G       0.64      0.39      0.49        23
           P       0.67      0.13      0.22        15
           R       0.75      0.32      0.44        19

    accuracy                           0.55       121
   macro avg       0.62      0.45      0.47       121
weighted avg       0.60      0.55      0.51       121



In [194]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load dataset and set up binary labels
df = pd.read_excel('/kaggle/input/dataset1/TELUGU_METADATA.xlsx')
df['BINARY_LABEL'] = df['AUDIO FILE NAME'].apply(lambda x: 'H' if x.startswith('H') else 'NH')

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['TRANSCRIPTION']).toarray()
y = df['BINARY_LABEL'].map({'NH': 0, 'H': 1})

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression Model Training
log_reg_model = LogisticRegression(max_iter=1000)
log_reg_model.fit(X_train, y_train)

# Prediction and evaluation
y_pred = log_reg_model.predict(X_test)
print(f"Logistic Regression Binary Classification Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred, target_names=['Non-Hate', 'Hate']))


Logistic Regression Binary Classification Accuracy: 0.7520661157024794
              precision    recall  f1-score   support

    Non-Hate       0.88      0.33      0.48        42
        Hate       0.73      0.97      0.84        79

    accuracy                           0.75       121
   macro avg       0.80      0.65      0.66       121
weighted avg       0.78      0.75      0.71       121



In [195]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Model Training
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train, y_train)

# Prediction and evaluation
y_pred = rf_model.predict(X_test)
print(f"Random Forest Binary Classification Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred, target_names=['Non-Hate', 'Hate']))


Random Forest Binary Classification Accuracy: 0.8099173553719008
              precision    recall  f1-score   support

    Non-Hate       0.95      0.48      0.63        42
        Hate       0.78      0.99      0.87        79

    accuracy                           0.81       121
   macro avg       0.87      0.73      0.75       121
weighted avg       0.84      0.81      0.79       121



In [196]:
from sklearn.neighbors import KNeighborsClassifier

# K-Nearest Neighbors Model Training
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Prediction and evaluation
y_pred = knn_model.predict(X_test)
print(f"K-Nearest Neighbors Binary Classification Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred, target_names=['Non-Hate', 'Hate']))


K-Nearest Neighbors Binary Classification Accuracy: 0.7272727272727273
              precision    recall  f1-score   support

    Non-Hate       0.91      0.24      0.38        42
        Hate       0.71      0.99      0.83        79

    accuracy                           0.73       121
   macro avg       0.81      0.61      0.60       121
weighted avg       0.78      0.73      0.67       121



In [199]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree Model Training
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

# Prediction and evaluation
y_pred = dt_model.predict(X_test)
print(f"Decision Tree Binary Classification Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred, target_names=['Non-Hate', 'Hate']))


Decision Tree Binary Classification Accuracy: 0.7520661157024794
              precision    recall  f1-score   support

    Non-Hate       0.70      0.50      0.58        42
        Hate       0.77      0.89      0.82        79

    accuracy                           0.75       121
   macro avg       0.73      0.69      0.70       121
weighted avg       0.75      0.75      0.74       121

[CV] END svm__C=1, svm__degree=3, svm__gamma=auto, svm__kernel=rbf, tfidf__max_features=10000, tfidf__ngram_range=(1, 2); total time=   0.1s
[CV] END svm__C=1, svm__degree=3, svm__gamma=auto, svm__kernel=poly, tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time=   0.1s
[CV] END svm__C=1, svm__degree=3, svm__gamma=auto, svm__kernel=poly, tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time=   0.1s
[CV] END svm__C=1, svm__degree=3, svm__gamma=auto, svm__kernel=poly, tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time=   0.1s
[CV] END svm__C=1, svm__degree=3, svm__gamma

In [201]:
# Define label mapping for all 5 categories
label_mapping = {'N': 0, 'C': 1, 'G': 2, 'P': 3, 'R': 4}

# Ensure the column name is correct
df['SHORT LABLE'] = df['SHORT LABLE'].map(label_mapping)

# Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['TRANSCRIPTION'], df['SHORT LABLE'], test_size=0.2, random_state=42)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_texts)

# SVM Classifier
svm_model = SVC(kernel='linear', C=1)
svm_model.fit(X_train_tfidf, train_labels)

# Predict and Evaluate
tfidf_svm_preds = svm_model.predict(X_test_tfidf)
tfidf_svm_accuracy = accuracy_score(test_labels, tfidf_svm_preds)
print(f"TF-IDF + SVM Accuracy: {tfidf_svm_accuracy}")
print(classification_report(test_labels, tfidf_svm_preds, target_names=label_mapping.keys()))


TF-IDF + SVM Accuracy: 0.5454545454545454
              precision    recall  f1-score   support

           N       0.50      0.88      0.64        42
           C       0.55      0.55      0.55        22
           G       0.64      0.39      0.49        23
           P       0.67      0.13      0.22        15
           R       0.75      0.32      0.44        19

    accuracy                           0.55       121
   macro avg       0.62      0.45      0.47       121
weighted avg       0.60      0.55      0.51       121



In [208]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load dataset and apply label mapping for multi-class classification
df = pd.read_excel('/kaggle/input/dataset1/TELUGU_METADATA.xlsx')
label_mapping = {'N': 0, 'C': 1, 'G': 2, 'P': 3, 'R': 4}
df['SHORT LABLE'] = df['SHORT LABLE'].map(label_mapping)

# Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['TRANSCRIPTION'], df['SHORT LABLE'], test_size=0.2, random_state=42
)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_texts)

# Logistic Regression Classifier
log_reg_model = LogisticRegression(max_iter=1000)
log_reg_model.fit(X_train_tfidf, train_labels)

# Predict and Evaluate
tfidf_log_reg_preds = log_reg_model.predict(X_test_tfidf)
tfidf_log_reg_accuracy = accuracy_score(test_labels, tfidf_log_reg_preds)
print(f"TF-IDF + Logistic Regression Accuracy: {tfidf_log_reg_accuracy}")
print(classification_report(test_labels, tfidf_log_reg_preds, target_names=label_mapping.keys()))


TF-IDF + Logistic Regression Accuracy: 0.49586776859504134
              precision    recall  f1-score   support

           N       0.43      0.95      0.59        42
           C       0.65      0.50      0.56        22
           G       0.75      0.26      0.39        23
           P       1.00      0.07      0.12        15
           R       1.00      0.11      0.19        19

    accuracy                           0.50       121
   macro avg       0.77      0.38      0.37       121
weighted avg       0.69      0.50      0.43       121



In [209]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load dataset and apply label mapping for multi-class classification
df = pd.read_excel('/kaggle/input/dataset1/TELUGU_METADATA.xlsx')
label_mapping = {'N': 0, 'C': 1, 'G': 2, 'P': 3, 'R': 4}
df['SHORT LABLE'] = df['SHORT LABLE'].map(label_mapping)

# Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['TRANSCRIPTION'], df['SHORT LABLE'], test_size=0.2, random_state=42
)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_texts)

# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, train_labels)

# Predict and Evaluate
tfidf_rf_preds = rf_model.predict(X_test_tfidf)
tfidf_rf_accuracy = accuracy_score(test_labels, tfidf_rf_preds)
print(f"TF-IDF + Random Forest Accuracy: {tfidf_rf_accuracy}")
print(classification_report(test_labels, tfidf_rf_preds, target_names=label_mapping.keys()))


TF-IDF + Random Forest Accuracy: 0.4793388429752066
              precision    recall  f1-score   support

           N       0.65      0.74      0.69        42
           C       0.31      0.64      0.42        22
           G       0.38      0.35      0.36        23
           P       0.67      0.13      0.22        15
           R       0.75      0.16      0.26        19

    accuracy                           0.48       121
   macro avg       0.55      0.40      0.39       121
weighted avg       0.55      0.48      0.45       121



In [211]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load dataset and apply label mapping for multi-class classification
df = pd.read_excel('/kaggle/input/dataset1/TELUGU_METADATA.xlsx')
label_mapping = {'N': 0, 'C': 1, 'G': 2, 'P': 3, 'R': 4}
df['SHORT LABLE'] = df['SHORT LABLE'].map(label_mapping)

# Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['TRANSCRIPTION'], df['SHORT LABLE'], test_size=0.2, random_state=42
)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_texts)

# K-Nearest Neighbors Classifier
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_tfidf, train_labels)

# Predict and Evaluate
tfidf_knn_preds = knn_model.predict(X_test_tfidf)
tfidf_knn_accuracy = accuracy_score(test_labels, tfidf_knn_preds)
print(f"TF-IDF + K-Nearest Neighbors Accuracy: {tfidf_knn_accuracy}")
print(classification_report(test_labels, tfidf_knn_preds, target_names=label_mapping.keys()))


TF-IDF + K-Nearest Neighbors Accuracy: 0.4132231404958678
              precision    recall  f1-score   support

           N       0.69      0.52      0.59        42
           C       0.26      0.50      0.34        22
           G       0.37      0.61      0.46        23
           P       0.33      0.13      0.19        15
           R       0.33      0.05      0.09        19

    accuracy                           0.41       121
   macro avg       0.40      0.36      0.34       121
weighted avg       0.45      0.41      0.39       121



In [212]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load dataset and apply label mapping for multi-class classification
df = pd.read_excel('/kaggle/input/dataset1/TELUGU_METADATA.xlsx')
label_mapping = {'N': 0, 'C': 1, 'G': 2, 'P': 3, 'R': 4}
df['SHORT LABLE'] = df['SHORT LABLE'].map(label_mapping)

# Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['TRANSCRIPTION'], df['SHORT LABLE'], test_size=0.2, random_state=42
)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_texts)

# Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_tfidf, train_labels)

# Predict and Evaluate
tfidf_dt_preds = dt_model.predict(X_test_tfidf)
tfidf_dt_accuracy = accuracy_score(test_labels, tfidf_dt_preds)
print(f"TF-IDF + Decision Tree Accuracy: {tfidf_dt_accuracy}")
print(classification_report(test_labels, tfidf_dt_preds, target_names=label_mapping.keys()))


TF-IDF + Decision Tree Accuracy: 0.32231404958677684
              precision    recall  f1-score   support

           N       0.52      0.36      0.42        42
           C       0.25      0.55      0.34        22
           G       0.35      0.30      0.33        23
           P       0.07      0.07      0.07        15
           R       0.40      0.21      0.28        19

    accuracy                           0.32       121
   macro avg       0.32      0.30      0.29       121
weighted avg       0.36      0.32      0.32       121

