# Dataset Loading


In [18]:
import pandas as pd
import numpy as np

In [2]:
from utils.text_cleaning_utils import TextCleaner

def preprocess_text_for_svm(df, text_column):
    """
    Clean, tokenize, and embed a text column in preparation for an SVM model.

    The text is passed through the ``TextCleaner`` steps (digits,
    English/special characters, stopwords, emojis -- in that order),
    tokenized with ``SentencepieceTokenizer``, and every token is looked up
    in a ``BengaliFasttext`` model created inside this function.

    ``SentencepieceTokenizer`` and ``BengaliFasttext`` are resolved when the
    function is *called*, so the cell importing them from ``bnlp`` must have
    been executed before the first call.

    Parameters:
    -----------
    df : pandas DataFrame
        DataFrame containing the text data. It is not modified.
    text_column : str
        Name of the column containing text to be processed

    Returns:
    --------
    pandas DataFrame
        Copy of ``df`` where ``text_column`` holds the cleaned text, with two
        added columns: ``tokenized_text`` (list of tokens per row, empty for
        non-string values) and ``embeddings`` (list of per-token vectors per
        row).
    """
    # Initialize text cleaner
    text_cleaner = TextCleaner()

    # Apply text cleaning functions (order matters: each step sees the
    # output of the previous one)
    processed_text = (df[text_column]
                     .apply(text_cleaner.remove_digits)
                     .apply(text_cleaner.remove_english_and_special_chars)
                     .apply(text_cleaner.remove_stopwords)
                     .apply(text_cleaner.remove_emojis))

    # Work on a copy so the caller's DataFrame is left untouched
    processed_df = df.copy()
    processed_df[text_column] = processed_text

    # Tokenize the clean text; non-string cells (e.g. missing values) yield
    # an empty token list instead of raising
    tokenizer = SentencepieceTokenizer()
    processed_df['tokenized_text'] = processed_df[text_column].apply(
        lambda x: tokenizer.tokenize(x) if isinstance(x, str) else []
    )

    # Get embeddings for each token using the model created here.
    # The lookup is done locally rather than through the notebook-level
    # get_embeddings helper, whose one-argument signature does not accept a
    # model and would raise TypeError when given one.
    fast_text_embedding = BengaliFasttext()

    def _embed_tokens(tokens):
        """Return the vectors of all tokens the model can embed."""
        vectors = []
        for token in tokens:
            try:
                vectors.append(fast_text_embedding.get_word_vector(token))
            except Exception:
                # Best-effort: skip tokens the model cannot embed
                continue
        return vectors

    processed_df['embeddings'] = processed_df['tokenized_text'].apply(_embed_tokens)

    return processed_df

# Both CSV files are read with the same parser settings, so keep them in
# one place and unpack them into each call.
_csv_read_options = {"encoding": "utf-8", "engine": "pyarrow"}

# Original (cleaned) comments.
cleaned_comments_dataset = pd.read_csv(
    "../dataset/cleaned_comments_dataset.csv", **_csv_read_options
)

# Augmented comments only.
only_augmented_comments_dataset = pd.read_csv(
    "../dataset/only_augmented_comments_dataset.csv", **_csv_read_options
)

In [3]:
# Mapping from the textual class label to its integer code.
label_encoding_dict = {
    "not bully": 0,
    "religious": 1,
    "troll": 2,
    "sexual": 3,
    "threat": 4,
}

# Integer codes that count as "already encoded".
_encoded_label_values = set(label_encoding_dict.values())


def _encode_label(value):
    """
    Translate one label to its integer code.

    Values that are already one of the integer codes are returned unchanged,
    which makes this cell safe to re-run: mapping the column through the
    dictionary a second time would otherwise turn every label into NaN.
    Labels that are neither a known name nor a known code become NaN, exactly
    as a plain dictionary ``.map`` would produce.
    """
    if value in _encoded_label_values:
        return value
    return label_encoding_dict.get(value, np.nan)


cleaned_comments_dataset["label"] = cleaned_comments_dataset["label"].map(
    _encode_label
)

only_augmented_comments_dataset["label"] = only_augmented_comments_dataset["label"].map(
    _encode_label
)

In [4]:
# Preview the original comments dataset after label encoding.
cleaned_comments_dataset

Unnamed: 0,comment,label
0,হালার পুত মদ খাওয়ার সময় রাতের বেলা মদ খাই দি...,3
1,ঘরে শুট কেমন লেগেছে ক্যামেরাতে,0
2,অরে বাবা টা পাগল,0
3,ক্যাপ্টেন অফ বাংলাদেশ,0
4,পটকা মাছ,2
...,...,...
43562,হিরো আলম এগিয়ে যাও,0
43563,হিরো আলমকে সাপোর্ট অসংখ্য ধন্যবাদ আপনাকে,0
43564,হিরো ভাই এগিয়ে য়াও,0
43565,হুম ভাও তোমরা এগিয়ে যাও তোমাদের পিছনে আছি,0


In [5]:
# Preview the augmented comments dataset after label encoding.
only_augmented_comments_dataset

Unnamed: 0,text,label
0,খানকিরা জানে খানকি বিত্তি কিভাবে আল্লাহতালা নর...,1
1,তুই বড়যাত্রা এসেছ জুতা দেখেশিসতাহলে তর মাবাপের...,4
2,এক সাক্ষাৎকারে মেয়েটি বলেছে বিশ্বাস না। ইসলাম...,1
3,নাস্তিকের প্রার্থনা,1
4,আজকাল লম্পট অশিক্ষিত ব্যক্তি মুখ বের আনে শীঘ্র...,1
...,...,...
6076,দেশ রুপার বিতাড়িত,4
6077,জুতাটা কপালে মেরেছি।,4
6078,সেফু নামক এক ইসলাম বিদ্বেষীর কুরআন রাসুল সঃ অপ...,4
6079,জুতা মেরে গরু দান পুকি মেরে দিলা।,4


### Importing the Tokenizer, Embedding Model, and Other Required Libraries


In [6]:
from bnlp import SentencepieceTokenizer
from bnlp.embedding.fasttext import BengaliFasttext
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [7]:
# Tokenizer built with its default arguments; reused for both datasets below.
tokenizer = SentencepieceTokenizer()

In [8]:
# FastText embedding model built with its default arguments; get_embeddings
# reads this notebook-level variable.
fast_text_embedding = BengaliFasttext()

In [9]:
# Split every original comment into a list of tokens, one list per row.
tokenized_original_comments = cleaned_comments_dataset["comment"].apply(
    tokenizer.tokenize
)

tokenized_original_comments

0        [▁হাল, ার, ▁, পুত, ▁মদ, ▁খাওয়ার, ▁সময়, ▁রাতে...
1              [▁ঘরে, ▁শুট, ▁কেমন, ▁লেগেছে, ▁ক্যামেরা, তে]
2                              [▁অর, ে, ▁বাবা, ▁টা, ▁পাগল]
3                             [▁ক্যাপ্টেন, ▁অফ, ▁বাংলাদেশ]
4                                          [▁পট, কা, ▁মাছ]
                               ...                        
43562                       [▁হিরো, ▁আলম, ▁এগিয়ে, ▁যা, ও]
43563    [▁হিরো, ▁আলম, কে, ▁সাপোর্ট, ▁অসংখ্য, ▁ধন্যবাদ,...
43564                    [▁হিরো, ▁ভাই, ▁এগিয়ে, ▁, য়া, ও]
43565    [▁হুম, ▁ভা, ও, ▁তোমরা, ▁এগিয়ে, ▁যা, ও, ▁তোমাদ...
43566                   [▁হ্যালো, ▁তোমাদের, ▁সাথে, ▁চ্যাট]
Name: comment, Length: 43567, dtype: object

In [10]:
# Split every augmented comment into a list of tokens, one list per row.
tokenized_augmented_comments = only_augmented_comments_dataset["text"].apply(
    tokenizer.tokenize
)

tokenized_augmented_comments

0       [▁খান, কি, রা, ▁জানে, ▁খান, কি, ▁বি, ত্তি, ▁কি...
1       [▁তুই, ▁বড়, যাত্রা, ▁এসে, ছ, ▁জুতা, ▁দেখে, শি...
2       [▁এক, ▁সাক্ষাৎকারে, ▁মেয়েটি, ▁বলেছে, ▁বিশ্বাস...
3                              [▁নাস্তিক, ের, ▁প্রার্থনা]
4       [▁আজকাল, ▁লম্পট, ▁অশিক্ষিত, ▁ব্যক্তি, ▁মুখ, ▁ব...
                              ...                        
6076                            [▁দেশ, ▁রুপার, ▁বিতাড়িত]
6077                    [▁জুতা, টা, ▁কপালে, ▁মেরে, ছি, ।]
6078    [▁সে, ফু, ▁নামক, ▁এক, ▁ইসলাম, ▁বিদ্বেষ, ীর, ▁ক...
6079    [▁জুতা, ▁মেরে, ▁গরু, ▁দান, ▁পু, কি, ▁মেরে, ▁দি...
6080                       [▁মারু, ন, ▁রে, ▁জাহেদ, ▁জুতা]
Name: text, Length: 6081, dtype: object

In [None]:
def get_embeddings(tokens, embedding_model=None):
    """
    Look up an embedding vector for every token in a token list.

    Parameters:
    -----------
    tokens : iterable of str
        Tokens of a single comment.
    embedding_model : object with a ``get_word_vector(token)`` method, optional
        Model used for the lookup. When omitted, the notebook-level
        ``fast_text_embedding`` is used, so existing one-argument calls
        (including ``Series.apply(get_embeddings)``) behave as before.

    Returns:
    --------
    list
        One vector per token, in token order. Tokens whose lookup raises an
        error are skipped, so the result may be shorter than ``tokens``.
    """
    if embedding_model is None:
        # Resolved at call time; requires the model cell to have been run
        embedding_model = fast_text_embedding

    # Collect one embedding per token
    embeddings = []

    for token in tokens:
        try:
            embeddings.append(embedding_model.get_word_vector(token))
        except Exception:
            # Best-effort: skip tokens the model cannot embed. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate so a long run can still be interrupted.
            continue

    return embeddings

In [35]:
# Apply the function to get embeddings for augmented comments
# (each row becomes the list of per-token vectors returned by get_embeddings)
embedding_set_of_augmented_comments = tokenized_augmented_comments.apply(get_embeddings)

# Sanity check: the Series keeps one entry per comment
print(embedding_set_of_augmented_comments.shape)

(6081,)


In [36]:
# Apply the function to get embeddings for the original comments
# (each row becomes the list of per-token vectors returned by get_embeddings)
embedding_set_of_original_comments = tokenized_original_comments.apply(get_embeddings)

# Sanity check: the Series keeps one entry per comment
print(embedding_set_of_original_comments.shape)

(43567,)


# SVM Run on Original Dataset

### Train Test Split

In [38]:
# Hold out 20% of the original comments for testing; random_state fixes the
# shuffle so the split is the same on every run. The features at this point
# are still per-comment lists of token vectors, not fixed-length arrays.
train_X, test_X, train_y, test_y = train_test_split(
    embedding_set_of_original_comments,
    cleaned_comments_dataset["label"],
    test_size=0.2,
    random_state=42,
)

In [40]:
# Linear-kernel support vector classifier; every other SVC setting is left
# at its default.
svm_model = SVC(kernel="linear")

In [45]:
# Convert list of token embeddings to a single fixed-length vector by taking the mean
def get_mean_embedding(embeddings_list, embedding_dim=100):
    """
    Collapse the token vectors of one comment into a single vector.

    Parameters:
    -----------
    embeddings_list : sequence of array-like, numpy array, or None
        Token vectors of one comment, all of the same length.
    embedding_dim : int, optional
        Length of the zero vector returned when there are no token vectors.
        Defaults to 100, the value previously hard-coded here; it must match
        the dimension of the embedding model in use, otherwise stacking the
        results into a 2-D feature matrix will fail.

    Returns:
    --------
    numpy.ndarray
        Element-wise mean of the token vectors, or ``np.zeros(embedding_dim)``
        when ``embeddings_list`` is ``None`` or empty.
    """
    # len() rather than truthiness: `not array` raises for multi-element
    # NumPy arrays, while len() works for lists and arrays alike.
    if embeddings_list is None or len(embeddings_list) == 0:
        return np.zeros(embedding_dim)
    return np.mean(embeddings_list, axis=0)

In [46]:
# Reduce every comment to one mean vector and stack them into 2-D arrays.
train_X_vectors = np.array(list(map(get_mean_embedding, train_X)))
test_X_vectors = np.array(list(map(get_mean_embedding, test_X)))

# Train the classifier on the stacked training vectors
svm_model.fit(train_X_vectors, train_y)

In [47]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns

import matplotlib.pyplot as plt

def evaluate_model_performance(model, X_test, y_test, class_names=None):
    """
    Evaluates and prints various performance metrics for a classification model

    Accuracy plus weighted precision, recall and F1 are printed, and the
    confusion matrix is drawn as a heatmap.

    Parameters:
    -----------
    model : trained classifier model with predict method
        The trained model to evaluate
    X_test : array-like
        Test features
    y_test : array-like
        True labels for test data
    class_names : list, optional
        Tick labels for the confusion matrix, one per label in sorted label
        order. When omitted, ``"Class <label>"`` is generated for every label
        that occurs in either ``y_test`` or the predictions.

    Returns:
    --------
    dict
        Dictionary with keys ``accuracy``, ``precision``, ``recall``, ``f1``
        and ``confusion_matrix``.
    """
    # Make predictions
    y_pred = model.predict(X_test)

    # The confusion matrix has one row/column per label seen in y_test OR
    # y_pred. Compute that sorted union explicitly so the default tick labels
    # always match the matrix, even if the model predicts a class that is
    # absent from y_test.
    labels = np.unique(
        np.concatenate([np.asarray(y_test).ravel(), np.asarray(y_pred).ravel()])
    )

    # Calculate metrics (weighted average accounts for class imbalance;
    # zero_division=0 scores classes with no predictions as 0 without warning)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # Create confusion matrix
    cm = confusion_matrix(y_test, y_pred, labels=labels)

    # Print metrics
    print("Model Performance Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Plot confusion matrix
    plt.figure(figsize=(10, 8))
    if class_names is None:
        # Name ticks after the actual label values, not their positions
        class_names = [f"Class {label}" for label in labels]

    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title('Confusion Matrix')
    plt.show()

    # Return metrics as dictionary
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm
    }

# Example usage:
# class_names = ["not bully", "religious", "troll", "sexual", "threat"]
# metrics = evaluate_model_performance(svm_model, test_X_vectors, test_y, class_names)

In [None]:
# Score the linear SVM on the held-out vectors. class_names is omitted, so
# the confusion-matrix ticks use the function's default labels.
evaluate_model_performance(svm_model, test_X_vectors, test_y)