Language-agnostic BERT Sentence Embedding (LaBSE)

In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import torch.optim as optim

In [2]:
# ——— Project-specific hyperparameters ———
MODEL_NAME    = "sentence-transformers/LaBSE"
MAX_LEN       = 128
BATCH_SIZE    = 32
LEARNING_RATE = 3e-5
WEIGHT_DECAY  = 0.01
EPOCHS        = 10
WARMUP_RATIO  = 0.1
LABEL_MAP     = {"Negative": 0, "Neutral": 1, "Positive": 2}  # MODIFIED

# 1. Load & preprocess data
df = pd.read_excel('/home/santhosh/Data Scraping/scource code/TSAC - Telugu Sentiment Analysis Corpus.xlsx')[['Translated_English', 'Label']]  # MODIFIED ("Lable" → "Label")
df = df.dropna().reset_index(drop=True)
# Keep only Positive, Negative, Neutral
df = df[df["Label"].isin(LABEL_MAP)].reset_index(drop=True)  # MODIFIED
df["Label"] = df["Label"].map(LABEL_MAP)

# 2. Train/test split (stratify to keep balance)
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["Label"],
    random_state=42
)

# 3. Dataset
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts     = texts
        self.labels    = labels
        self.tokenizer = tokenizer
        self.max_len   = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text  = str(self.texts[idx])
        label = int(self.labels[idx])
        enc   = self.tokenizer(
            text,
            padding     = "max_length",
            truncation  = True,
            max_length  = self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids":      enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels":         torch.tensor(label, dtype=torch.long)
        }

# 4. Tokenizer, Dataloaders
tokenizer    = AutoTokenizer.from_pretrained(MODEL_NAME)
train_ds     = SentimentDataset(
    texts=train_df["Translated_English"].tolist(),
    labels=train_df["Label"].tolist(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)
test_ds      = SentimentDataset(
    texts=test_df["Translated_English"].tolist(),
    labels=test_df["Label"].tolist(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  num_workers=2)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# 5. Model + classification head
base_model = AutoModel.from_pretrained(MODEL_NAME)
class Classifier(nn.Module):
    def __init__(self, base, hidden_size, n_classes):
        super().__init__()
        self.base   = base
        self.drop   = nn.Dropout(0.3)
        self.linear = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        out = self.base(input_ids=input_ids, attention_mask=attention_mask)
        pooled = out.pooler_output
        dropped = self.drop(pooled)
        return self.linear(dropped)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model  = Classifier(base_model, base_model.config.hidden_size, n_classes=3).to(device)  # MODIFIED

# 6. Optimizer, scheduler, loss
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
total_steps  = len(train_loader) * EPOCHS
warmup_steps = int(WARMUP_RATIO * total_steps)
scheduler = optim.lr_scheduler.LinearLR(
    optimizer, start_factor=0.1, total_iters=warmup_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

# 7. Training loop
for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0
    correct    = 0

    for batch in train_loader:
        optimizer.zero_grad()
        ids    = batch["input_ids"].to(device)
        mask   = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(ids, mask)
        loss   = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()

    avg_loss = total_loss / len(train_loader)
    acc      = correct / len(train_ds)
    print(f"Epoch {epoch}/{EPOCHS} — loss: {avg_loss:.4f}, acc: {acc:.4f}")

# 8. Evaluation
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        ids    = batch["input_ids"].to(device)
        mask   = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(ids, mask)
        preds  = logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

print("\nTest Accuracy:", accuracy_score(all_labels, all_preds))
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=["Negative", "Neutral", "Positive"]))  # MODIFIED

cm = confusion_matrix(all_labels, all_preds, labels=[0, 1, 2])  # MODIFIED
print("\nConfusion Matrix:\n", cm)


2025-06-11 10:04:45.153688: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-11 10:04:45.165411: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749616485.180106  705022 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749616485.184346  705022 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749616485.194894  705022 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Epoch 1/10 — loss: 0.9844, acc: 0.4820
Epoch 2/10 — loss: 0.5868, acc: 0.7638
Epoch 3/10 — loss: 0.3672, acc: 0.8707
Epoch 4/10 — loss: 0.1878, acc: 0.9466
Epoch 5/10 — loss: 0.0771, acc: 0.9786
Epoch 6/10 — loss: 0.0302, acc: 0.9942
Epoch 7/10 — loss: 0.0189, acc: 0.9971
Epoch 8/10 — loss: 0.0406, acc: 0.9883
Epoch 9/10 — loss: 0.0210, acc: 0.9942
Epoch 10/10 — loss: 0.0047, acc: 1.0000

Test Accuracy: 0.7984496124031008

Classification Report:
              precision    recall  f1-score   support

    Negative       0.73      0.81      0.77        85
     Neutral       0.83      0.80      0.81        85
    Positive       0.85      0.78      0.82        88

    accuracy                           0.80       258
   macro avg       0.80      0.80      0.80       258
weighted avg       0.80      0.80      0.80       258


Confusion Matrix:
 [[69  9  7]
 [12 68  5]
 [14  5 69]]


### telugu

In [5]:
# ——— Project-specific hyperparameters ———
MODEL_NAME    = "sentence-transformers/LaBSE"
MAX_LEN       = 128
BATCH_SIZE    = 32
LEARNING_RATE = 3e-5
WEIGHT_DECAY  = 0.01
EPOCHS        = 10
WARMUP_RATIO  = 0.1
LABEL_MAP     = {"Negative": 0, "Neutral": 1, "Positive": 2}  # MODIFIED

# 1. Load & preprocess data
df = pd.read_excel('/home/santhosh/Data Scraping/scource code/TSA codes/TSAC - Telugu Sentiment Analysis Corpus.xlsx')[['Statement', 'Label']]  # MODIFIED ("Lable" → "Label")
df = df.dropna().reset_index(drop=True)
# Keep only Positive, Negative, Neutral
df = df[df["Label"].isin(LABEL_MAP)].reset_index(drop=True)  # MODIFIED
df["Label"] = df["Label"].map(LABEL_MAP)

# 2. Train/test split (stratify to keep balance)
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["Label"],
    random_state=42
)

# 3. Dataset
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts     = texts
        self.labels    = labels
        self.tokenizer = tokenizer
        self.max_len   = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text  = str(self.texts[idx])
        label = int(self.labels[idx])
        enc   = self.tokenizer(
            text,
            padding     = "max_length",
            truncation  = True,
            max_length  = self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids":      enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels":         torch.tensor(label, dtype=torch.long)
        }

# 4. Tokenizer, Dataloaders
tokenizer    = AutoTokenizer.from_pretrained(MODEL_NAME)
train_ds     = SentimentDataset(
    texts=train_df["Statement"].tolist(),
    labels=train_df["Label"].tolist(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)
test_ds      = SentimentDataset(
    texts=test_df["Statement"].tolist(),
    labels=test_df["Label"].tolist(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  num_workers=2)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# 5. Model + classification head
base_model = AutoModel.from_pretrained(MODEL_NAME)
class Classifier(nn.Module):
    def __init__(self, base, hidden_size, n_classes):
        super().__init__()
        self.base   = base
        self.drop   = nn.Dropout(0.3)
        self.linear = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        out = self.base(input_ids=input_ids, attention_mask=attention_mask)
        pooled = out.pooler_output
        dropped = self.drop(pooled)
        return self.linear(dropped)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model  = Classifier(base_model, base_model.config.hidden_size, n_classes=3).to(device)  # MODIFIED

# 6. Optimizer, scheduler, loss
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
total_steps  = len(train_loader) * EPOCHS
warmup_steps = int(WARMUP_RATIO * total_steps)
scheduler = optim.lr_scheduler.LinearLR(
    optimizer, start_factor=0.1, total_iters=warmup_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

# 7. Training loop
for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0
    correct    = 0

    for batch in train_loader:
        optimizer.zero_grad()
        ids    = batch["input_ids"].to(device)
        mask   = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(ids, mask)
        loss   = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()

    avg_loss = total_loss / len(train_loader)
    acc      = correct / len(train_ds)
    print(f"Epoch {epoch}/{EPOCHS} — loss: {avg_loss:.4f}, acc: {acc:.4f}")

# 8. Evaluation
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        ids    = batch["input_ids"].to(device)
        mask   = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(ids, mask)
        preds  = logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

print("\nTest Accuracy:", accuracy_score(all_labels, all_preds))
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=["Negative", "Neutral", "Positive"]))  # MODIFIED

cm = confusion_matrix(all_labels, all_preds, labels=[0, 1, 2])  # MODIFIED
print("\nConfusion Matrix:\n", cm)


2025-06-11 12:03:15.769249: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-11 12:03:15.782932: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749623595.800330  711116 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749623595.804679  711116 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749623595.815838  711116 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Epoch 1/10 — loss: 0.9852, acc: 0.4976
Epoch 2/10 — loss: 0.5805, acc: 0.7794
Epoch 3/10 — loss: 0.3367, acc: 0.8834
Epoch 4/10 — loss: 0.1884, acc: 0.9397
Epoch 5/10 — loss: 0.0936, acc: 0.9699
Epoch 6/10 — loss: 0.0317, acc: 0.9932
Epoch 7/10 — loss: 0.0318, acc: 0.9903
Epoch 8/10 — loss: 0.0082, acc: 1.0000
Epoch 9/10 — loss: 0.0032, acc: 1.0000
Epoch 10/10 — loss: 0.0023, acc: 1.0000

Test Accuracy: 0.7751937984496124

Classification Report:
              precision    recall  f1-score   support

    Negative       0.68      0.80      0.74        85
     Neutral       0.86      0.75      0.81        85
    Positive       0.81      0.77      0.79        88

    accuracy                           0.78       258
   macro avg       0.78      0.78      0.78       258
weighted avg       0.79      0.78      0.78       258


Confusion Matrix:
 [[68  6 11]
 [16 64  5]
 [16  4 68]]


### Traditional ML models

In [5]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import numpy as np

# LaBSE model
MODEL_NAME = "sentence-transformers/LaBSE"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load data
df = pd.read_excel("/home/santhosh/Data Scraping/scource code/TSA codes/TSAC - Telugu Sentiment Analysis Corpus.xlsx")[["Statement", "Label"]]
df = df[df["Label"].isin(["Positive", "Negative", "Neutral"])].dropna().reset_index(drop=True)

# Get embeddings
embeddings = []

with torch.no_grad():
    for text in tqdm(df["Statement"].tolist(), desc="Encoding with LaBSE"):
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
        input_ids = encoded["input_ids"].to(device)
        attention_mask = encoded["attention_mask"].to(device)
        output = model(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = output.pooler_output.squeeze(0).cpu().numpy()
        embeddings.append(cls_embedding)

# Convert to DataFrame
emb_df = pd.DataFrame(embeddings)
emb_df["Label"] = df["Label"]

# Save to CSV
emb_df.to_csv("/home/santhosh/Data Scraping/scource code/TSA codes/TL_telugu_labse_embeddings.csv", index=False)
print("✅ LaBSE embeddings saved.")


Encoding with LaBSE: 100%|██████████| 1287/1287 [00:05<00:00, 221.83it/s]


✅ LaBSE embeddings saved.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier


In [None]:

# Load saved embeddings
df = pd.read_csv("/home/santhosh/Data Scraping/scource code/TSA codes/TL_english_labse_embeddings.csv")

# Clean and prepare
df = df[df["Label"].isin(["Positive", "Negative", "Neutral"])].copy()
X = df.drop(columns=["Label"]).values
y_str = df["Label"].values
le = LabelEncoder()
y = le.fit_transform(y_str)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Define classifiers and param grids
classifiers = {
    "SVM": (SVC(), {"C": [0.1, 1], "kernel": ["linear", "rbf"]}),
    "RandomForest": (RandomForestClassifier(), {"n_estimators": [100, 200], "max_depth": [None, 10]}),
    "XGBoost": (XGBClassifier(use_label_encoder=False, eval_metric='logloss'), {"n_estimators": [100, 200], "max_depth": [3, 6]}),
    "LogisticRegression": (LogisticRegression(max_iter=1000), {"C": [0.1, 1]}),
    "NaiveBayes": (GaussianNB(), {}),
    "KNN": (KNeighborsClassifier(), {"n_neighbors": [3, 5]})
}

# Train & Evaluate
for name, (clf, param_grid) in classifiers.items():
    print(f"\n🔍 Classifier: {name}")
    if param_grid:
        grid = GridSearchCV(clf, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        print(f"Best Params: {grid.best_params_}")
    else:
        clf.fit(X_train, y_train)
        best_model = clf

    y_pred = best_model.predict(X_test)
    report = classification_report(y_test, y_pred, target_names=le.classes_)
    print(report)



🔍 Classifier: SVM


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Best Params: {'C': 1, 'kernel': 'rbf'}
              precision    recall  f1-score   support

    Negative       0.62      0.79      0.69        85
     Neutral       0.87      0.68      0.76        85
    Positive       0.80      0.75      0.77        88

    accuracy                           0.74       258
   macro avg       0.76      0.74      0.74       258
weighted avg       0.76      0.74      0.74       258


🔍 Classifier: RandomForest
Best Params: {'max_depth': None, 'n_estimators': 200}
              precision    recall  f1-score   support

    Negative       0.59      0.73      0.65        85
     Neutral       0.81      0.68      0.74        85
    Positive       0.73      0.67      0.70        88

    accuracy                           0.69       258
   macro avg       0.71      0.69      0.70       258
weighted avg       0.71      0.69      0.70       258


🔍 Classifier: XGBoost


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best Params: {'max_depth': 3, 'n_estimators': 100}
              precision    recall  f1-score   support

    Negative       0.62      0.68      0.65        85
     Neutral       0.78      0.68      0.73        85
    Positive       0.71      0.73      0.72        88

    accuracy                           0.70       258
   macro avg       0.70      0.70      0.70       258
weighted avg       0.70      0.70      0.70       258


🔍 Classifier: LogisticRegression
Best Params: {'C': 0.1}
              precision    recall  f1-score   support

    Negative       0.66      0.67      0.66        85
     Neutral       0.76      0.73      0.74        85
    Positive       0.75      0.76      0.76        88

    accuracy                           0.72       258
   macro avg       0.72      0.72      0.72       258
weighted avg       0.72      0.72      0.72       258


🔍 Classifier: NaiveBayes
              precision    recall  f1-score   support

    Negative       0.60      0.73      0.66     

In [6]:

# Load saved embeddings
df = pd.read_csv("/home/santhosh/Data Scraping/scource code/TSA codes/TL_telugu_labse_embeddings.csv")

# Clean and prepare
df = df[df["Label"].isin(["Positive", "Negative", "Neutral"])].copy()
X = df.drop(columns=["Label"]).values
y_str = df["Label"].values
le = LabelEncoder()
y = le.fit_transform(y_str)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Define classifiers and param grids
classifiers = {
    "SVM": (SVC(), {"C": [0.1, 1], "kernel": ["linear", "rbf"]}),
    "RandomForest": (RandomForestClassifier(), {"n_estimators": [100, 200], "max_depth": [None, 10]}),
    "XGBoost": (XGBClassifier(use_label_encoder=False, eval_metric='logloss'), {"n_estimators": [100, 200], "max_depth": [3, 6]}),
    "LogisticRegression": (LogisticRegression(max_iter=1000), {"C": [0.1, 1]}),
    "NaiveBayes": (GaussianNB(), {}),
    "KNN": (KNeighborsClassifier(), {"n_neighbors": [3, 5]})
}

# Train & Evaluate
for name, (clf, param_grid) in classifiers.items():
    print(f"\n🔍 Classifier: {name}")
    if param_grid:
        grid = GridSearchCV(clf, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        print(f"Best Params: {grid.best_params_}")
    else:
        clf.fit(X_train, y_train)
        best_model = clf

    y_pred = best_model.predict(X_test)
    report = classification_report(y_test, y_pred, target_names=le.classes_)
    print(report)



🔍 Classifier: SVM
Best Params: {'C': 1, 'kernel': 'rbf'}
              precision    recall  f1-score   support

    Negative       0.66      0.81      0.73        85
     Neutral       0.92      0.71      0.80        85
    Positive       0.78      0.78      0.78        88

    accuracy                           0.77       258
   macro avg       0.79      0.77      0.77       258
weighted avg       0.79      0.77      0.77       258


🔍 Classifier: RandomForest
Best Params: {'max_depth': 10, 'n_estimators': 200}
              precision    recall  f1-score   support

    Negative       0.59      0.68      0.63        85
     Neutral       0.89      0.74      0.81        85
    Positive       0.69      0.69      0.69        88

    accuracy                           0.71       258
   macro avg       0.72      0.71      0.71       258
weighted avg       0.72      0.71      0.71       258


🔍 Classifier: XGBoost


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best Params: {'max_depth': 3, 'n_estimators': 200}
              precision    recall  f1-score   support

    Negative       0.61      0.68      0.64        85
     Neutral       0.86      0.74      0.80        85
    Positive       0.69      0.70      0.70        88

    accuracy                           0.71       258
   macro avg       0.72      0.71      0.71       258
weighted avg       0.72      0.71      0.71       258


🔍 Classifier: LogisticRegression
Best Params: {'C': 0.1}
              precision    recall  f1-score   support

    Negative       0.61      0.71      0.65        85
     Neutral       0.82      0.73      0.77        85
    Positive       0.72      0.68      0.70        88

    accuracy                           0.71       258
   macro avg       0.71      0.71      0.71       258
weighted avg       0.72      0.71      0.71       258


🔍 Classifier: NaiveBayes
              precision    recall  f1-score   support

    Negative       0.58      0.66      0.62     