In [None]:
!pip install gcsfs

In [None]:
!pip install pandas openpyxl emoji transformers torch datasets scikit-learn fsspec==2025.3.0 gcsfs==2025.3.0

In [None]:
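#one
# Install required libraries (superseded: the pinned install cell above already
# covers these packages, so the command below is left commented out)
#!pip install pandas openpyxl emoji transformers torch datasets scikit-learn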

In [None]:
#two
!git clone https://github.com/SenticNet/stress-detection

In [None]:
#three
# Set WANDB_DISABLED in the process environment before any Trainer is built,
# so the training cells do not try to report to Weights & Biases.
import os
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
#BERT on Reddit_Title.xlsx
import pandas as pd
import re
import emoji
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score


# Read both Reddit workbooks from the cloned repository into DataFrames.
df_title, df_combi = (
    pd.read_excel(workbook)
    for workbook in (
        "/content/stress-detection/Reddit_Title.xlsx",
        "/content/stress-detection/Reddit_Combi.xlsx",
    )
)

# Preprocessing function
def preprocess_text(text):
    """Normalise one raw post string before tokenization.

    Steps, in order: emojis are replaced by their textual names, the text is
    lower-cased, URLs (``http...`` / ``www...``) are removed, every character
    that is not an ASCII letter or whitespace is dropped (digits, punctuation
    and underscores included), and leading/trailing whitespace is trimmed.

    Args:
        text: Raw cell value. Anything that is not a ``str`` (for example the
            float NaN pandas uses for an empty spreadsheet cell) is treated
            as missing.

    Returns:
        str: The cleaned text, or ``""`` for missing / non-string input so the
        resulting column is always string-typed and safe to tokenize.
    """
    if not isinstance(text, str):
        # The raw value used to be passed through unchanged, which left
        # NaN/None/numbers in the "text" column handed to the tokenizer.
        return ""
    text = emoji.demojize(text)  # Convert emojis to text
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Keep letters and whitespace only
    return text.strip()  # Trim leading/trailing whitespace

# Apply preprocessing: build a cleaned "text" column on each frame
df_title["text"] = df_title["title"].apply(preprocess_text)
df_combi["text"] = df_combi["Body_Title"].apply(preprocess_text)

# Rename labels (assuming dataset has "label" column where 1 = Stress, 0 = No Stress)
df_title = df_title.rename(columns={"label": "labels"})
df_combi = df_combi.rename(columns={"label": "labels"})

# Select PLM model: Choose "bert-base-uncased", "distilbert-base-uncased", or "roberta-base"
MODEL_NAME = "bert-base-uncased"

# Load tokenizer and model (sequence classifier with a 2-class head)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Tokenize dataset
def tokenize_function(examples):
    """Tokenize a batch passed in by ``Dataset.map(..., batched=True)``.

    Reads ``examples["text"]`` and returns the tokenizer output, truncated and
    padded to the tokenizer's maximum length.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Convert Pandas to Hugging Face Dataset (only the columns used for training)
dataset_title = Dataset.from_pandas(df_title[["text", "labels"]])
dataset_combi = Dataset.from_pandas(df_combi[["text", "labels"]])

# Tokenize data
dataset_title = dataset_title.map(tokenize_function, batched=True)
dataset_combi = dataset_combi.map(tokenize_function, batched=True)

# Split into train & test sets (80% train, 20% test).
# The seed is fixed so every run evaluates on the same held-out rows; without
# it the split, and therefore the reported metrics, changed on each run.
dataset_title = dataset_title.train_test_split(test_size=0.2, seed=42)
dataset_combi = dataset_combi.train_test_split(test_size=0.2, seed=42)

# Define training arguments: evaluate and checkpoint once per epoch
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define evaluation function
def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one
            row of class scores per example.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` computed on the arg-max class.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=1)  # highest-scoring class per row
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted),
    }

# Trainer setup: fine-tune on the title train split, score on its test split
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=dataset_title["test"],
    train_dataset=dataset_title["train"],
)

# Run fine-tuning
trainer.train()

# Score the held-out split and show the resulting metrics dict
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

In [None]:
#four
#RoBERT on Reddit_Title.xlsx
import pandas as pd
import re
import emoji
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score


# Read both Reddit workbooks from the cloned repository into DataFrames.
df_title, df_combi = (
    pd.read_excel(workbook)
    for workbook in (
        "/content/stress-detection/Reddit_Title.xlsx",
        "/content/stress-detection/Reddit_Combi.xlsx",
    )
)

# Preprocessing function
def preprocess_text(text):
    """Normalise one raw post string before tokenization.

    Steps, in order: emojis are replaced by their textual names, the text is
    lower-cased, URLs (``http...`` / ``www...``) are removed, every character
    that is not an ASCII letter or whitespace is dropped (digits, punctuation
    and underscores included), and leading/trailing whitespace is trimmed.

    Args:
        text: Raw cell value. Anything that is not a ``str`` (for example the
            float NaN pandas uses for an empty spreadsheet cell) is treated
            as missing.

    Returns:
        str: The cleaned text, or ``""`` for missing / non-string input so the
        resulting column is always string-typed and safe to tokenize.
    """
    if not isinstance(text, str):
        # The raw value used to be passed through unchanged, which left
        # NaN/None/numbers in the "text" column handed to the tokenizer.
        return ""
    text = emoji.demojize(text)  # Convert emojis to text
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Keep letters and whitespace only
    return text.strip()  # Trim leading/trailing whitespace

# Apply preprocessing: build a cleaned "text" column on each frame
df_title["text"] = df_title["title"].apply(preprocess_text)
df_combi["text"] = df_combi["Body_Title"].apply(preprocess_text)

# Rename labels (assuming dataset has "label" column where 1 = Stress, 0 = No Stress)
df_title = df_title.rename(columns={"label": "labels"})
df_combi = df_combi.rename(columns={"label": "labels"})

# Select PLM model: Choose "bert-base-uncased", "distilbert-base-uncased", or "roberta-base"
MODEL_NAME = "roberta-base"

# Load tokenizer and model (sequence classifier with a 2-class head)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Tokenize dataset
def tokenize_function(examples):
    """Tokenize a batch passed in by ``Dataset.map(..., batched=True)``.

    Reads ``examples["text"]`` and returns the tokenizer output, truncated and
    padded to the tokenizer's maximum length.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Convert Pandas to Hugging Face Dataset (only the columns used for training)
dataset_title = Dataset.from_pandas(df_title[["text", "labels"]])
dataset_combi = Dataset.from_pandas(df_combi[["text", "labels"]])

# Tokenize data
dataset_title = dataset_title.map(tokenize_function, batched=True)
dataset_combi = dataset_combi.map(tokenize_function, batched=True)

# Split into train & test sets (80% train, 20% test).
# The seed is fixed so every run evaluates on the same held-out rows; without
# it the split, and therefore the reported metrics, changed on each run.
dataset_title = dataset_title.train_test_split(test_size=0.2, seed=42)
dataset_combi = dataset_combi.train_test_split(test_size=0.2, seed=42)

# Define training arguments: evaluate and checkpoint once per epoch
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define evaluation function
def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one
            row of class scores per example.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` computed on the arg-max class.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=1)  # highest-scoring class per row
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted),
    }

# Trainer setup: fine-tune on the title train split, score on its test split
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=dataset_title["test"],
    train_dataset=dataset_title["train"],
)

# Run fine-tuning
trainer.train()

# Score the held-out split and show the resulting metrics dict
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Run the current trainer over the held-out title split
predictions = trainer.predict(dataset_title['test'])
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)  # predicted class per example

# Count (true label, predicted label) pairs
cm = confusion_matrix(y_true, y_pred)

# Draw the counts as an annotated heatmap; heatmap returns the Axes it drew
# on, so the labels and title are set directly on it.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False).set(
    xlabel='Predicted Label',
    ylabel='True Label',
    title='Confusion Matrix',
)
plt.show()


In [None]:
#five
# Persist the `model` and `tokenizer` objects currently in the kernel (i.e.
# those created by whichever training cell ran last) to the two .pkl paths
# that the Streamlit app source further down loads.
!pip install joblib
import joblib

# Save model and tokenizer
# NOTE(review): joblib.dump pickles the objects. A pickle is tied to the
# installed transformers/torch versions, and loading one executes arbitrary
# code, so only load these files from a trusted source. save_pretrained() /
# from_pretrained() is the usual portable alternative, but switching would
# also require changing the loader in the app.
MODEL_SAVE_PATH = "./roberta_stress_model.pkl"
TOKENIZER_SAVE_PATH = "./roberta_tokenizer.pkl"

joblib.dump(model, MODEL_SAVE_PATH)
joblib.dump(tokenizer, TOKENIZER_SAVE_PATH)

print("Model and tokenizer saved successfully!")

In [None]:
#six
!pip install streamlit==1.41.1 pyngrok

In [None]:
# Streamlit front-end for the saved stress classifier.
# The source is written to appr.py because the launch cell runs
# `streamlit run appr.py`; as a bare string literal it was never saved to disk.
from pathlib import Path

# Raw string so the regex backslashes reach the file unchanged.
APP_SOURCE = r'''"""Streamlit front-end: score a Reddit title with the saved stress classifier."""
import re

import emoji
import joblib
import streamlit as st
import torch

# set_page_config has to be the first Streamlit command the script runs.
st.set_page_config(page_title="Stress Detection", layout="centered")

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer (Ensure correct path in Google Drive or upload manually)
MODEL_PATH = "./roberta_stress_model.pkl"
TOKENIZER_PATH = "./roberta_tokenizer.pkl"


@st.cache_resource
def load_artifacts():
    """Unpickle the model and tokenizer once per server process, not on every rerun."""
    # joblib.load executes pickled code: only load files you produced yourself.
    model = joblib.load(MODEL_PATH).to(device)
    model.eval()  # inference mode (dropout off)
    tokenizer = joblib.load(TOKENIZER_PATH)
    return model, tokenizer


try:
    model, tokenizer = load_artifacts()
except FileNotFoundError:
    # Only the "file is missing" case is reported here; any other load error
    # propagates so its real cause is visible instead of being mislabelled.
    st.error("Model or tokenizer not found! Please upload them.")
    st.stop()


# Preprocessing function
def preprocess_text(text):
    """Clean the input text: demojize, lower-case, drop URLs and non-letters, trim."""
    text = emoji.demojize(text)
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return text.strip()


# Prediction function
def predict_stress(text):
    """Return the softmax probability of class 1 (stress) for the given text."""
    processed_text = preprocess_text(text)
    inputs = tokenizer(processed_text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    return probabilities[0][1].item()


# Streamlit UI
st.markdown("""
    <h1 style='text-align: center; color: #ff4b4b;'>🧠 Stress Detection from Reddit Titles</h1>
    <p style='text-align: center;'>Enter a Reddit post title to check if it indicates stress.</p>
    <hr>
""", unsafe_allow_html=True)

user_input = st.text_area("Enter Reddit Title:", "", help="Type a Reddit post title here...")

if st.button("Analyze", use_container_width=True):
    # Whitespace-only input counts as empty.
    if user_input.strip():
        with st.spinner("Analyzing..."):
            stress_score = predict_stress(user_input)
            label = "😨 Stressed" if stress_score > 0.5 else "😊 Not Stressed"

            st.subheader(f"Prediction: {label}")
            st.progress(stress_score)
            st.write(f"Confidence Score: {stress_score:.4f}")
    else:
        st.warning("⚠️ Please enter a Reddit title.")

st.markdown("<p style='text-align: center; color: gray;'>Powered by <b>RoBERTa</b> for stress detection.</p>", unsafe_allow_html=True)
'''

Path("appr.py").write_text(APP_SOURCE, encoding="utf-8")
print("Wrote appr.py")

In [None]:
#seven
# Register your ngrok credentials with the local ngrok configuration.
# Replace the [authtoken] placeholder with your own token before running, and
# remove it again before sharing or committing the notebook: it is a secret.
!ngrok config add-authtoken [authtoken]

In [None]:
#eight
from pyngrok import ngrok

# Define the new port for the Streamlit app
new_port = 8508

# Run the Streamlit app on the new port.
# The command is backgrounded (trailing &) and its stdout/stderr go to
# /content/logs.txt, so a failed start (for example a missing appr.py in the
# working directory) only shows up in that log file, not in this cell.
!streamlit run appr.py --server.port {new_port} &>/content/logs.txt &

# Ensure all previous tunnels are closed before opening a new one
ngrok.kill()

# Start a new ngrok tunnel on the new port and show the value returned by
# ngrok.connect for it
public_url = ngrok.connect(new_port, "http")
print(f"Public URL: {public_url}")

In [None]:
# Tear-down: stop ngrok so the tunnel opened by the launch cell is closed.
# Run this once you are done with the public URL.
ngrok.kill()

In [None]:
import torch
import re
import emoji

# Function to preprocess text (same as used during training)
def preprocess_text(text):
    """Clean one user-supplied string.

    Emojis become their textual names, the text is lower-cased, URLs are
    removed, everything except ASCII letters and whitespace is dropped, and
    the ends are trimmed.
    """
    cleaned = emoji.demojize(text).lower()
    # URLs first, then every remaining non-letter / non-whitespace character
    for pattern in (r"http\S+|www\S+", r"[^a-zA-Z\s]"):
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned.strip()

# Function to predict stress from user input
def predict_stress(text):
    """Classify one sentence with the `model` / `tokenizer` currently in the kernel.

    Args:
        text: Raw user input; it is cleaned with `preprocess_text` first.

    Returns:
        str: "Stress Detected 😟" when the arg-max class is 1, otherwise
        "No Stress Detected 😊".
    """
    text = preprocess_text(text)

    # Tokenize input text (one example, fixed 128-token window, PyTorch tensors)
    inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

    # Move model and inputs to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Inference mode: without this, dropout stays active whenever the model is
    # still in training mode (e.g. straight after trainer.train()), which
    # makes predictions vary between calls.
    model.eval()
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Get model prediction without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)

    prediction = torch.argmax(outputs.logits, dim=1).item()

    # Return result
    return "Stress Detected 😟" if prediction == 1 else "No Stress Detected 😊"

# Prompt repeatedly; every line except "exit" (any letter case) is classified
while True:
    user_input = input("Enter a sentence (or type 'exit' to quit): ")
    if user_input.lower() != 'exit':
        result = predict_stress(user_input)
        print(f"Prediction: {result}\n")
        continue
    print("Exiting stress detection system.")
    break


In [None]:
#RoBERT on Reddit_Combi.xlsx
import pandas as pd
import re
import emoji
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score


# Read both Reddit workbooks from the cloned repository into DataFrames.
df_title, df_combi = (
    pd.read_excel(workbook)
    for workbook in (
        "/content/stress-detection/Reddit_Title.xlsx",
        "/content/stress-detection/Reddit_Combi.xlsx",
    )
)

# Preprocessing function
def preprocess_text(text):
    """Normalise one raw post string before tokenization.

    Steps, in order: emojis are replaced by their textual names, the text is
    lower-cased, URLs (``http...`` / ``www...``) are removed, every character
    that is not an ASCII letter or whitespace is dropped (digits, punctuation
    and underscores included), and leading/trailing whitespace is trimmed.

    Args:
        text: Raw cell value. Anything that is not a ``str`` (for example the
            float NaN pandas uses for an empty spreadsheet cell) is treated
            as missing.

    Returns:
        str: The cleaned text, or ``""`` for missing / non-string input so the
        resulting column is always string-typed and safe to tokenize.
    """
    if not isinstance(text, str):
        # The raw value used to be passed through unchanged, which left
        # NaN/None/numbers in the "text" column handed to the tokenizer.
        return ""
    text = emoji.demojize(text)  # Convert emojis to text
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Keep letters and whitespace only
    return text.strip()  # Trim leading/trailing whitespace

# Apply preprocessing: build a cleaned "text" column on each frame
df_title["text"] = df_title["title"].apply(preprocess_text)
df_combi["text"] = df_combi["Body_Title"].apply(preprocess_text)

# Rename labels (assuming dataset has "label" column where 1 = Stress, 0 = No Stress)
df_title = df_title.rename(columns={"label": "labels"})
df_combi = df_combi.rename(columns={"label": "labels"})

# Select PLM model: Choose "bert-base-uncased", "distilbert-base-uncased", or "roberta-base"
MODEL_NAME = "roberta-base"

# Load tokenizer and model (sequence classifier with a 2-class head)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Tokenize dataset
def tokenize_function(examples):
    """Tokenize a batch passed in by ``Dataset.map(..., batched=True)``.

    Reads ``examples["text"]`` and returns the tokenizer output, truncated and
    padded to the tokenizer's maximum length.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Convert Pandas to Hugging Face Dataset (only the columns used for training)
dataset_title = Dataset.from_pandas(df_title[["text", "labels"]])
dataset_combi = Dataset.from_pandas(df_combi[["text", "labels"]])

# Tokenize data
dataset_title = dataset_title.map(tokenize_function, batched=True)
dataset_combi = dataset_combi.map(tokenize_function, batched=True)

# Split into train & test sets (80% train, 20% test).
# The seed is fixed so every run evaluates on the same held-out rows; without
# it the split, and therefore the reported metrics, changed on each run.
dataset_title = dataset_title.train_test_split(test_size=0.2, seed=42)
dataset_combi = dataset_combi.train_test_split(test_size=0.2, seed=42)

# Define training arguments: evaluate and checkpoint once per epoch
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define evaluation function
def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one
            row of class scores per example.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` computed on the arg-max class.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=1)  # highest-scoring class per row
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted),
    }

# Trainer setup: fine-tune on the combined (body + title) train split and
# score on its test split
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=dataset_combi["test"],
    train_dataset=dataset_combi["train"],
)

# Run fine-tuning
trainer.train()

# Score the held-out split and show the resulting metrics dict
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

In [None]:
#distilbert on Reddit_Title.xlsx
import pandas as pd
import re
import emoji
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score


# Read both Reddit workbooks from the cloned repository into DataFrames.
df_title, df_combi = (
    pd.read_excel(workbook)
    for workbook in (
        "/content/stress-detection/Reddit_Title.xlsx",
        "/content/stress-detection/Reddit_Combi.xlsx",
    )
)

# Preprocessing function
def preprocess_text(text):
    """Normalise one raw post string before tokenization.

    Steps, in order: emojis are replaced by their textual names, the text is
    lower-cased, URLs (``http...`` / ``www...``) are removed, every character
    that is not an ASCII letter or whitespace is dropped (digits, punctuation
    and underscores included), and leading/trailing whitespace is trimmed.

    Args:
        text: Raw cell value. Anything that is not a ``str`` (for example the
            float NaN pandas uses for an empty spreadsheet cell) is treated
            as missing.

    Returns:
        str: The cleaned text, or ``""`` for missing / non-string input so the
        resulting column is always string-typed and safe to tokenize.
    """
    if not isinstance(text, str):
        # The raw value used to be passed through unchanged, which left
        # NaN/None/numbers in the "text" column handed to the tokenizer.
        return ""
    text = emoji.demojize(text)  # Convert emojis to text
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Keep letters and whitespace only
    return text.strip()  # Trim leading/trailing whitespace

# Apply preprocessing: build a cleaned "text" column on each frame
df_title["text"] = df_title["title"].apply(preprocess_text)
df_combi["text"] = df_combi["Body_Title"].apply(preprocess_text)

# Rename labels (assuming dataset has "label" column where 1 = Stress, 0 = No Stress)
df_title = df_title.rename(columns={"label": "labels"})
df_combi = df_combi.rename(columns={"label": "labels"})

# Select PLM model: Choose "bert-base-uncased", "distilbert-base-uncased", or "roberta-base"
MODEL_NAME = "distilbert-base-uncased"

# Load tokenizer and model (sequence classifier with a 2-class head)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Tokenize dataset
def tokenize_function(examples):
    """Tokenize a batch passed in by ``Dataset.map(..., batched=True)``.

    Reads ``examples["text"]`` and returns the tokenizer output, truncated and
    padded to the tokenizer's maximum length.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Convert Pandas to Hugging Face Dataset (only the columns used for training)
dataset_title = Dataset.from_pandas(df_title[["text", "labels"]])
dataset_combi = Dataset.from_pandas(df_combi[["text", "labels"]])

# Tokenize data
dataset_title = dataset_title.map(tokenize_function, batched=True)
dataset_combi = dataset_combi.map(tokenize_function, batched=True)

# Split into train & test sets (80% train, 20% test).
# The seed is fixed so every run evaluates on the same held-out rows; without
# it the split, and therefore the reported metrics, changed on each run.
dataset_title = dataset_title.train_test_split(test_size=0.2, seed=42)
dataset_combi = dataset_combi.train_test_split(test_size=0.2, seed=42)

# Define training arguments: evaluate and checkpoint once per epoch
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define evaluation function
def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one
            row of class scores per example.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` computed on the arg-max class.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=1)  # highest-scoring class per row
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted),
    }

# Trainer setup: fine-tune on the title train split, score on its test split
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=dataset_title["test"],
    train_dataset=dataset_title["train"],
)

# Run fine-tuning
trainer.train()

# Score the held-out split and show the resulting metrics dict
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

In [None]:
#distilbert on Reddit_Combi.xlsx
import pandas as pd
import re
import emoji
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score


# Read both Reddit workbooks from the cloned repository into DataFrames.
df_title, df_combi = (
    pd.read_excel(workbook)
    for workbook in (
        "/content/stress-detection/Reddit_Title.xlsx",
        "/content/stress-detection/Reddit_Combi.xlsx",
    )
)

# Preprocessing function
def preprocess_text(text):
    """Normalise one raw post string before tokenization.

    Steps, in order: emojis are replaced by their textual names, the text is
    lower-cased, URLs (``http...`` / ``www...``) are removed, every character
    that is not an ASCII letter or whitespace is dropped (digits, punctuation
    and underscores included), and leading/trailing whitespace is trimmed.

    Args:
        text: Raw cell value. Anything that is not a ``str`` (for example the
            float NaN pandas uses for an empty spreadsheet cell) is treated
            as missing.

    Returns:
        str: The cleaned text, or ``""`` for missing / non-string input so the
        resulting column is always string-typed and safe to tokenize.
    """
    if not isinstance(text, str):
        # The raw value used to be passed through unchanged, which left
        # NaN/None/numbers in the "text" column handed to the tokenizer.
        return ""
    text = emoji.demojize(text)  # Convert emojis to text
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Keep letters and whitespace only
    return text.strip()  # Trim leading/trailing whitespace

# Apply preprocessing: build a cleaned "text" column on each frame
df_title["text"] = df_title["title"].apply(preprocess_text)
df_combi["text"] = df_combi["Body_Title"].apply(preprocess_text)

# Rename labels (assuming dataset has "label" column where 1 = Stress, 0 = No Stress)
df_title = df_title.rename(columns={"label": "labels"})
df_combi = df_combi.rename(columns={"label": "labels"})

# Select PLM model: Choose "bert-base-uncased", "distilbert-base-uncased", or "roberta-base"
MODEL_NAME = "distilbert-base-uncased"

# Load tokenizer and model (sequence classifier with a 2-class head)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Tokenize dataset
def tokenize_function(examples):
    """Tokenize a batch passed in by ``Dataset.map(..., batched=True)``.

    Reads ``examples["text"]`` and returns the tokenizer output, truncated and
    padded to the tokenizer's maximum length.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Convert Pandas to Hugging Face Dataset (only the columns used for training)
dataset_title = Dataset.from_pandas(df_title[["text", "labels"]])
dataset_combi = Dataset.from_pandas(df_combi[["text", "labels"]])

# Tokenize data
dataset_title = dataset_title.map(tokenize_function, batched=True)
dataset_combi = dataset_combi.map(tokenize_function, batched=True)

# Split into train & test sets (80% train, 20% test).
# The seed is fixed so every run evaluates on the same held-out rows; without
# it the split, and therefore the reported metrics, changed on each run.
dataset_title = dataset_title.train_test_split(test_size=0.2, seed=42)
dataset_combi = dataset_combi.train_test_split(test_size=0.2, seed=42)

# Define training arguments: evaluate and checkpoint once per epoch
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define evaluation function
def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one
            row of class scores per example.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` computed on the arg-max class.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=1)  # highest-scoring class per row
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted),
    }

# Trainer setup: fine-tune on the combined (body + title) train split and
# score on its test split
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=dataset_combi["test"],
    train_dataset=dataset_combi["train"],
)

# Run fine-tuning
trainer.train()

# Score the held-out split and show the resulting metrics dict
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

In [None]:
#EMOTION CLASSIFICATION

In [None]:
import pandas as pd
import re
import emoji
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score

# Read the combined (body + title) workbook from the cloned repository
df_combi = pd.read_excel(io="/content/stress-detection/Reddit_Combi.xlsx")

# Show the column index so the expected columns can be checked by eye
print("Dataset Columns:", df_combi.columns)

# Preprocessing function
def preprocess_text(text):
    """Normalise one raw post string before tokenization.

    Steps, in order: emojis are replaced by their textual names, the text is
    lower-cased, URLs (``http...`` / ``www...``) are removed, every character
    that is not an ASCII letter or whitespace is dropped (digits, punctuation
    and underscores included), and leading/trailing whitespace is trimmed.

    Args:
        text: Raw cell value. Anything that is not a ``str`` (for example the
            float NaN pandas uses for an empty spreadsheet cell) is treated
            as missing.

    Returns:
        str: The cleaned text, or ``""`` for missing / non-string input so the
        resulting column is always string-typed and safe to tokenize.
    """
    if not isinstance(text, str):
        # The raw value used to be passed through unchanged, which left
        # NaN/None/numbers in the "text" column handed to the tokenizer.
        return ""
    text = emoji.demojize(text)  # Convert emojis to text
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Keep letters and whitespace only
    return text.strip()  # Trim leading/trailing whitespace

# Apply preprocessing: build a cleaned "text" column
df_combi["text"] = df_combi["Body_Title"].apply(preprocess_text)

# Emotion label mapping (Modify as needed)
emotion_labels = {
    0: "joy",
    1: "sadness",
    2: "anger",
    3: "fear",
    4: "neutral"
}

# Check if "label" column exists
if "label" not in df_combi.columns:
    raise KeyError("The 'label' column is missing. Check your dataset structure.")

# Copy the raw "label" column into "labels" (no remapping is applied).
# NOTE(review): the stress-training cells treat this same column as a binary
# stress flag, so confirm it really holds the emotion ids listed above.
df_combi["labels"] = df_combi["label"]

# Drop rows with missing labels
df_combi = df_combi.dropna(subset=["labels"]).reset_index(drop=True)

# Convert labels to integers
df_combi["labels"] = df_combi["labels"].astype(int)

# Fail early, with a readable message, if a label falls outside the ids the
# classification head is sized for (num_labels=len(emotion_labels)).
unexpected_labels = sorted(set(df_combi["labels"].unique().tolist()) - set(emotion_labels))
if unexpected_labels:
    raise ValueError(
        f"Labels {unexpected_labels} are not keys of emotion_labels; "
        "extend the mapping or fix the data."
    )

# Select Pretrained Model
MODEL_NAME = "roberta-base"

# Load tokenizer and model (one output class per emotion_labels entry)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(emotion_labels))

# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch passed in by ``Dataset.map(..., batched=True)``.

    Reads ``examples["text"]`` and returns the tokenizer output, truncated and
    padded to 128 tokens.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

# Convert Pandas DataFrame to Hugging Face Dataset
dataset_combi = Dataset.from_pandas(df_combi[["text", "labels"]])

# Tokenize data
dataset_combi = dataset_combi.map(tokenize_function, batched=True)

# Remove the "text" column (but NOT "__index_level_0__" since it doesn't exist)
dataset_combi = dataset_combi.remove_columns(["text"])


# Split dataset into training (80%) and testing (20%).
# The seed is fixed so every run evaluates on the same held-out rows.
dataset_combi = dataset_combi.train_test_split(test_size=0.2, seed=42)

# Define training arguments.
# `eval_strategy` is the keyword every other training cell in this notebook
# uses; the older `evaluation_strategy` spelling is deprecated and is rejected
# by current transformers releases.
training_args = TrainingArguments(
    output_dir="./emotion_results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
)

# Define evaluation function
def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer (multi-class variant).

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one
            row of class scores per example.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` on the arg-max class, with F1
        computed using ``average="weighted"``.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=1)  # highest-scoring class per row
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted, average="weighted"),
    }

# Trainer setup: fine-tune on the combined train split, score on its test split
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=dataset_combi["test"],
    train_dataset=dataset_combi["train"],
)

# Run fine-tuning
trainer.train()

# Score the held-out split and show the resulting metrics dict
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Metrics, one value per entry of `models` (same order).
# NOTE(review): these numbers are hard-coded, presumably copied from earlier
# training runs; they are not recomputed from the trainers in this notebook.
accuracy = [0.964, 0.964, 0.972]
precision = [0.964, 0.965, 0.972]
recall = [0.963, 0.964, 0.969]
# Named f1_scores (not f1_score) so this list does not overwrite sklearn's
# f1_score function, which the compute_metrics callbacks call by that name.
f1_scores = [0.964, 0.964, 0.971]

# Model names
models = ["BERT", "DistilBERT", "RoBERTa"]
metrics = [accuracy, precision, recall, f1_scores]
titles = ["Accuracy", "Precision", "Recall", "F1 Score"]

# Bar width and color palette
bar_width = 0.5
colors = ["#4C72B0", "#55A868", "#C44E52"]

# Create figure: one stacked subplot per metric
fig, axs = plt.subplots(4, 1, figsize=(8, 16))
plt.suptitle("Performance Metrics of BERT, DistilBERT, and RoBERTa", fontsize=14, fontweight='bold')

# Function to create bar plots with labels
def create_bar_plot(ax, data, title, color):
    """Draw one metric as a seaborn bar chart with a value label above each bar.

    Args:
        ax: Matplotlib Axes to draw on.
        data: Metric values, one per entry of the module-level ``models`` list.
        title: Subplot title.
        color: Palette handed to ``sns.barplot``.
    """
    sns.barplot(x=models, y=data, hue=models, palette=color, dodge=False, ax=ax)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_ylim(0.95, 1.0)  # zoom in: all reported scores are above 0.95
    ax.grid(axis='y', linestyle="--", alpha=0.7)

    # Write each value slightly above the top of its bar
    for position, height in enumerate(data):
        ax.text(position, height + 0.002, f"{height:.3f}", ha='center', fontsize=10, fontweight='bold')

# One subplot per metric: the axes, value lists and titles line up by position
for ax, values, title in zip(axs, metrics, titles):
    ax.bar(models, values, color=['blue', 'green', 'red'])
    ax.set_ylim(0.95, 1.00)
    ax.set_title(title, fontsize=14, fontweight='bold')

    # Print each value just above its bar
    for j, value in enumerate(values):
        ax.text(j, value + 0.002, f"{value:.3f}", ha='center', fontsize=12, fontweight='bold')

# Leave the top 4% of the figure free for the suptitle, then render
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()
