In [None]:
# Install the notebook's base dependencies (no versions pinned on this line).
!pip install pandas numpy scikit-learn tensorflow nltk

In [None]:
import requests

url = "https://raw.githubusercontent.com/l3cube-pune/MarathiNLP/main/L3CubeMahaSent%20Dataset/tweets-train.csv"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces an HTTP error instead of previewing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Split into lines and print the first 5 exactly as served
lines = response.text.splitlines()
for i, line in enumerate(lines[:5]):
    print(f"Line {i}: {line}")


In [None]:
import pandas as pd

url = "https://raw.githubusercontent.com/l3cube-pune/MarathiNLP/main/L3CubeMahaSent%20Dataset/tweets-train.csv"
# The file's bytes are decoded as ISO-8859-1 here; the 'tweet' column is
# re-decoded as UTF-8 afterwards by try_decode.
# NOTE(review): the file is presumably UTF-8, in which case reading it with the
# default encoding would make the repair step unnecessary - confirm.
df = pd.read_csv(url, encoding='ISO-8859-1')  # or 'latin1'

print(df.head())


In [None]:
import codecs

def try_decode(text):
    """Undo Latin-1 mojibake by re-interpreting the text's bytes as UTF-8.

    Args:
        text: A cell value. Non-string values (e.g. NaN or None for a missing
            tweet) are returned unchanged.

    Returns:
        The repaired string, or ``text`` itself when it is not a string, when
        it contains characters outside Latin-1 (so it was not mis-decoded), or
        when its bytes are not valid UTF-8.
    """
    if not isinstance(text, str):
        return text
    try:
        return text.encode('latin1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Only encoding failures are expected here; anything else (including
        # KeyboardInterrupt, which a bare `except:` also caught) should surface.
        return text

# Repair the 'tweet' column in place, then preview the first rows to check the text.
df['tweet'] = df['tweet'].apply(try_decode)
print(df['tweet'].head())


In [None]:
# Quick sanity checks on the loaded frame before any preprocessing.
# Check the shape (rows, columns)
print("Dataset shape:", df.shape)

# Check for missing values (count of nulls per column)
print("Missing values:\n", df.isnull().sum())

# Check label distribution (number of rows per value of 'label')
print("Label distribution:\n", df['label'].value_counts())

In [None]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize

# Make sure both Punkt resources are available before word_tokenize is used.
# Each resource is looked up first and downloaded only when missing, so
# re-running the cell no longer re-fetches packages that are already installed
# (the previous force=True download ran unconditionally).
# NOTE(review): both names are presumably kept to cover older and newer NLTK
# releases - confirm which one the installed version actually loads.
missing_tokenizers = []
for resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{resource}')
    except LookupError:
        print(f"Tokenizer '{resource}' not found. Downloading...")
        # nltk.download reports failure through its return value.
        if not nltk.download(resource):
            missing_tokenizers.append(resource)

if missing_tokenizers:
    print(f"Could not download: {', '.join(missing_tokenizers)}")
else:
    print("Punkt and Punkt_tab tokenizers are available.")

# Load your dataset
url = "https://raw.githubusercontent.com/l3cube-pune/MarathiNLP/main/L3CubeMahaSent%20Dataset/tweets-train.csv"
df = pd.read_csv(url)

# Function to clean Marathi text
def clean_text(text):
    """Strip URLs and every non-Devanagari character from a tweet.

    Args:
        text: Raw tweet. Non-string values (e.g. NaN or None for a missing
            tweet) are treated as empty text instead of raising TypeError
            inside re.sub.

    Returns:
        str: Text containing only characters in U+0900-U+097F and whitespace,
        with leading and trailing whitespace removed.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\u0900-\u097F\s]', '', text)  # Keep Marathi characters
    return text.strip()

# Apply cleaning: adds 'cleaned_tweet' next to the raw 'tweet' column
df['cleaned_tweet'] = df['tweet'].apply(clean_text)

# Tokenize each cleaned tweet with NLTK's word_tokenize
df['tokens'] = df['cleaned_tweet'].apply(word_tokenize)

# Display a sample of raw text, cleaned text and tokens side by side
print(df[['tweet', 'cleaned_tweet', 'tokens']].head())

In [None]:
# Install the Indic NLP Library from its GitHub repository and fetch the
# Marathi morfessor model into ./indic_nlp_resources/morph/morfessor/mr.model.
!pip install git+https://github.com/anoopkunchukuttan/indic_nlp_library.git
!mkdir -p indic_nlp_resources/morph/morfessor
!wget -O indic_nlp_resources/morph/morfessor/mr.model https://raw.githubusercontent.com/anoopkunchukuttan/indic_nlp_resources/master/morph/morfessor/mr.model


In [None]:
# Install the required libraries
# NOTE(review): the library is installed here from PyPI and again below as an
# editable install of the clone - one of the two is presumably redundant.
!pip install indic-nlp-library

# Clone the indic_nlp_library repository (this contains the actual code)
!git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git
!cd indic_nlp_library && pip install -e .

# Create directory for resources and download them
# NOTE(review): wget is called without -O, and the final `mv` moves into a
# directory that already has content after a first run; re-running this cell
# may not be idempotent - confirm.
!mkdir -p indic_nlp_resources
!cd indic_nlp_resources && wget https://github.com/anoopkunchukuttan/indic_nlp_resources/archive/master.zip
!cd indic_nlp_resources && unzip -o master.zip  # -o flag to overwrite without prompting
!cd indic_nlp_resources && mv indic_nlp_resources-master/* .

# Add the library to Python path so it can be imported
import sys
import os

# Add the cloned repository directory to Python path
# (the clone is expected in the current working directory)
repo_path = os.path.join(os.getcwd(), "indic_nlp_library")
if repo_path not in sys.path:
    sys.path.insert(0, repo_path)

# Now try importing
try:
    from indicnlp import common
    from indicnlp.tokenize import indic_tokenize
    from indicnlp.morph import unsupervised_morph
    print("Successfully imported indicnlp modules!")
except ModuleNotFoundError as e:
    print(f"Import error: {e}")
    print("Trying alternative import approach...")

    # Check if the package is installed and list its location
    !pip show indic-nlp-library

    # List directories to confirm structure
    !ls -la
    !ls -la indic_nlp_library

    # Try installing directly from the repository again
    !pip install -e indic_nlp_library

    # Try importing again
    try:
        from indicnlp import common
        from indicnlp.tokenize import indic_tokenize
        from indicnlp.morph import unsupervised_morph
        print("Successfully imported indicnlp modules after reinstallation!")
    except Exception as e:
        print(f"Still having import issues: {e}")
        # Raises SystemExit(1) so the rest of this cell does not run without the imports.
        sys.exit(1)

# Set up the environment
RESOURCES_PATH = os.path.join(os.getcwd(), "indic_nlp_resources")
os.environ["INDIC_RESOURCES_PATH"] = RESOURCES_PATH
common.set_resources_path(RESOURCES_PATH)

# Ensure morfessor models directory exists
morph_dir = os.path.join(RESOURCES_PATH, "morph", "morfessor")
!mkdir -p {morph_dir}

# Download Marathi morfessor model if it doesn't exist
model_path = os.path.join(morph_dir, "mr.model")
if not os.path.exists(model_path):
    print("Downloading Marathi morfessor model...")
    !wget -O {model_path} https://raw.githubusercontent.com/anoopkunchukuttan/indic_nlp_resources/master/morph/morfessor/mr.model

# Verify the model file exists
!ls -la {model_path}

# Install morfessor if not already installed
!pip install morfessor

# Initialize the unsupervised morph analyzer for Marathi ('mr').
# On failure, print the model path, file status and resources path for
# debugging, then re-raise the original exception.
print("Initializing morphological analyzer...")
try:
    morph_analyzer = unsupervised_morph.UnsupervisedMorphAnalyzer('mr')
    print("Morphological analyzer initialized successfully!")
except Exception as e:
    print(f"Error initializing morphological analyzer: {e}")
    # Show debug info
    print(f"Model path: {model_path}")
    print(f"File exists: {os.path.exists(model_path)}")
    print(f"File size: {os.path.getsize(model_path) if os.path.exists(model_path) else 'N/A'}")
    print(f"INDIC_RESOURCES_PATH: {os.environ.get('INDIC_RESOURCES_PATH')}")
    raise

# Function to get root words
def get_root_words(text):
    """
    Extract root words from Marathi text using morphological analysis.

    Each token is segmented by the module-level ``morph_analyzer`` and its
    first morpheme is taken as the root (an approximation: the first segment
    is usually, not always, the stem).

    Args:
        text (str): Input Marathi text

    Returns:
        list: One entry per token - the first morpheme, or the token itself
        when the analyzer returns nothing or raises.
    """
    root_words = []
    for word in indic_tokenize.trivial_tokenize(text):
        try:
            analysis = morph_analyzer.morph_analyze(word)
            if analysis:
                first = analysis[0]
                # morph_analyze yields a flat list of morpheme strings, so the
                # previous analysis[0][0] kept only the first *character* of
                # the word. A nested result is still unwrapped as before.
                root = (first if isinstance(first, str) else first[0]) or word
            else:
                root = word
        except Exception as e:
            # Best effort: one bad token must not abort the whole text.
            print(f"Error analyzing word '{word}': {str(e)}")
            root = word
        root_words.append(root)
    return root_words


In [None]:
# Smoke test: tokenize one Marathi sentence and print the roots extracted from it.
example_text = "मी सकाळी लवकर उठलो आणि पुस्तक वाचले."
print("\nExample text:", example_text)
print("Tokenized:", indic_tokenize.trivial_tokenize(example_text))
print("Root words:", get_root_words(example_text))

print("\nSetup complete! You can now use the get_root_words function on your dataframe.")

In [None]:
# Add one list of root words per tweet; requires the 'cleaned_tweet' column to exist on df.
df['root_tokens'] = df['cleaned_tweet'].apply(get_root_words)

In [None]:
# Install fastText quietly (-q); run only if not already installed
!pip install fasttext -q

# Import necessary libraries
import fasttext
import os
import pandas as pd

# Verify that 'df' exists and has 'cleaned_tweet' column.
# Fails fast with a descriptive error when the earlier cleaning cell has not
# been run in this kernel session.
if 'df' not in globals():
    raise NameError("DataFrame 'df' is not defined in the current environment. Please ensure it is loaded.")
if 'cleaned_tweet' not in df.columns:
    raise KeyError("Column 'cleaned_tweet' not found in DataFrame 'df'. Please ensure it exists.")

# Function to prepare data for fastText training
def prepare_fasttext_data(df, output_file='marathi_corpus.txt'):
    """Write the cleaned tweets to a plain-text corpus, one tweet per line.

    Args:
        df: DataFrame with a 'cleaned_tweet' column.
        output_file: Path of the UTF-8 text file to create or overwrite.

    Returns:
        str: ``output_file``, so the result can be passed straight to the
        fastText trainer.
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        for text in df['cleaned_tweet']:
            if not isinstance(text, str):
                continue  # missing tweet (NaN/None): nothing to write
            # clean_text keeps whitespace such as embedded newlines; collapse
            # it so one tweet can never be split across several corpus lines.
            line = ' '.join(text.split())
            if line:  # tweets that cleaned down to nothing would only add blank lines
                f.write(line + '\n')
    return output_file

# Generate corpus file from the current df's 'cleaned_tweet' column
corpus_file = prepare_fasttext_data(df)

# Train fastText model on Marathi corpus (unsupervised skip-gram embeddings)
model_file = 'marathi_fasttext.bin'
model = fasttext.train_unsupervised(
    corpus_file,
    model='skipgram',
    dim=100,  # Vector dimension
    epoch=20,  # Number of epochs
    minCount=5,  # Minimum word frequency
    thread=4  # Number of threads for faster training
)

# Save the model to model_file in the working directory
model.save_model(model_file)

# Function to get word vector
def get_word_vector(word, model):
    """Return the embedding that ``model`` reports for ``word``.

    Args:
        word: Token to look up.
        model: Object exposing ``get_word_vector(word)``.

    Returns:
        Whatever ``model.get_word_vector`` returns for ``word``.
    """
    embedding = model.get_word_vector(word)
    return embedding

# Test the model with an example word: print the first 10 components and the vector length
example_word = "खूप"  # "Very" in Marathi
vector = get_word_vector(example_word, model)
print(f"Vector for '{example_word}': {vector[:10]}... (first 10 dimensions)")
print(f"Vector length: {len(vector)}")

In [None]:
# Replace whatever torch/transformers builds are installed with pinned versions.
# NOTE(review): packages already imported in this kernel are presumably not
# swapped until the runtime is restarted - confirm.
!pip uninstall -y torch torchvision torchaudio transformers
!pip install torch==2.3.0 transformers==4.41.2

In [None]:
# Reinstall numpy at a pinned version.
# NOTE(review): earlier cells import pandas, so numpy is presumably already
# loaded; a runtime restart is likely needed for this to take effect - confirm.
!pip uninstall -y numpy
!pip install numpy==1.26.4  # Compatible with transformers and torch

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd

# Use the GPU when one is visible to torch, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ✅ Load dataset (replaces any df built by earlier cells, including its derived columns)
df = pd.read_csv("https://raw.githubusercontent.com/l3cube-pune/MarathiNLP/main/L3CubeMahaSent%20Dataset/tweets-train.csv")

# ✅ Load MahaMarathi BERT model with a 3-class sequence-classification head
# NOTE(review): if this checkpoint is a base (not sentiment-fine-tuned) model,
# the classification head is freshly initialised and its predictions are not
# meaningful until fine-tuning - confirm.
model_name = "l3cube-pune/marathi-bert-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3).to(device)


# Batch-wise detailed sentiment analysis function
def detailed_sentiment_batch(texts, batch_size=10):
    """Classify texts in batches with the module-level tokenizer and model.

    Args:
        texts: Sequence of strings to classify.
        batch_size: Number of texts sent through the model at once.

    Returns:
        list[dict]: One dict per input text, in input order, with keys
        'sentiment' (predicted class index minus 1, i.e. -1/0/1),
        'confidence' (list of softmax probabilities per class),
        'intensity' (absolute value of 'sentiment') and
        'is_mixed' (True when the highest probability is below 0.6).
    """
    records = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]

        # Encode the chunk as padded/truncated tensors on the model's device
        encoded = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

        with torch.no_grad():
            model_output = sentiment_model(**encoded)

        class_probs = torch.nn.functional.softmax(model_output.logits, dim=-1)
        top_classes = torch.argmax(class_probs, dim=1)

        for top_class, prob_row in zip(top_classes, class_probs):
            class_index = top_class.item()
            prob_list = prob_row.tolist()
            shifted = class_index - 1  # Convert to -1 (neg), 0 (neutral), 1 (positive)
            records.append({
                'sentiment': shifted,
                'confidence': prob_list,
                'intensity': abs(shifted),
                'is_mixed': max(prob_list) < 0.6
            })

    return records


# ✅ Apply sentiment analysis to every raw tweet; stores one result dict per row
df['detailed_sentiment'] = detailed_sentiment_batch(df['tweet'].tolist(), batch_size=10)


In [None]:
# Show first 5 predictions next to their tweets
df[['tweet', 'detailed_sentiment']].head()


In [None]:
# Pull the scalar 'sentiment' value (-1/0/1) out of each result dict and show its distribution.
df['sentiment_label'] = df['detailed_sentiment'].apply(lambda x: x['sentiment'])
df['sentiment_label'].value_counts()


In [None]:
# List the columns currently present on df.
print(df.columns)


In [None]:
# Spot-check one hand-written tweet with whichever tokenizer/model are currently loaded.
test_tweet = "हा अनुभव खूप छान होता!"
result = detailed_sentiment_batch([test_tweet])  # Pass as list
print("Tweet:", test_tweet)
print("Result:", result[0])  # Since result is a list of dicts


In [None]:
# Distribution of the gold labels, for comparison with the predicted distribution.
print(df['label'].value_counts())


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Path where the model was saved in Google Drive
model_path = '/content/drive/MyDrive/marathi_sentiment_finetuned'

# Load the tokenizer and model.
# These rebind the global `tokenizer` and `sentiment_model`, so later calls to
# detailed_sentiment_batch use the fine-tuned checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sentiment_model.to(device)

print("Fine-tuned model loaded successfully!")

In [None]:
# Upgrade peft to the latest release (no version pinned on this line).
!pip install --upgrade peft

In [None]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score

# Define a compute_metrics function to calculate accuracy
def compute_metrics(eval_pred):
    """Return the accuracy of arg-max predictions for a Trainer evaluation.

    Args:
        eval_pred: Pair of (logits, labels).

    Returns:
        dict: ``{"accuracy": <fraction of correct predictions>}``.
    """
    raw_scores, gold_labels = eval_pred
    # Highest-scoring class per row (0, 1, 2)
    predicted_classes = raw_scores.argmax(axis=-1)
    return {"accuracy": accuracy_score(gold_labels, predicted_classes)}

# Assuming trainer is still in scope from your fine-tuning step
# If not, reinitialize it with the fine-tuned model
# NOTE(review): `train_dataset` and `val_dataset` are not defined in any visible
# cell; this cell presumably relies on a fine-tuning step run elsewhere.

# Define training arguments (minimal setup for evaluation)
training_args = TrainingArguments(
    output_dir='./marathi_sentiment_model',  # Replace with your desired output directory
    per_device_eval_batch_size=8,
    logging_dir='./logs',
)

trainer = Trainer(
    model=sentiment_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics  # Add the metrics function
)

# Evaluate the model on the validation set
eval_results = trainer.evaluate()

# Print the results ('eval_accuracy' is the 'accuracy' key from compute_metrics)
print("Evaluation Results:")
print(f"Validation Loss: {eval_results['eval_loss']:.4f}")
print(f"Accuracy: {eval_results['eval_accuracy'] * 100:.2f}%")

In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, classification_report
import re

# Load the fine-tuned model and tokenizer (rebinds the global tokenizer/sentiment_model)
model_path = '/content/drive/MyDrive/marathi_sentiment_finetuned'  # Adjusted path to Google Drive
tokenizer = AutoTokenizer.from_pretrained(model_path)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sentiment_model.to(device)

# Load the test dataset
# NOTE(review): this is the MahaSent-MD "All" test file, a different file from
# the tweets-train.csv used above - confirm it is the intended test split.
test_url = "https://raw.githubusercontent.com/l3cube-pune/MarathiNLP/main/L3Cube-MahaSent-MD/MahaSent_All/MahaSent_All_Test.csv"
test_df = pd.read_csv(test_url)

# Function to clean Marathi text (consistent with your training)
def clean_text(text):
    """Strip URLs and every non-Devanagari character from a text.

    Args:
        text: Raw text. Non-string values (e.g. NaN or None for a missing
            entry) are treated as empty text instead of raising TypeError
            inside re.sub.

    Returns:
        str: Text containing only characters in U+0900-U+097F and whitespace,
        with leading and trailing whitespace removed.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\u0900-\u097F\s]', '', text)  # Keep Marathi characters
    return text.strip()

# Check the column names in the test_df
print(test_df.columns)
# Assuming the tweet column is named "text" based on the file structure of MahaSent_All_Test.csv
# Apply cleaning to the correct column (e.g., 'text' instead of 'tweet')
test_df['cleaned_tweet'] = test_df['text'].apply(clean_text)

# Prepare test data as plain Python lists
test_texts = test_df['cleaned_tweet'].tolist()
test_labels = test_df['label'].tolist()


# Convert labels to [0, 1, 2] format (same as training)
# NOTE(review): assumes the CSV labels are -1/0/1 - confirm.
test_labels = [l + 1 for l in test_labels]

# Custom dataset class (from your notebook)
class SentimentDataset(Dataset):
    """Torch dataset that tokenizes all texts up front and serves tensors per item.

    Each item is a dict holding one tensor per tokenizer output field plus a
    'labels' tensor for the matching label.
    """

    def __init__(self, texts, labels, tokenizer):
        # Tokenize the whole corpus once (truncated to 512, padded by the tokenizer).
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Create test dataset (tokenizes every test text when constructed)
test_dataset = SentimentDataset(test_texts, test_labels, tokenizer)

# Define compute_metrics for accuracy
def compute_metrics(eval_pred):
    """Compute accuracy and a per-class classification report for the Trainer.

    Args:
        eval_pred: Pair of (logits, labels); labels are class ids 0, 1, 2.

    Returns:
        dict: 'accuracy' (float) and 'classification_report' (nested dict that
        includes the entries 'Negative (-1)', 'Neutral (0)' and 'Positive (1)').
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)  # Predicted class (0, 1, 2)
    accuracy = accuracy_score(labels, predictions)
    # Convert back to [-1, 0, 1] for detailed report
    predictions_adjusted = [p - 1 for p in predictions]
    labels_adjusted = [l - 1 for l in labels]
    report = classification_report(
        labels_adjusted,
        predictions_adjusted,
        # Pin the class list: otherwise sklearn infers the classes from the
        # data and raises ValueError when fewer than three of them appear,
        # because target_names always has three entries.
        labels=[-1, 0, 1],
        target_names=['Negative (-1)', 'Neutral (0)', 'Positive (1)'],
        output_dict=True,
        zero_division=0,  # a class that is never predicted scores 0 without a warning
    )
    return {
        "accuracy": accuracy,
        "classification_report": report
    }

# Training arguments (minimal setup for evaluation)
training_args = TrainingArguments(
    output_dir='./marathi_sentiment_model',  # Reuse from training
    per_device_eval_batch_size=8,
    logging_dir='./logs',
)

# Initialize trainer for evaluation (no train_dataset: it is only used to evaluate/predict)
trainer = Trainer(
    model=sentiment_model,
    args=training_args,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Evaluate on the test set
eval_results = trainer.evaluate()

# Print results; the keys read here are the compute_metrics keys prefixed with 'eval_'
print("Test Set Evaluation Results:")
print(f"Test Loss: {eval_results['eval_loss']:.4f}")
print(f"Accuracy: {eval_results['eval_accuracy'] * 100:.2f}%")
print("\nClassification Report:")
for label, metrics in eval_results['eval_classification_report'].items():
    # Skip the aggregate rows of the report; print only the three class rows.
    if label in ['Negative (-1)', 'Neutral (0)', 'Positive (1)']:
        print(f"{label}: Precision: {metrics['precision']:.2f}, Recall: {metrics['recall']:.2f}, F1-Score: {metrics['f1-score']:.2f}")

In [None]:
# Repeat the single-tweet spot check; detailed_sentiment_batch reads the global
# tokenizer/sentiment_model, so this uses whichever model was loaded last.
test_tweet = "हा अनुभव खूप छान होता!"
result = detailed_sentiment_batch([test_tweet])  # Pass as list
print("Tweet:", test_tweet)
print("Result:", result[0])  # Since result is a list of dicts


In [None]:
# Predict class ids (0/1/2) for the test set and collect the rows where the
# prediction differs from the gold label; both are shifted back to -1/0/1 for display.
predictions = trainer.predict(test_dataset).predictions.argmax(-1)
misclassified = [(text, pred - 1, true - 1) for text, pred, true in zip(test_texts, predictions, test_labels) if pred != true]
print("Misclassified Examples:")
for text, pred, true in misclassified[:5]:
    print(f"Tweet: {text}, Predicted: {pred}, True: {true}")

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, classification_report
import re

# Load the fine-tuned model and tokenizer (rebinds the global tokenizer/sentiment_model)
# NOTE(review): this cell repeats the Drive-based evaluation cell above; only
# model_path differs (local directory instead of Google Drive).
model_path = './marathi_sentiment_finetuned'  # Adjust if saved elsewhere (e.g., Google Drive)
tokenizer = AutoTokenizer.from_pretrained(model_path)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sentiment_model.to(device)

# Load the test dataset
test_url = "https://raw.githubusercontent.com/l3cube-pune/MarathiNLP/main/L3Cube-MahaSent-MD/MahaSent_All/MahaSent_All_Test.csv"
test_df = pd.read_csv(test_url)

# Function to clean Marathi text (consistent with your training)
def clean_text(text):
    """Strip URLs and every non-Devanagari character from a text.

    Args:
        text: Raw text. Non-string values (e.g. NaN or None for a missing
            entry) are treated as empty text instead of raising TypeError
            inside re.sub.

    Returns:
        str: Text containing only characters in U+0900-U+097F and whitespace,
        with leading and trailing whitespace removed.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\u0900-\u097F\s]', '', text)  # Keep Marathi characters
    return text.strip()

# Check the column names in the test_df
print(test_df.columns)
# Assuming the tweet column is named "text" based on the file structure of MahaSent_All_Test.csv
# Apply cleaning to the correct column (e.g., 'text' instead of 'tweet')
test_df['cleaned_tweet'] = test_df['text'].apply(clean_text)

# Prepare test data as plain Python lists
test_texts = test_df['cleaned_tweet'].tolist()
test_labels = test_df['label'].tolist()


# Convert labels to [0, 1, 2] format (same as training)
# NOTE(review): assumes the CSV labels are -1/0/1 - confirm.
test_labels = [l + 1 for l in test_labels]

# Custom dataset class (from your notebook)
class SentimentDataset(Dataset):
    """Torch dataset that tokenizes all texts up front and serves tensors per item.

    Each item is a dict holding one tensor per tokenizer output field plus a
    'labels' tensor for the matching label.
    """

    def __init__(self, texts, labels, tokenizer):
        # Tokenize the whole corpus once (truncated to 512, padded by the tokenizer).
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Create test dataset (tokenizes every test text when constructed)
test_dataset = SentimentDataset(test_texts, test_labels, tokenizer)

# Define compute_metrics for accuracy
def compute_metrics(eval_pred):
    """Compute accuracy and a per-class classification report for the Trainer.

    Args:
        eval_pred: Pair of (logits, labels); labels are class ids 0, 1, 2.

    Returns:
        dict: 'accuracy' (float) and 'classification_report' (nested dict that
        includes the entries 'Negative (-1)', 'Neutral (0)' and 'Positive (1)').
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)  # Predicted class (0, 1, 2)
    accuracy = accuracy_score(labels, predictions)
    # Convert back to [-1, 0, 1] for detailed report
    predictions_adjusted = [p - 1 for p in predictions]
    labels_adjusted = [l - 1 for l in labels]
    report = classification_report(
        labels_adjusted,
        predictions_adjusted,
        # Pin the class list: otherwise sklearn infers the classes from the
        # data and raises ValueError when fewer than three of them appear,
        # because target_names always has three entries.
        labels=[-1, 0, 1],
        target_names=['Negative (-1)', 'Neutral (0)', 'Positive (1)'],
        output_dict=True,
        zero_division=0,  # a class that is never predicted scores 0 without a warning
    )
    return {
        "accuracy": accuracy,
        "classification_report": report
    }

# Training arguments (minimal setup for evaluation)
training_args = TrainingArguments(
    output_dir='./marathi_sentiment_model',  # Reuse from training
    per_device_eval_batch_size=8,
    logging_dir='./logs',
)

# Initialize trainer for evaluation (no train_dataset: it is only used to evaluate/predict)
trainer = Trainer(
    model=sentiment_model,
    args=training_args,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Evaluate on the test set
eval_results = trainer.evaluate()

# Print results; the keys read here are the compute_metrics keys prefixed with 'eval_'
print("Test Set Evaluation Results:")
print(f"Test Loss: {eval_results['eval_loss']:.4f}")
print(f"Accuracy: {eval_results['eval_accuracy'] * 100:.2f}%")
print("\nClassification Report:")
for label, metrics in eval_results['eval_classification_report'].items():
    # Skip the aggregate rows of the report; print only the three class rows.
    if label in ['Negative (-1)', 'Neutral (0)', 'Positive (1)']:
        print(f"{label}: Precision: {metrics['precision']:.2f}, Recall: {metrics['recall']:.2f}, F1-Score: {metrics['f1-score']:.2f}")

In [None]:
# Hand-written examples, five per expected class (expected class noted per line).
test_tweets = [
    "हा अनुभव खूप छान होता!",  # Positive
    "माझा दिवस आज खूपच खराब गेला.",  # Negative
    "हे ठिकाण अगदी ठिकठाक आहे.",  # Neutral
    "तुमची सेवा खूपच चांगली आहे!",  # Positive
    "हा चित्रपट वेळ वाया घालवणारा आहे.",  # Negative
    "मी अजून निर्णय घेतलेला नाही.",  # Neutral
    "माझं मन खूप आनंदी आहे!",  # Positive
    "त्यांचं वागणं अजिबातच आवडलं नाही.",  # Negative
    "आजचं हवामान साधारण आहे.",  # Neutral
    "अशा अनुभवासाठी मी पुन्हा यायला तयार आहे!",  # Positive
    "हे सॉफ्टवेअर खूप स्लो आहे.",  # Negative
    "ते एक सामान्य उत्तर होतं.",  # Neutral
    "मी तुमच्या सेवेने खूप प्रभावित झालो.",  # Positive
    "खरंच वाईट अनुभव होता.",  # Negative
    "काही खास नाही, नेहमीसारखंच होतं."  # Neutral
]

# Classify all examples in one call; results come back in input order.
results = detailed_sentiment_batch(test_tweets)

for tweet, result in zip(test_tweets, results):
    print("Tweet:", tweet)
    print("Result:", result)
    print("-" * 40)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Compare predictions of different models
def plot_confusion_matrices(y_true, y_pred_fasttext, y_pred_llm, y_pred_ensemble):
    """Draw the confusion matrices of three models side by side.

    Args:
        y_true: Gold labels.
        y_pred_fasttext: Predictions shown in the 'FastText Model' panel.
        y_pred_llm: Predictions shown in the 'MahaMarathi LLM' panel.
        y_pred_ensemble: Predictions shown in the 'Ensemble Model' panel.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    # One (title, predictions) pair per panel, left to right.
    panels = (
        ('FastText Model', y_pred_fasttext),
        ('MahaMarathi LLM', y_pred_llm),
        ('Ensemble Model', y_pred_ensemble),
    )
    for axis, (panel_title, panel_pred) in zip(axes, panels):
        sns.heatmap(confusion_matrix(y_true, panel_pred), annot=True, fmt='d', ax=axis)
        axis.set_title(panel_title)
        axis.set_xlabel('Predicted')
        axis.set_ylabel('True')

    plt.tight_layout()
    plt.show()

# Analyze performance by sentiment category
def plot_performance_by_category(y_true=None, y_pred=None):
    """Plot precision, recall and F1 per sentiment class as grouped bars.

    Args:
        y_true: Gold labels in {-1, 0, 1}. Defaults to ``df['label']``.
        y_pred: Predicted labels in {-1, 0, 1}. Defaults to
            ``df['sentiment_label']``.
    """
    categories = ['Negative', 'Neutral', 'Positive']
    class_ids = [-1, 0, 1]  # same order as `categories`

    if y_true is None:
        y_true = df['label']
    if y_pred is None:
        # NOTE(review): assumed to be the intended prediction column - confirm.
        y_pred = df['sentiment_label']

    # The previous per-class call passed only y_true, which raises TypeError
    # because classification_report also requires the predictions. A single
    # multi-class report with pinned labels gives the same one-vs-rest scores
    # per class and cannot hit a missing 'True' key when a class is absent.
    report = classification_report(
        y_true,
        y_pred,
        labels=class_ids,
        target_names=categories,
        output_dict=True,
        zero_division=0,
    )
    metrics = {
        'Precision': [report[category]['precision'] for category in categories],
        'Recall': [report[category]['recall'] for category in categories],
        'F1-Score': [report[category]['f1-score'] for category in categories],
    }

    plt.figure(figsize=(10, 6))
    x = np.arange(len(categories))
    width = 0.25

    # Three bars per class, centred on the class tick.
    plt.bar(x - width, metrics['Precision'], width, label='Precision')
    plt.bar(x, metrics['Recall'], width, label='Recall')
    plt.bar(x + width, metrics['F1-Score'], width, label='F1-Score')

    plt.xlabel('Sentiment Category')
    plt.ylabel('Score')
    plt.title('Performance by Sentiment Category')
    plt.xticks(x, categories)
    plt.legend()
    plt.tight_layout()
    plt.show()

# Execute the visualization (reads the global df)
plot_performance_by_category()

In [None]:
# Replace 'ensemble_predictions' with the output of your ensemble model prediction
# This is a placeholder and needs to be filled with your actual ensemble model logic
# TODO: until then, 'final_prediction' is just a copy of 'sentiment_label'.
ensemble_predictions = df['sentiment_label'].tolist()
df['final_prediction'] = ensemble_predictions

In [None]:
# Remove the OpenCV wheels and install the pinned torch/transformers/numpy versions.
# NOTE(review): the reason for removing OpenCV is not visible in this notebook - confirm.
!pip uninstall -y opencv-python opencv-python-headless
!pip install torch==2.3.0 transformers==4.41.2 numpy==1.26.4

In [None]:
# Reinstall transformers and peft at pinned versions.
# NOTE(review): transformers has already been imported by earlier cells, so the
# import below presumably still resolves to the loaded copy until the runtime
# is restarted - confirm.
!pip uninstall -y transformers peft
!pip install transformers==4.41.2
!pip install peft==0.17.1

from transformers import Trainer
from sklearn.metrics import accuracy_score

# Define a compute_metrics function to calculate accuracy
def compute_metrics(eval_pred):
    """Return the accuracy of arg-max predictions for a Trainer evaluation.

    Args:
        eval_pred: Pair of (logits, labels).

    Returns:
        dict: ``{"accuracy": <fraction of correct predictions>}``.
    """
    raw_scores, gold_labels = eval_pred
    # Highest-scoring class per row (0, 1, 2)
    predicted_classes = raw_scores.argmax(axis=-1)
    return {"accuracy": accuracy_score(gold_labels, predicted_classes)}

# Assuming trainer is still in scope from your fine-tuning step
# If not, reinitialize it with the fine-tuned model
# NOTE(review): `train_dataset` and `val_dataset` are not defined in any visible
# cell, and `training_args` is whatever an earlier cell left in the kernel.
trainer = Trainer(
    model=sentiment_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics  # Add the metrics function
)

# Evaluate the model on the validation set
eval_results = trainer.evaluate()

# Print the results ('eval_accuracy' is the 'accuracy' key from compute_metrics)
print("Evaluation Results:")
print(f"Validation Loss: {eval_results['eval_loss']:.4f}")
print(f"Accuracy: {eval_results['eval_accuracy'] * 100:.2f}%")

In [None]:
# Report installed packages whose declared dependencies are unsatisfied.
!pip check

In [None]:
# Remove thinc and force-reinstall the pinned transformers and peft versions.
# NOTE(review): the reason for removing thinc is not visible in this notebook
# (presumably a dependency conflict reported by `pip check`) - confirm.
!pip uninstall -y thinc
!pip install --upgrade --force-reinstall transformers==4.41.2
!pip install --upgrade --force-reinstall peft==0.17.1