In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%capture
!pip install -q bitsandbytes
!pip install -q accelerate
!pip install -q peft
!pip install -q --upgrade transformers

In [None]:
import torch
import pandas as pd
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, PeftModel, get_peft_model
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split

In [None]:
# Location of the training split inside the Kaggle input mount.
file_path = "/kaggle/input/multi-lingual-sentiment-analysis/train.csv"

# `pd.read_csv` already returns a DataFrame, so no extra `pd.DataFrame(...)` wrap is needed.
df = pd.read_csv(file_path)

# Last expression of the cell: preview the first rows.
df.head()

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# `preserve_index=False` keeps the shuffled pandas index from being carried into
# the datasets as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

print(f"Training dataset: {train_dataset.shape}")
print(f"Validation dataset: {val_dataset.shape}")

In [None]:
# Preview the first rows of the training split.
train_df.head()

In [None]:
import matplotlib.pyplot as plt

def plot_sentiment(df, val_df):
    """Draw side-by-side bars of label counts for two data splits.

    Parameters:
    df (pandas.DataFrame): Frame whose 'label' counts are drawn as the 'Train' bars.
    val_df (pandas.DataFrame): Frame whose 'label' counts are drawn as the 'Validation' bars.

    Returns:
    None (shows the matplotlib figure)
    """
    train_counts = df['label'].value_counts()
    val_counts = val_df['label'].value_counts()

    # `value_counts` orders by frequency, so the two splits may list the labels in
    # different orders (or one split may lack a label entirely). Re-align both
    # series on one shared label order so each bar sits above its own tick label.
    labels = list(train_counts.index)
    labels += [label for label in val_counts.index if label not in train_counts.index]
    train_counts = train_counts.reindex(labels, fill_value=0)
    val_counts = val_counts.reindex(labels, fill_value=0)

    fig, ax = plt.subplots(figsize=(10,6))
    x = range(len(labels))
    width = 0.25
    ax.bar([i - width for i in x], train_counts.values, width, label='Train', alpha=0.8)
    ax.bar([i + width for i in x], val_counts.values, width, label='Validation', alpha=0.8)
    ax.set_ylabel('Count')
    ax.set_title('Sentiment distribution')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.legend()

    # Write the exact count on top of every bar.
    for i, v in enumerate(train_counts.values):
        ax.text(i - width, v, str(v), ha='center', va='bottom')
    for i, v in enumerate(val_counts.values):
        ax.text(i + width, v, str(v), ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

# Plot the training split (not the full, unsplit frame) next to the validation
# split, so the bars labelled 'Train' really show training counts.
plot_sentiment(train_df, val_df)
    

In [None]:
from sklearn.metrics import f1_score, classification_report, confusion_matrix

def evaluate_binary_sentiment(y_true, y_pred):
    """
    Evaluate binary sentiment classification performance using F1 score.

    Parameters:
    y_true (array-like): Ground truth labels ('positive' or 'negative', any casing)
    y_pred (array-like): Predicted labels ('positive' or 'negative', any casing)

    Returns:
    None (prints evaluation metrics)
    """
    # Define mapping for binary sentiment
    mapping = {'positive': 1, 'negative': 0}
    class_ids = [0, 1]  # fixed order: negative, positive

    # Convert string labels to numeric
    def map_func(x):
        # Unknown labels really do default to negative (0) now; previously
        # `mapping.get` returned None for them, which poisoned the arrays.
        return mapping.get(str(x).strip().lower(), 0)

    # A plain comprehension (instead of np.vectorize) also works for empty input.
    y_true = np.array([map_func(label) for label in y_true], dtype=int)
    y_pred = np.array([map_func(label) for label in y_pred], dtype=int)

    if y_true.size == 0:
        print('No samples to evaluate.')
        return

    # Calculate overall F1 score
    f1 = f1_score(y_true=y_true, y_pred=y_pred, average='weighted')
    print(f'Overall F1 Score: {f1:.3f}')

    # Calculate F1 scores for each class
    f1_pos = f1_score(y_true=y_true, y_pred=y_pred, pos_label=1)
    f1_neg = f1_score(y_true=y_true, y_pred=y_pred, pos_label=0)
    print(f'F1 Score for positive sentiment: {f1_pos:.3f}')
    print(f'F1 Score for negative sentiment: {f1_neg:.3f}')

    # Generate classification report.
    # `labels=class_ids` keeps both classes in the report even when one of them
    # never occurs, so the two target names always line up.
    print('\nClassification Report:')
    print(classification_report(y_true=y_true, y_pred=y_pred,
                                labels=class_ids,
                                target_names=['negative', 'positive']))

    # Generate confusion matrix.
    # `labels=class_ids` guarantees a 2x2 matrix, so the indexing below cannot
    # run out of bounds when only one class is present.
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=class_ids)
    print('\nConfusion Matrix:')
    print('                 Predicted Negative  Predicted Positive')
    print(f'Actual Negative      {conf_matrix[0][0]:<18d}{conf_matrix[0][1]}')
    print(f'Actual Positive      {conf_matrix[1][0]:<18d}{conf_matrix[1][1]}')

In [None]:
import torch
# Local Kaggle path of the base checkpoint.
model_path = "/kaggle/input/llama-3.1/transformers/8b-instruct/2"

# 4-bit NF4 quantization settings: no double quantization, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    load_in_4bit=True,
)

# Quantized base model; `device_map="auto"` leaves device placement to the loader.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    quantization_config=bnb_config,
)

# Tokenizer for the same checkpoint: left padding, 1024-token model max length.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    add_eos_token=True,
    padding_side="left",
    model_max_length=1024,
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Lookup from the two-letter language code to the language name,
# written as (code, name) pairs in the original order.
map_dict = dict((
    ("as", "Assamese"),
    ("bd", "Bodo"),
    ("bn", "Bengali"),
    ("gu", "Gujarati"),
    ("hi", "Hindi"),
    ("kn", "Kannada"),
    ("ml", "Malayalam"),
    ("mr", "Marathi"),
    ("or", "Odia"),
    ("pa", "Punjabi"),
    ("ta", "Tamil"),
    ("te", "Telugu"),
    ("ur", "Urdu"),
))

In [None]:
import torch
def generate_response(prompt,  max_length=200):
    """Generate text for `prompt` with the notebook-level `model` and `tokenizer`.

    Parameters:
    prompt (str): Text handed to the tokenizer.
    max_length (int): Passed to `model.generate` as `max_length`; in the
        Hugging Face generation API that is the cap on prompt tokens plus
        newly generated tokens.

    Returns:
    str: The first returned sequence, decoded with special tokens skipped.
    """
    # Move the inputs to wherever the model lives instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate text. `max_length` used to be accepted but was never forwarded,
    # so the caller's limit had no effect.
    with torch.no_grad():
        output = model.generate(**inputs, max_length=max_length)

    # Decode and return generated text
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
lang = 'bn'
# Spell the language out for the prompt: the bare code ("bn") is what the data
# uses for filtering, but the instruction text should name the language.
language_name = map_dict.get(lang, lang)
beng_sen = list(df[df["language"] == lang]['sentence'])
print(beng_sen[8])

prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant that analyzes sentiment in text.<|eot_id|><|start_header_id|>user<|end_header_id|>
Analyze the sentiment of the following {language_name} text and respond with exactly one word (either 'positive' or 'negative'):
{beng_sen[8]}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

print(generate_response(prompt))

In [None]:
from transformers import pipeline
from tqdm import tqdm
import re
# Matches either sentiment word as a whole word, in any casing.
_SENTIMENT_PATTERN = re.compile(r"\b(positive|negative)\b", re.IGNORECASE)


def _build_sentiment_prompt(sentence):
    """Return the chat-formatted prompt that asks for the sentiment of `sentence`."""
    # Assembled line by line so the indentation of this source file never leaks
    # into the text sent to the model.
    return (
        "\n"
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
        "\n"
        "You are a helpful AI assistant for sentiment analysis(POSITIVE or NEGATIVE)<|eot_id|>\n"
        "<|start_header_id|>user<|end_header_id|>\n"
        "\n"
        f"Predict the sentiment of this sentence: {sentence}\n"
        "Output Format: POSITIVE or NEGATIVE\n"
        "<|eot_id|>\n"
        "<|start_header_id|>assistant<|end_header_id|>\n"
    )


def _normalize_sentiment(raw_output):
    """Return 'POSITIVE' or 'NEGATIVE' if exactly one of them is named in `raw_output`, else None."""
    found = {word.upper() for word in _SENTIMENT_PATTERN.findall(raw_output)}
    return found.pop() if len(found) == 1 else None


def predict_sentiment(test_dataset, model, tokenizer):
    """
    Predicts sentiments for multilingual text data.

    Args:
        test_dataset: pandas DataFrame with a 'sentence' column (rows are read
            with `.iterrows()`)
        model: Model used for generation; must expose `.device` and `.generate`
        tokenizer: Associated tokenizer

    Returns:
        list: One 'POSITIVE' or 'NEGATIVE' string per row. Outputs in which no
        single sentiment word can be recognized default to 'POSITIVE'.
    """
    raw_outputs = []

    for _, example in test_dataset.iterrows():
        prompt = _build_sentiment_prompt(example['sentence'])

        # Generate prediction on the model's own device rather than assuming "cuda".
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            output = model.generate(**inputs)
        result = tokenizer.decode(output[0], skip_special_tokens=True)

        # The decoded text contains the prompt followed by the completion; the
        # answer is taken from its last line.
        sentiment_part = result.split("\n")[-1].strip()
        print(sentiment_part)
        raw_outputs.append(sentiment_part)

    y_pred_final = []
    count = 0        # outputs with no recognizable sentiment (defaulted to POSITIVE)
    case_issue = 0   # outputs recognized only after normalization (casing, punctuation, ...)
    for raw in raw_outputs:
        label = _normalize_sentiment(raw)
        if label is None:
            y_pred_final.append("POSITIVE")
            count += 1
            continue
        # Previously only four exact strings were accepted, so outputs such as
        # "Negative" or "NEGATIVE." were silently turned into POSITIVE.
        if raw != label:
            case_issue += 1
        y_pred_final.append(label)

    print(f"Total rogue output: {count}")
    print(f"Total Case Issue: {case_issue}")

    return y_pred_final

In [None]:
def prepare_dataset(dataset):
    """Tokenize every example of `dataset` as a chat-formatted training prompt.

    Each example must provide 'language', 'sentence' and 'label'. The language
    code is expanded through `map_dict` (falling back to the code itself), the
    prompt is built, and the notebook-level `tokenizer` pads/truncates it to
    1024 tokens.

    Parameters:
    dataset: Object with a `.map(fn)` method that applies `fn` to each example.

    Returns:
    The result of `dataset.map(...)` with the tokenizer outputs added.
    """

    def format_prompt(example):
        language_name = map_dict.get(example['language'], example['language'])

        # Assembled line by line so the indentation of this source file does not
        # end up inside the training text.
        prompt = (
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
            "You are a helpful assistant that analyzes sentiment in text.<|eot_id|><|start_header_id|>user<|end_header_id|>\n"
            f"Analyze the sentiment of the following {language_name} text and respond with exactly one word (either 'positive' or 'negative'):\n"
            f"{example['sentence']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
            f"<|response_tag|> {example['label']} <|eom_id|>"
        )
        # No `return_tensors` here: this runs once per example, and returning
        # tensors would give every feature an extra leading batch dimension.
        # NOTE(review): with truncation enabled, an over-long sentence presumably
        # loses the label at the end of the prompt — confirm the truncation side.
        return tokenizer(
            prompt,
            padding="max_length",  # Ensures uniform length
            truncation=True,       # Avoids exceeding max length
            max_length=1024,       # Matches model max length
        )

    # Map the formatting function across the dataset
    formatted_dataset = dataset.map(format_prompt)
    return formatted_dataset

# Run both splits through the same formatter, training split first.
train_data, eval_data = map(prepare_dataset, (train_dataset, val_dataset))

In [None]:
!pip -q install trl

In [None]:
# Print the loaded model's module tree (handy for checking layer names).
print(model)

In [None]:
from peft import LoraConfig
from transformers import TrainingArguments
from trl import SFTTrainer, SFTConfig
import os 
# Directory (relative to the current working directory) handed to the trainer as `output_dir`.
output_dir = "trained_weights"
os.makedirs(output_dir, exist_ok=True)
# LoRA adapter settings, passed to the trainer below as `peft_config`.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=16,  # Reduced from 64 to save memory
    bias="none",
    # Presumably the attention (q/k/v/o) and MLP (gate/up/down) projection layers — confirm against `print(model)`.
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

# Training arguments for supervised fine-tuning.
training_arguments = SFTConfig(
    output_dir=output_dir,
    num_train_epochs=10,
    per_device_train_batch_size=1,  # Keep at 1 for memory efficiency
    gradient_accumulation_steps=4,   # Reduced from 8 to speed up training
    gradient_checkpointing=True,     # Keep this for memory efficiency
    optim="paged_adamw_32bit",
    save_steps=0,
    # Optimizer / schedule hyper-parameters
    learning_rate=2e-4,
    weight_decay=0.001,
    # NOTE(review): fp16 is enabled here while the quantization config uses a bfloat16 compute dtype — confirm the mix is intended for the target GPU.
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    # NOTE(review): `None` presumably does not disable reporting (the documented opt-out is the string "none") — confirm; wandb is logged into in a later cell.
    report_to=None,
    evaluation_strategy="no",     # Disabled evaluation to save memory
    max_seq_length = 1024,
    packing = True,
    # NOTE(review): this names the raw "sentence" column, while `train_data` also carries the tokenized prompt from `prepare_dataset` — confirm which one the trainer actually consumes.
    dataset_text_field="sentence",
    logging_strategy="epoch",
    #padding = True,
    #truncation = True,
    #output_dir = output_dir,
    dataset_kwargs = {
        "add_special_tokens" : False,
        "append_concat_token" : False,
    }
)

# Initialize SFT Trainer
# NOTE(review): an eval dataset is supplied although evaluation is disabled in the arguments above.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    #training_args=SFTConfig(max_seq_length = 1024)
    train_dataset=train_data,
    eval_dataset = eval_data,
    peft_config=peft_config,
    #dataset_text_field="prompt",
    tokenizer=tokenizer,
)

In [None]:
import os

import wandb

# Never commit an API key to a notebook. The key is read from the WANDB_API_KEY
# environment variable instead; when the variable is unset, `key=None` is passed
# and wandb is left to resolve credentials on its own.
# NOTE(review): the key that used to be hard-coded in this cell has been exposed
# and should be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))


In [None]:
import wandb
# Start a Weights & Biases run under the "llamasftmsa" project.
wandb.init(project="llamasftmsa")

In [None]:
# Run fine-tuning with the trainer built in the setup cell.
trainer.train()

In [None]:
# No path is given, so the trainer picks the location itself (presumably its configured `output_dir` — confirm).
trainer.save_model()  # Save model and tokenizer
tokenizer.save_pretrained(output_dir)  # Also write the tokenizer files into `output_dir` (optional)

In [None]:
# Print the overall training state: optimizer steps taken and the epoch reached.
print("Global step:", trainer.state.global_step)
print("Epoch:", trainer.state.epoch)

# Dump every logged record (each log is a dict with keys such as loss, learning_rate, etc.)
print("Log history:", trainer.state.log_history)

In [None]:
import gc

# Unbind every training-time object so nothing in this namespace keeps it alive.
del model, tokenizer, peft_config, trainer
del train_data, eval_data, bnb_config, training_arguments
# del [df, X_train, X_eval]
# Unbind the imported class names as well.
del TrainingArguments, SFTTrainer, LoraConfig, BitsAndBytesConfig

In [None]:
# Collect unreachable Python objects first so the tensors they hold are released,
# then return PyTorch's cached CUDA blocks to the driver. One pass is normally
# enough; repeating the pair 100 times only costs time.
gc.collect()
torch.cuda.empty_cache()

In [None]:
!nvidia-smi

In [None]:
from transformers import (AutoConfig,
                          AutoModelForCausalLM, 
                          AutoTokenizer,
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline) 

In [None]:
# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Specify model name and path
# `model_name` is the base checkpoint; `peft_model_id` is presumably where the
# fine-tuned adapter was saved (matches "trained_weights" if the working
# directory is /kaggle/working — confirm).
model_name = "/kaggle/input/llama-3.1/transformers/8b-instruct/2"
peft_model_id = "/kaggle/working/trained_weights"

In [None]:
# Same 4-bit NF4 settings as the first load, with fp32 CPU offload additionally enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload=True
)

# Loading the model and tokenizer.
# Read the checkpoint from `model_name`, the path declared for this inference
# section, rather than from the `model_path` variable left over from the
# training section (both hold the same path).
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config,
                                             device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=1024,
    padding_side="left",
    add_eos_token=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Show the size reported by `len(tokenizer)` for the reloaded tokenizer.
print(len(tokenizer))

In [None]:
# Load the PEFT adapter from `peft_model_id` into the reloaded base model
model.load_adapter(peft_model_id)

# Configure model settings
# NOTE(review): `use_cache = False` is usually a training-time setting; during generation it presumably only slows decoding — confirm it is intended here.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Ensure the model is in evaluation mode
model.eval()

print("Model and tokenizer loaded successfully.")

In [None]:
import torch
def generate_response(prompt,  max_length=200):
    """Generate text for `prompt` with the notebook-level `model` and `tokenizer`.

    Parameters:
    prompt (str): Text handed to the tokenizer.
    max_length (int): Passed to `model.generate` as `max_length`; in the
        Hugging Face generation API that is the cap on prompt tokens plus
        newly generated tokens.

    Returns:
    str: The first returned sequence, decoded with special tokens skipped.
    """
    # Move the inputs to wherever the model lives instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate text. `max_length` used to be accepted but was never forwarded,
    # so the caller's limit had no effect.
    with torch.no_grad():
        output = model.generate(**inputs, max_length=max_length)

    # Decode and return generated text
    return tokenizer.decode(output[0], skip_special_tokens=True)
# Try the fine-tuned model on one validation sentence of the chosen language.
lang = 'bn'
beng_sen = val_df.loc[val_df["language"] == lang, 'sentence'].tolist()

print(beng_sen[5])
prompt = f"""
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful AI assistant for sentiment analysis(POSITIVE or NEGATIVE)<|eot_id|>
<|start_header_id|>user<|end_header_id|>

Predict the sentiment of this sentence: {beng_sen[5]}
Output Format: POSITIVE or NEGATIVE
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
"""

print(generate_response(prompt))

In [None]:
#sample_val = val_df.sample(n = 20)

In [None]:
#y_pred = predict_sentiment(sample_val, model,tokenizer)

In [None]:
import numpy as np
#y_true_val = list(sample_val['label'].copy())

#evaluate_binary_sentiment(y_true_val, y_pred)

In [None]:
# Path of the test split inside the Kaggle input mount.
test_file_path = "/kaggle/input/multi-lingual-sentiment-analysis/test.csv"

test_df = pd.read_csv(test_file_path)
# Last expression of the cell: preview the first rows.
test_df.head()

In [None]:
# DataFrame handed to `predict_sentiment`, which walks it row by row.
test_dataframe = pd.DataFrame(test_df)
# Hugging Face `Dataset` built from the same rows.
test_dataset = Dataset.from_pandas(test_dataframe)

In [None]:
# Run the model over every test row; yields one 'POSITIVE'/'NEGATIVE' string per row.
predictions = predict_sentiment(test_dataframe, model, tokenizer)

In [None]:
# Pair every test ID with its predicted label, then switch the labels to
# first-letter-capitalized form (e.g. "POSITIVE" -> "Positive").
predictions_df = pd.DataFrame({
    'ID': test_df['ID'],
    #'sentence': test_df['sentence'],
    'label': predictions
}).assign(label=lambda frame: frame['label'].str.capitalize())

predictions_df.head()

In [None]:
# Write the ID/label pairs to submission.csv, leaving out the DataFrame index column.
predictions_df.to_csv('submission.csv', index=False)