In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llama-3.1/transformers/8b-instruct/1/model.safetensors.index.json
/kaggle/input/llama-3.1/transformers/8b-instruct/1/model-00003-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/1/config.json
/kaggle/input/llama-3.1/transformers/8b-instruct/1/model-00001-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/1/README.md
/kaggle/input/llama-3.1/transformers/8b-instruct/1/USE_POLICY.md
/kaggle/input/llama-3.1/transformers/8b-instruct/1/tokenizer.json
/kaggle/input/llama-3.1/transformers/8b-instruct/1/tokenizer_config.json
/kaggle/input/llama-3.1/transformers/8b-instruct/1/model-00004-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/1/special_tokens_map.json
/kaggle/input/llama-3.1/transformers/8b-instruct/1/model-00002-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/1/generation_config.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model.safetensors.index.json
/kaggle/input/llama-3.1/tra

In [None]:
%%capture
# Install the fine-tuning stack; the %%capture magic above hides pip's output for this cell.
!pip install bitsandbytes
!pip install unsloth
!pip install accelerate
!pip install peft
# NOTE(review): torch is force-reinstalled at a pinned version here, but the two
# --upgrade commands below run afterwards and may change installed packages
# again — confirm the result with the torch version printed in the next cell.
!pip install torch==2.1.2 --force-reinstall
!pip install --upgrade transformers
# NOTE(review): unsloth was already installed above; this second command only upgrades it.
!pip install --upgrade unsloth

In [None]:
# Importing the submodule also binds the top-level name `torch` used on the next line.
import torch._dynamo
# Show which torch version this kernel actually loaded after the installs above.
print(torch.__version__)

In [None]:
from datetime import datetime
from unsloth import FastLanguageModel
import torch
import pandas as pd
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, PeftModel, get_peft_model
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

In [None]:
# Input locations: the competition CSV files and the local Llama 3.1 8B Instruct checkpoint.
train_file_path = "/kaggle/input/multi-lingual-sentiment-analysis/train.csv"
test_file_path = "/kaggle/input/multi-lingual-sentiment-analysis/test.csv"
model_path = "/kaggle/input/llama-3.1/transformers/8b-instruct/2"

# Seed passed as random_state when the LoRA adapters are created.
# NOTE(review): the train/validation split (seed = 62) and TrainingArguments
# (seed = 3407) use their own hard-coded seeds rather than this constant.
RANDOM_SEED=1971

In [None]:

# Loader settings; max_seq_length is also passed to SFTTrainer further down.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
# Maps the codes found in the dataset's 'language' column to the language name
# that is inserted into the prompt's instruction line.
lang_map = {
    "as": "Assamese",
    "bd": "Bodo",
    "bn": "Bengali",
    "gu": "Gujarati",
    "hi": "Hindi",
    "kn": "Kannada",
    "ml": "Malayalam",
    "mr": "Marathi",
    "or": "Odia",
    "pa": "Punjabi",
    "ta": "Tamil",
    "te": "Telugu",
    "ur": "Urdu"
}

# Load the base model and its tokenizer from the local checkpoint directory.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_path,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

In [None]:
# Adding LoRA adapters: `model` is rebound to the adapter-wrapped model returned here.
# NOTE(review): because `model` is reassigned from itself, re-running this cell
# would wrap an already-wrapped model — re-run the loading cell first.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # Attention projections plus the MLP projections receive adapters.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = RANDOM_SEED,  # uses the notebook's RANDOM_SEED in place of the literal 3407
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [None]:
# Alpaca-style prompt template with three positional placeholders, filled in order:
#   1. the language name (looked up in lang_map),
#   2. the sentence to classify,
#   3. the response: the 0/1 label for training texts, an empty string at inference.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Predict the sentiment of {} language sentence as 1 (positive) or 0 (negative), output 0 or 1.

### Input:
{}

### Response:
{}"""

# End-of-sequence marker taken from the tokenizer; formatting_prompts_func appends it to every training text.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Build training prompts and numeric labels for a batch of rows.

    `examples` is indexed by the column names 'sentence', 'language' and
    'label', each yielding the per-row values of the batch. Every row is
    rendered through `alpaca_prompt` (language name from `lang_map`, the
    sentence, and the numeric label) and terminated with EOS_TOKEN. A label
    equal to 'Positive' is encoded as 1; any other value is encoded as 0.

    Returns:
        dict with 'text' (the rendered prompts) and 'label' (the 0/1 labels).
    """
    rows = list(zip(examples['sentence'], examples['language'], examples['label']))
    targets = [1 if raw_label == 'Positive' else 0 for _, _, raw_label in rows]
    prompts = [
        # The EOS marker ends each training text so generation learns where to stop.
        alpaca_prompt.format(lang_map[lang_code], sentence, target) + EOS_TOKEN
        for (sentence, lang_code, _), target in zip(rows, targets)
    ]
    return {"text": prompts, "label": targets}



In [None]:
# Read the training CSV (exposed under the "train" split key) and render every
# row into a prompt text plus a 0/1 label.
raw_dataset = load_dataset("csv", data_files = [train_file_path])["train"]
dataset = raw_dataset.map(formatting_prompts_func, batched = True)

# Hold out 20% of the rows for validation.
# NOTE(review): this split uses its own literal seed (62), not RANDOM_SEED.
split_dataset = dataset.train_test_split(test_size = 0.2, seed = 62)
train_dataset = split_dataset['train']
valid_dataset = split_dataset['test']

In [None]:
# Display both splits as the cell's output.
train_dataset, valid_dataset

In [None]:
# Display the first row of each split to check the rendered prompt text and label.
train_dataset[0], valid_dataset[0]

In [None]:
# Supervised fine-tuning on the rendered prompt texts.
# NOTE(review): only train_dataset is passed in; valid_dataset is not given to
# the trainer, so it is only used by the evaluation cells further down.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text", # column produced by formatting_prompts_func
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8, # gradients are accumulated over 8 batches of 2
        warmup_steps = 5,
        num_train_epochs = 5, # number of passes over train_dataset
        learning_rate = 2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on bfloat16 support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407, # NOTE(review): literal seed, differs from RANDOM_SEED
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

In [None]:
# Run fine-tuning; the value returned by trainer.train() is kept in trainer_stats.
trainer_stats = trainer.train()

In [None]:
# Smoke test: generate a prediction for the first training example.
FastLanguageModel.for_inference(model)  # switch the model to Unsloth's inference mode
example = train_dataset[0]

# Same template as for training, with the response slot left empty for the model to fill.
prompt = alpaca_prompt.format(lang_map[example['language']], example['sentence'], '')
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)

# Decode prompt + completion, then keep the first character of the line that
# follows the last 'Response' marker.
decoded = tokenizer.batch_decode(outputs)[0]
after_response = decoded.split('Response')[-1]
result = after_response.split('\n')[1][0]

In [None]:
# Compare the generated character (a str) with the stored label (the 0/1 value written by formatting_prompts_func).
result, example['label']

In [None]:
FastLanguageModel.for_inference(model)  # switch the model to Unsloth's inference mode before generating


def _parse_binary_label(generated_text):
    """Return the first '0' or '1' in `generated_text` as an int, or None if there is none."""
    for char in generated_text:
        if char in '01':
            return int(char)
    return None


def apply_inference(examples):
    """Predict the sentiment label for ONE dataset row.

    Despite the plural parameter name, this is written for a non-batched
    `Dataset.map` call: `examples['language']` must be a single language code
    (it is used directly as a `lang_map` key) and `examples['sentence']` a
    single sentence.

    Args:
        examples: mapping with the keys 'sentence' and 'language'.

    Returns:
        {'result': 1} or {'result': 0} when the model's completion contains a
        0/1 digit; {'result': -1} (with a warning) when it does not, so that
        one malformed generation does not abort the whole map.

    Raises:
        KeyError: if the language code is not present in `lang_map`.
    """
    import warnings

    # Same template as for training, with the response slot left empty.
    prompt = alpaca_prompt.format(lang_map[examples['language']], examples['sentence'], '')
    inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens = 64,
        use_cache = True,
        # Greedy decoding so the same sentence always gets the same label.
        # NOTE(review): the checkpoint's generation config may enable sampling
        # by default — confirm.
        do_sample = False,
    )

    # Decode only the newly generated tokens, so text inside the sentence
    # itself (newlines, the word 'Response', digits) cannot affect parsing.
    prompt_length = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens = True)

    result = _parse_binary_label(completion)
    if result is None:
        warnings.warn(
            f"apply_inference: no 0/1 found in model output {completion!r}; storing -1"
        )
        result = -1
    return {'result': result}



In [None]:
# Add a 'result' column holding the model's prediction for every row of both splits.
# NOTE(review): map is called without batching and apply_inference makes one
# generate call per row, so this covers the full training split one row at a
# time. Both names are also rebound from themselves, so re-running the cell
# maps the already-mapped datasets again.
train_dataset = train_dataset.map(apply_inference)
valid_dataset = valid_dataset.map(apply_inference)

In [None]:
def ComputeF1(actual, predicted):
    """Print accuracy, precision, recall and F1 for binary labels; return the F1 score.

    The value 1 is the positive class. A row counts as a true positive or true
    negative when `actual[i] == predicted[i]`; otherwise it is a false negative
    when `actual[i] == 1` and a false positive for any other actual value.

    Any ratio whose denominator is zero (no predicted positives, no actual
    positives, or empty input) is reported as 0.0 instead of raising
    ZeroDivisionError.

    Args:
        actual: indexable sequence of ground-truth labels.
        predicted: indexable sequence of predicted labels, read at the same
            positions as `actual`.

    Returns:
        The F1 score as a float.

    Raises:
        IndexError: if `predicted` is shorter than `actual`.
    """
    def _safe_ratio(numerator, denominator):
        # Metrics with an empty denominator are defined as 0.0 here.
        return numerator / denominator if denominator else 0.0

    tp, tn, fp, fn = 0, 0, 0, 0
    for index, truth in enumerate(actual):
        guess = predicted[index]
        if truth == guess:
            if truth == 1:
                tp += 1
            else:
                tn += 1
        elif truth == 1:
            fn += 1
        else:
            fp += 1

    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)
    f1_score = _safe_ratio(2 * precision * recall, precision + recall)
    print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}')
    print(f'F1 Score: {f1_score}')
    return f1_score

In [None]:
# Report metrics on the training split: stored 0/1 labels vs. the 'result' column added by apply_inference.
print('Training Data Set Performance')
ComputeF1(train_dataset[:]['label'], train_dataset[:]['result'])
# Same report for the validation split, which was not passed to the trainer.
print('Validation Data Set Performance')
ComputeF1(valid_dataset[:]['label'], valid_dataset[:]['result'])

In [None]:
# Write the fine-tuned model and the tokenizer to the local "lora_model_2048" directory.
# NOTE(review): presumably this stores only the LoRA adapter weights, since
# `model` is the adapter-wrapped model — confirm before relying on it.
model.save_pretrained("lora_model_2048") # Local saving
tokenizer.save_pretrained("lora_model_2048")

In [None]:
# Load the competition test CSV (exposed under the default "train" split key).
test_dataset = load_dataset("csv", data_files = [test_file_path])["train"]
# Add the 'result' column that the submission cells read, using the same
# per-row inference function as for the train/validation splits.
# NOTE(review): assumes test.csv has 'sentence' and 'language' columns like train.csv — confirm.
test_dataset = test_dataset.map(apply_inference)

In [None]:
# Turn the numeric predictions into the label strings used in the submission:
# 1 becomes 'Positive', every other value becomes 'Negative'.
numeric_predictions = test_dataset[:]['result']
predictions = ['Positive' if value == 1 else 'Negative' for value in numeric_predictions]
len(predictions)

In [None]:
# Build the submission table and write it without the DataFrame index.
# NOTE(review): IDs are generated as 1..N in row order rather than read from
# test.csv — confirm this matches the IDs the competition expects.
row_ids = list(range(1, len(predictions) + 1))
dl = pd.DataFrame({'ID': row_ids, 'label': predictions})
dl.to_csv('submission.csv', index=False)