In [None]:
#!pip install huggingface_hub
#!huggingface-cli login
import os
from getpass import getpass

from huggingface_hub import HfFolder, whoami

# Never store the access token in the notebook itself: take it from the
# HF_TOKEN environment variable, or prompt for it without echoing.
hf_token = (os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")).strip()
if not hf_token:
    # An empty token would be written to disk and only fail later, at whoami().
    raise ValueError("No Hugging Face token provided (set HF_TOKEN or enter it when prompted).")
HfFolder.save_token(hf_token)

# Confirm the stored token is accepted by the Hub and report the account name.
user = whoami()
print(f"Logged in as {user['name']}")

In [None]:
#!pip install bitsandbytes
#!pip install trl
!pip install transformers --upgrade

In [None]:
!pip install -U bitsandbytes
# Import required libraries
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from datasets import Dataset
from sklearn.model_selection import train_test_split
from evaluate import load
from tqdm import tqdm 

In [None]:
# Read the raw sentiment CSV from the Kaggle input directory
csv_path = '/kaggle/input/sentiment-data/sentiment_data.csv'
df = pd.read_csv(csv_path)

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
split_options = dict(test_size=0.2, random_state=42)
train_df, val_df = train_test_split(df, **split_options)

# Wrap each pandas split in a Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [None]:
# Show the loaded DataFrame as this cell's output for a quick visual check.
df

In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [None]:
!pip install pandas transformers datasets torch scikit-learn
# Optional libraries
!pip install bitsandbytes lora

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [None]:


# Expects the raw DataFrame from the loading cell to be bound to `df`.

# Keep only the label/text columns, renamed to the lower-case `labels` / `text`
column_map = {'Label': 'labels', 'Text': 'text'}
dataset = df[list(column_map)].rename(columns=column_map)

# 80/20 train/test partition with a fixed seed
from sklearn.model_selection import train_test_split

split_options = dict(test_size=0.2, random_state=42)
train_data, test_data = train_test_split(dataset, **split_options)

# Initialize tokenizer and model with quantization
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit NF4 weight quantization with float16 compute and no double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# The tokenizer may not define a padding token (a later cell checks for exactly
# that). Without one, `pad_token_id` is None and padded tokenization cannot work,
# so fall back to the EOS token, which is already part of the vocabulary.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id  # Set padding token

# Disable caching
model.config.use_cache = False

# Tokenization helper for the dataset
def tokenize_function(examples):
    """Tokenize the ``text`` entry of ``examples`` with the notebook's global ``tokenizer``.

    Padding and truncation are enabled, with a maximum length of 512.
    Returns whatever the tokenizer call returns.
    """
    tokenizer_options = {"padding": True, "truncation": True, "max_length": 512}
    texts = examples['text']
    return tokenizer(texts, **tokenizer_options)



In [None]:
from datasets import Dataset
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
from peft import LoraConfig, get_peft_model
from transformers import BitsAndBytesConfig

# Expects the raw DataFrame from the loading cell to be bound to `df`.
column_map = {'Label': 'labels', 'Text': 'text'}
dataset = df[list(column_map)].rename(columns=column_map)

# 80/20 train/test partition with a fixed seed (still pandas at this point)
split_options = dict(test_size=0.2, random_state=42)
train_data, test_data = train_test_split(dataset, **split_options)

# Rebind both names to Hugging Face Dataset objects built from the pandas splits
train_data = Dataset.from_pandas(train_data)
test_data = Dataset.from_pandas(test_data)


# Set padding token
# Reuse the existing EOS token for padding instead of adding a new '[PAD]' token:
# a newly added token gets an id past the end of the model's embedding table
# unless the embeddings are resized, which this notebook never does.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model.config.pad_token_id = tokenizer.pad_token_id  # Set padding token

# Disable caching
model.config.use_cache = False

# Tokenization helper applied to both splits
def tokenize_function(examples):
    """Tokenize the ``text`` entry of ``examples`` with the notebook's global ``tokenizer``.

    Padding and truncation are enabled, with a maximum length of 512.
    Returns whatever the tokenizer call returns.
    """
    tokenizer_options = {"padding": True, "truncation": True, "max_length": 512}
    texts = examples['text']
    return tokenizer(texts, **tokenizer_options)

# Tokenize both splits in batched mode, rebinding each name to the mapped dataset
map_options = {"batched": True}
train_data = train_data.map(tokenize_function, **map_options)
test_data = test_data.map(tokenize_function, **map_options)

# LoRA adapter settings: rank 8, alpha 16, dropout 0.1 on the q/k/v projection modules
lora_settings = {
    "r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.1,
    "target_modules": ["q_proj", "k_proj", "v_proj"],
}
lora_config = LoraConfig(**lora_settings)

# Wrap the loaded model with the adapters; `model` now refers to the PEFT-wrapped model
model = get_peft_model(model, lora_config)

# Training setup
training_args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    # `load_best_model_at_end=True` requires the save strategy to match the
    # evaluation strategy; the default step-based saving is rejected when the
    # arguments are constructed.
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    load_best_model_at_end=True,
    # The Trainer below is created without a `compute_metrics` callback, so no
    # "accuracy" metric is ever reported. Rank checkpoints by evaluation loss
    # instead (lower is better).
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    remove_unused_columns=False,
)

# Wire the model, arguments, both splits and the tokenizer into a Trainer
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_data,
    "eval_dataset": test_data,
    "tokenizer": tokenizer,
}
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning
trainer.train()

# Evaluate the model
# Run inference on the held-out split; the raw outputs are read from `.predictions`.
predictions = trainer.predict(test_data)
# Take the highest-scoring index along the last axis as the predicted label.
# NOTE(review): the model was loaded with AutoModelForCausalLM, so these outputs are
# presumably per-token vocabulary logits rather than per-example class scores.
# Confirm the argmax result is actually comparable with `test_data['labels']`.
predicted_labels = predictions.predictions.argmax(axis=-1)
accuracy = accuracy_score(test_data['labels'], predicted_labels)

# Report the accuracy as a percentage with two decimals.
print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
from sklearn.model_selection import train_test_split

# Load the dataset from the same Kaggle input file that the loading cell at the
# top of the notebook reads (the previous placeholder path does not exist).
df = pd.read_csv('/kaggle/input/sentiment-data/sentiment_data.csv')

# Prepare the dataset: keep the label/text columns under the names `labels` / `text`
dataset = df[['Label', 'Text']].rename(columns={'Label': 'labels', 'Text': 'text'})
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

# The `.map(..., batched=True)` calls further down are the Hugging Face `Dataset`
# API, not a pandas one, so convert both splits before tokenizing.
from datasets import Dataset

train_data = Dataset.from_pandas(train_data)
test_data = Dataset.from_pandas(test_data)


# Collator for language-model training; `mlm=False` turns masked-LM masking off.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

def tokenize_function(examples):
    """Tokenize the ``text`` entry of ``examples`` with the notebook's global ``tokenizer``.

    Uses ``padding='max_length'`` with truncation and a maximum length of 512.
    Returns whatever the tokenizer call returns.
    """
    tokenizer_options = {"padding": 'max_length', "truncation": True, "max_length": 512}
    texts = examples['text']
    return tokenizer(texts, **tokenizer_options)

# Tokenize each split in batched mode, keeping the results under separate names
map_options = {"batched": True}
tokenized_train_data = train_data.map(tokenize_function, **map_options)
tokenized_test_data = test_data.map(tokenize_function, **map_options)

# 4-bit NF4 quantization settings: float16 compute, double quantization disabled.
# NOTE(review): nothing in the visible remainder of this cell passes `bnb_config`
# to a model loader. Confirm whether a model reload was intended here.
quantization_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.float16,
    "bnb_4bit_use_double_quant": False,
}
bnb_config = BitsAndBytesConfig(**quantization_settings)



# LoRA adapter settings: rank 8, alpha 16, dropout 0.1 on the q/k/v projection modules
lora_settings = {
    "r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.1,
    "target_modules": ["q_proj", "k_proj", "v_proj"],
}
lora_config = LoraConfig(**lora_settings)

# Wrap whatever `model` currently refers to with the adapters.
# NOTE(review): if the earlier training cell already ran, `model` is presumably
# already PEFT-wrapped. Confirm that wrapping it a second time is intended.
model = get_peft_model(model, lora_config)

# Training configuration: evaluate and checkpoint once per epoch, then restore
# the checkpoint selected by `eval_loss` when training ends
training_config = {
    "output_dir": "./results",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 8,
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "logging_dir": "./logs",
    "logging_steps": 10,
    "save_steps": 500,
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
}
training_args = TrainingArguments(**training_config)

# Assemble the Trainer from the model, arguments, tokenized splits, collator and tokenizer
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_train_data,
    "eval_dataset": tokenized_test_data,
    "data_collator": data_collator,
    "tokenizer": tokenizer,
}
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning
trainer.train()

# Evaluate on the evaluation split and print the returned metrics
eval_results = trainer.evaluate()
print(eval_results)

In [None]:
!pip install transformers==4.46.0

