In [None]:
!pip uninstall pyarrow -y

Found existing installation: pyarrow 14.0.2
Uninstalling pyarrow-14.0.2:
  Successfully uninstalled pyarrow-14.0.2


In [None]:
!pip install pyarrow==14.0.1

Collecting pyarrow==14.0.1
  Downloading pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Downloading pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl (38.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.0/38.0 MB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-14.0.1


In [None]:
!pip install torch transformers accelerate



In [None]:
!pip install --upgrade peft

Collecting peft
  Downloading peft-0.13.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.13.0-py3-none-any.whl (322 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.5/322.5 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.13.0


In [None]:
# Clone the project repository into ./QLoRA.
# NOTE(review): no later cell visible here reads from the QLoRA/ directory (the CSV is
# loaded from the working directory) — confirm whether this clone is still needed.
!git clone https://github.com/Preetham-103/QLoRA.git

Cloning into 'QLoRA'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.


In [None]:
import torch
import pandas as pd
from transformers import DistilBertTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Load the raw CSV; malformed lines are skipped instead of raising.
raw_df = pd.read_csv('IMDB Dataset.csv', on_bad_lines='skip')

# Derive the working frame without mutating raw_df in place.
df = (
    raw_df
    .assign(label=raw_df['sentiment'].map({'positive': 1, 'negative': 0}))  # Map sentiment to labels
    .dropna()  # Remove missing values (including rows whose sentiment was neither 'positive' nor 'negative')
    # map() produces a float column whenever any sentiment is unmapped (NaN). Cast back to
    # integers so the labels become integer tensors, as the classification loss requires.
    .astype({'label': 'int64'})
)

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['review'],
    df['label'],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Load the tokenizer
# Uses the same 'distilbert-base-uncased' checkpoint name as the model loaded further down,
# so the vocabulary and the model weights belong together.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')



In [None]:
# Tokenization function
def tokenize_function(texts):
    """Tokenize `texts` with the notebook-level `tokenizer`.

    Every sequence is truncated to, and padded up to, exactly 512 tokens.
    Returns whatever the tokenizer call returns.
    """
    tokenizer_options = dict(padding='max_length', truncation=True, max_length=512)
    return tokenizer(texts, **tokenizer_options)

In [None]:
# Tokenize the datasets
# Each split is first converted from a pandas Series to a plain list of strings;
# tokenize_function then pads/truncates every review to 512 tokens.
train_encodings = tokenize_function(train_texts.tolist())
test_encodings = tokenize_function(test_texts.tolist())

In [None]:
# Convert to Torch dataset
class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    `encodings` is a mapping from field name to a per-example sequence of values;
    `labels` is a sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field at position idx, as a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Wrap the encodings and the matching labels (converted from pandas Series to plain lists)
# in IMDbDataset objects for the Trainer.
train_dataset = IMDbDataset(train_encodings, train_labels.tolist())
test_dataset = IMDbDataset(test_encodings, test_labels.tolist())

In [None]:
# Load DistilBERT model
# num_labels=2 matches the binary 0/1 sentiment labels. Per the recorded output below, the
# classification head weights are newly initialized and must be trained before use.
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Set up the LoRA adapter configuration.
# NOTE(review): no quantization config is visible in this notebook, so this is plain LoRA
# rather than QLoRA — confirm.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # This is for sequence classification
    r=8,  # LoRA rank parameter
    lora_alpha=32,  # LoRA scaling parameter
    # When given a list, PEFT matches each entry against the end of the module names, so
    # these six suffixes select the attention projections (q/k/v/out) and both feed-forward
    # linears in every transformer layer, i.e.
    # distilbert.transformer.layer.{0..5}.attention.{q,k,v,out}_lin and .ffn.lin{1,2},
    # without hard-coding the number of layers.
    target_modules=["q_lin", "k_lin", "v_lin", "out_lin", "lin1", "lin2"],
    lora_dropout=0.1,  # Dropout applied inside the LoRA branches
    bias="none",  # Leave bias terms untouched
)

In [None]:
# Apply LoRA to the model
# NOTE(review): this rebinds `model`, so re-running this cell on its own would wrap an
# already-wrapped model; re-run the model-loading cell first.
model = get_peft_model(model, lora_config)

In [None]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',  # Output directory
    num_train_epochs=1,  # Number of training epochs
    per_device_train_batch_size=8,  # Batch size per device
    per_device_eval_batch_size=8,  # Batch size for evaluation
    warmup_steps=500,  # Warmup steps for learning rate scheduler
    weight_decay=0.01,  # Strength of weight decay
    logging_dir='./logs',  # Logging directory
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy` argument.
    # transformers is installed unpinned in this notebook, so use the name that current
    # releases document.
    eval_strategy="epoch",  # Evaluate at the end of every epoch
    save_strategy="epoch",  # Save model per epoch
)



In [None]:
# Evaluation metrics
def compute_metrics(eval_pred):
    """Compute accuracy from a `(logits, labels)` pair.

    The predicted class for each row is the index of its largest logit.
    Returns a dict with the single key "accuracy".
    """
    logits, labels = eval_pred
    predicted_classes = torch.tensor(logits).argmax(dim=-1)
    return {"accuracy": accuracy_score(labels, predicted_classes)}

In [None]:
# Initialize Trainer
# Wires together the LoRA-wrapped model, the training arguments, both datasets and the
# accuracy metric defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # held-out 20% split, used for the per-epoch evaluation
    compute_metrics=compute_metrics
)

In [None]:
# Start training
# Runs the loop configured in training_args (1 epoch, with evaluation and a checkpoint
# at the end of each epoch).
trainer.train()


Epoch,Training Loss,Validation Loss


In [None]:
# Save the model
# NOTE(review): `model` is the PEFT-wrapped model, so this presumably writes only the adapter
# weights rather than the full base model — confirm before relying on ./trained_model for
# standalone loading.
trainer.save_model("./trained_model")

In [None]:
# Prediction function
def predict(texts):
    """Predict sentiment classes for one or more review texts.

    Args:
        texts: A single string, or any iterable of strings (list, tuple,
            pandas Series, ...).

    Returns:
        A numpy array of integer class ids (0 for negative, 1 for positive),
        one per input text, in input order. Empty input yields an empty array.
    """
    # The tokenizer only accepts str or list/tuple inputs, so normalise everything else
    # (for example a pandas Series) to a plain list first.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # Nothing to classify: skip the tokenizer and model entirely.
    if not texts:
        return torch.empty(0, dtype=torch.long).numpy()

    # Tokenize the input texts, padding only up to the longest text in this call
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")

    # Ensure the model is in evaluation mode (disables dropout)
    model.eval()

    # Move tensors to the same device as the model
    inputs = {key: val.to(model.device) for key, val in encodings.items()}

    # Run the model on the input data without tracking gradients
    with torch.no_grad():
        logits = model(**inputs).logits

    # Softmax is monotonic, so the argmax of the logits is the argmax of the probabilities.
    return torch.argmax(logits, dim=-1).cpu().numpy()


In [None]:
# Testing the prediction on the test dataset
# test_texts is a pandas Series. Take the first five rows by position and convert them to a
# plain list of strings, because the tokenizer accepts str / list inputs, not a Series.
sample_reviews = test_texts.iloc[:5].tolist()  # Use a small sample of test data for prediction
predictions = predict(sample_reviews)

In [None]:
# Print predictions
# Class id 1 is reported as "positive"; any other id is reported as "negative".
for review, prediction in zip(sample_reviews, predictions):
    sentiment = "positive" if prediction == 1 else "negative"
    print(f"Review: {review}\nPredicted Sentiment: {sentiment}\n")

Review:  and find intriguing points of interest in the films of all genres from the Thirties and Forties."
Predicted Sentiment: negative

Review:  everyone was!! all this chatting and now i feel like watching it! i think i will"
Predicted Sentiment: negative

Review:  I will give the grade of what I thought when I first saw it.<br /><br />8/10"
Predicted Sentiment: negative

Review:  & the title song)"
Predicted Sentiment: negative

Review:  low-everything. The very final scene-and I mean about the final 10 seconds of the film-is the ONLY mildly creative or interesting moment.<br /><br />I paid $3.45 to rent this. I could have better spent it on a hamburger!"
Predicted Sentiment: negative

