In [3]:
!pip uninstall pyarrow -y

Found existing installation: pyarrow 14.0.1
Uninstalling pyarrow-14.0.1:
  Successfully uninstalled pyarrow-14.0.1


In [4]:
!pip install pyarrow==14.0.1

Collecting pyarrow==14.0.1
  Using cached pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Using cached pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl (38.0 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-14.0.1


In [5]:
!pip install torch transformers accelerate



In [6]:
!pip install --upgrade peft



In [4]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig

In [7]:
df = pd.read_csv('IMDB Dataset.csv', on_bad_lines='skip', delimiter=',', quoting=3)
df = df.sample(frac=0.1, random_state=42)

In [8]:
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})  # Map sentiment to labels
df.dropna(inplace=True)  # Remove missing values

In [35]:
# Train-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(df['review'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42)

In [9]:
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name,num_labels=1)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
target_modules = [
    "distilbert.transformer.layer.0.attention.q_lin",
    "distilbert.transformer.layer.0.attention.k_lin",
    "distilbert.transformer.layer.0.attention.v_lin",
    "distilbert.transformer.layer.0.attention.out_lin",
    "distilbert.transformer.layer.0.ffn.lin1",
    "distilbert.transformer.layer.0.ffn.lin2",

    "distilbert.transformer.layer.1.attention.q_lin",
    "distilbert.transformer.layer.1.attention.k_lin",
    "distilbert.transformer.layer.1.attention.v_lin",
    "distilbert.transformer.layer.1.attention.out_lin",
    "distilbert.transformer.layer.1.ffn.lin1",
    "distilbert.transformer.layer.1.ffn.lin2",

    "distilbert.transformer.layer.2.attention.q_lin",
    "distilbert.transformer.layer.2.attention.k_lin",
    "distilbert.transformer.layer.2.attention.v_lin",
    "distilbert.transformer.layer.2.attention.out_lin",
    "distilbert.transformer.layer.2.ffn.lin1",
    "distilbert.transformer.layer.2.ffn.lin2",

    "distilbert.transformer.layer.3.attention.q_lin",
    "distilbert.transformer.layer.3.attention.k_lin",
    "distilbert.transformer.layer.3.attention.v_lin",
    "distilbert.transformer.layer.3.attention.out_lin",
    "distilbert.transformer.layer.3.ffn.lin1",
    "distilbert.transformer.layer.3.ffn.lin2",

    "distilbert.transformer.layer.4.attention.q_lin",
    "distilbert.transformer.layer.4.attention.k_lin",
    "distilbert.transformer.layer.4.attention.v_lin",
    "distilbert.transformer.layer.4.attention.out_lin",
    "distilbert.transformer.layer.4.ffn.lin1",
    "distilbert.transformer.layer.4.ffn.lin2",

    "distilbert.transformer.layer.5.attention.q_lin",
    "distilbert.transformer.layer.5.attention.k_lin",
    "distilbert.transformer.layer.5.attention.v_lin",
    "distilbert.transformer.layer.5.attention.out_lin",
    "distilbert.transformer.layer.5.ffn.lin1",
    "distilbert.transformer.layer.5.ffn.lin2"
]



lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type="SEQ_CLS",
    target_modules=target_modules
)


In [11]:
model = get_peft_model(model, lora_config)

In [12]:
def tokenize_data(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=512)

In [13]:
# Apply tokenization
tokenized_data = df['review'].apply(tokenize_data)
df['input_ids'] = tokenized_data.apply(lambda x: x['input_ids'])
df['attention_mask'] = tokenized_data.apply(lambda x: x['attention_mask'])

In [14]:
class IMDbDataset(Dataset):
    def __init__(self, df):
        self.input_ids = torch.tensor(df['input_ids'].tolist())
        self.attention_mask = torch.tensor(df['attention_mask'].tolist())
        self.labels = torch.tensor(df['label'].tolist())

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

In [15]:
# Splitting the dataset into train and validation sets
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [16]:
# Create dataset objects for training and validation
train_dataset = IMDbDataset(train_df)
val_dataset = IMDbDataset(val_df)

In [24]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred

    # Ensure logits and labels are tensors
    if isinstance(logits, np.ndarray):
        logits = torch.tensor(logits)
    if isinstance(labels, np.ndarray):
        labels = torch.tensor(labels)

    # Apply torch.argmax to get the predicted class
    predictions = torch.argmax(logits, dim=-1)

    # Move tensors back to NumPy for accuracy computation
    predictions = predictions.detach().cpu().numpy()
    labels = labels.detach().cpu().numpy()

    # Compute accuracy
    accuracy = accuracy_score(labels, predictions)

    return {"accuracy": accuracy}

In [28]:
from transformers import TrainingArguments

# Define training arguments with reduced values
training_args = TrainingArguments(
    output_dir='./results',  # Output directory
    num_train_epochs=1,  # Total number of training epochs
    per_device_train_batch_size=8,  # Reduced batch size for training
    per_device_eval_batch_size=8,  # Reduced batch size for evaluation
    warmup_steps=500,  # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # Strength of weight decay
    logging_dir='./logs',  # Directory for storing logs
    evaluation_strategy="no",  # Evaluate every epoch
    gradient_accumulation_steps=2,  # Gradient accumulation
)




In [29]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,  # Add the metrics function here
)


In [30]:
# Start training
trainer.train()

Step,Training Loss


TrainOutput(global_step=2, training_loss=0.6167510747909546, metrics={'train_runtime': 69.4528, 'train_samples_per_second': 0.475, 'train_steps_per_second': 0.029, 'total_flos': 4362244128768.0, 'train_loss': 0.6167510747909546, 'epoch': 0.8})

In [31]:
# Save the model
trainer.save_model("./trained_model")

In [34]:
# Prediction function
def predict(texts):
    # Tokenize the input texts
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")

    # Ensure the model is in evaluation mode
    model.eval()

    # Move tensors to the same device as the model
    inputs = {key: val.to(model.device) for key, val in encodings.items()}

    # Run the model on the input data
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted probabilities (logits) and convert them to probabilities
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=-1)

    # Get predicted classes (0 for negative, 1 for positive)
    predictions = torch.argmax(probabilities, dim=1).cpu().numpy()

    return predictions


In [36]:
# Testing the prediction on the test dataset
sample_reviews = test_texts[:5]  # Use a small sample of test data for prediction
predictions = predict(sample_reviews)

In [37]:
# Print predictions
for review, prediction in zip(sample_reviews, predictions):
    sentiment = "positive" if prediction == 1 else "negative"
    print(f"Review: {review}\nPredicted Sentiment: {sentiment}\n")

Review:  and find intriguing points of interest in the films of all genres from the Thirties and Forties."
Predicted Sentiment: negative

Review:  everyone was!! all this chatting and now i feel like watching it! i think i will"
Predicted Sentiment: negative

Review:  I will give the grade of what I thought when I first saw it.<br /><br />8/10"
Predicted Sentiment: negative

Review:  & the title song)"
Predicted Sentiment: negative

Review:  low-everything. The very final scene-and I mean about the final 10 seconds of the film-is the ONLY mildly creative or interesting moment.<br /><br />I paid $3.45 to rent this. I could have better spent it on a hamburger!"
Predicted Sentiment: negative

