In [1]:
import pandas as pd
import torch
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.cuda.amp import GradScaler, autocast
from tqdm.auto import tqdm

# Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("meta-llama/Llama-2-7b-hf", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# Check if the tokenizer has a padding token, if not, set one
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Load the dataset
data = pd.read_csv("../Dataset_with_Features/dataset_420464.csv")

# Separate features
text_data = data["url"]
numerical_data = data.drop(columns=["url", "label"])

# Preprocess numerical data using StandardScaler
scaler = StandardScaler()
X_num = scaler.fit_transform(numerical_data)

# Tokenize text data
max_length = 128
tokenized_data = tokenizer(text_data.tolist(), padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")
X_text = tokenized_data["input_ids"]

# Prepare labels
y = data["label"].apply(lambda x: 1 if x == "bad" else 0)

# Split dataset
X_train_text, X_test_text, X_train_num, X_test_num, y_train, y_test = train_test_split(X_text, X_num, y, test_size=0.2, random_state=42)

# Create TensorDatasets
train_dataset = TensorDataset(torch.tensor(X_train_text), torch.tensor(X_train_num, dtype=torch.float), torch.tensor(y_train.values, dtype=torch.long))
test_dataset = TensorDataset(torch.tensor(X_test_text), torch.tensor(X_test_num, dtype=torch.float), torch.tensor(y_test.values, dtype=torch.long))

# DataLoader setup
batch_size = 16  # Reduced batch size
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Training settings
device = torch.device("cpu")
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
epochs = 6
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Initialize the gradient scaler for mixed precision
scaler = GradScaler()

# Gradient Accumulation Settings
accumulation_steps = 4

# Training loop with mixed precision and gradient accumulation
for epoch in range(epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    for step, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}")):
        batch = [b.to(device) for b in batch]
        texts, nums, labels = batch

        with autocast():
            outputs = model(texts, labels=labels)
            loss = outputs.loss / accumulation_steps

        scaler.scale(loss).backward()

        if (step + 1) % accumulation_steps == 0 or (step + 1) == len(train_loader):
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        total_loss += loss.item()

    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {average_loss}")

    # Evaluation
    model.eval()
    total_eval_accuracy = 0
    for batch in test_loader:
        batch = [b.to(device) for b in batch]
        texts, nums, labels = batch

        with torch.no_grad(), autocast():
            outputs = model(texts, labels=labels)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1).flatten()
        accuracy = (predictions == labels).cpu().numpy().mean()
        total_eval_accuracy += accuracy

    average_accuracy = total_eval_accuracy / len(test_loader)
    print(f"Accuracy: {average_accuracy}")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  train_dataset = TensorDataset(torch.tensor(X_train_text), torch.tensor(X_train_num, dtype=torch.float), torch.tensor(y_train.values, dtype=torch.long))
  test_dataset = TensorDataset(torch.tensor(X_test_text), torch.tensor(X_test_num, dtype=torch.float), torch.tensor(y_test.values, dtype=torch.long))


Epoch 1/6:   0%|          | 0/21024 [00:00<?, ?it/s]

AssertionError: 