In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
from torch.cuda.amp import GradScaler, autocast

In [2]:
# Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(
    "meta-llama/Llama-2-7b-hf", num_labels=2
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Preprocessing step
# Check if the tokenizer has a padding token, if not, set one
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# IMPORTANT: Update the model configuration to recognize the new padding token
model.config.pad_token_id = tokenizer.pad_token_id

# Load the dataset
data = pd.read_csv("../Dataset_with_Features/dataset_10000.csv")

# Separate features
text_data = data["url"]
numerical_data = data.drop(columns=["url", "label"])

# Preprocess numerical data
scaler = StandardScaler()
X_num = scaler.fit_transform(numerical_data)

# Define a max_length for tokenization
max_length = 128  # Example value, you should choose based on your data

# Preprocess text data with a progress bar
print("Tokenizing text data...")
tokenized_texts = []
for text in tqdm(text_data.tolist()):
    tokenized_text = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    tokenized_texts.append(tokenized_text["input_ids"][0])

# Make sure to tokenize with padding
tokenized_data = tokenizer(
    text_data.tolist(),
    padding="max_length",
    truncation=True,
    max_length=128,
    return_tensors="pt",
)  # Adjust max_length as needed
X_text = tokenized_data["input_ids"]

# Labels
y = data["label"].apply(lambda x: 1 if x == "bad" else 0)

# Split dataset
X_train_text, X_test_text, X_train_num, X_test_num, y_train, y_test = train_test_split(
    X_text, X_num, y, test_size=0.2, random_state=42
)

# Create TensorDatasets
train_dataset = TensorDataset(
    X_train_text,
    torch.tensor(X_train_num, dtype=torch.float),
    torch.tensor(y_train.values, dtype=torch.long),
)
test_dataset = TensorDataset(
    X_test_text,
    torch.tensor(X_test_num, dtype=torch.float),
    torch.tensor(y_test.values, dtype=torch.long),
)

Tokenizing text data...


100%|██████████| 10000/10000 [00:01<00:00, 6749.89it/s]


In [4]:
# Define collate function
def collate_batch(batch):
    texts, nums, labels = zip(*batch)
    texts = pad_sequence(texts, batch_first=True, padding_value=tokenizer.pad_token_id)
    nums = torch.stack(nums)
    labels = torch.tensor(labels)
    return texts, nums, labels

In [5]:
# DataLoader and training setup
batch_size = 16
epochs = 6
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch
)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [6]:
# Training settings
device = torch.device("cpu")
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs
)



In [7]:
# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(
        enumerate(train_loader),
        total=len(train_loader),
        desc=f"Epoch {epoch + 1}/{epochs}",
    )

    for step, (texts, nums, labels) in progress_bar:
        # Note: You need to modify this part to handle both text and numerical data
        # Currently, it only feeds the text data to the model.
        # Implementing a custom model that can handle both text and numerical data is required.

        texts = texts.to(device)
        labels = labels.to(device)

        model.zero_grad()
        outputs = model(texts, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        progress_bar.set_postfix({"loss": total_loss / (step + 1)})

    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} | Loss: {average_loss}")




Epoch 1/6:  40%|████      | 202/500 [4:49:18<7:00:40, 84.70s/it, loss=0.495]

In [None]:
# Evaluation
model.eval()
all_predictions, all_true_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        batch = [item.to(device) for item in batch]
        inputs, labels = batch
        outputs = model(inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1).cpu().numpy()
        labels = labels.cpu().numpy()
        all_predictions.extend(predictions)
        all_true_labels.extend(labels)

accuracy = accuracy_score(all_true_labels, all_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(
    all_true_labels, all_predictions, average="binary"
)
metrics_df = pd.DataFrame(
    [[accuracy, precision, recall, f1]],
    columns=["Accuracy", "Precision", "Recall", "F1 Score"],
)
print(metrics_df)

# # Save the model and tokenizer
# model.save_pretrained("path/to/save/model")
# tokenizer.save_pretrained("path/to/save/model")