In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

# Parameters
max_length = 128  # token budget per URL; longer URLs are truncated, shorter ones padded

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# Ensure tokenizer has a padding token (padding='max_length' below requires one)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load dataset: 'url' is the text input, 'label' the target, every other column a numerical feature
data = pd.read_csv('../Dataset_with_Features/dataset_420464.csv')
text_data = data['url']
numerical_data = data.drop(columns=['url', 'label'])
# Binary target: 1 for 'bad', 0 for anything else
y = data['label'].eq('bad').astype('int64')

# Tokenize text data
tokenized_data = tokenizer(
    text_data.tolist(),
    padding='max_length',
    truncation=True,
    max_length=max_length,
    return_tensors="pt",
)
X_text = tokenized_data['input_ids']

# Split dataset BEFORE scaling so that the test rows never influence the scaler statistics
X_train_text, X_test_text, X_train_num, X_test_num, y_train, y_test = train_test_split(
    X_text, numerical_data, y, test_size=0.2, random_state=42
)

# Preprocess numerical data: fit on the training split only, then apply the same
# mean/std to the test split (fitting on the full dataset leaks test information)
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train_num)
X_test_num = scaler.transform(X_test_num)
# Full scaled feature matrix, computed with the train-only statistics
X_num = scaler.transform(numerical_data)

# Create TensorDatasets of (input_ids, numerical features, label)
train_dataset = TensorDataset(
    X_train_text,
    torch.tensor(X_train_num, dtype=torch.float),
    torch.tensor(y_train.values, dtype=torch.long),
)
test_dataset = TensorDataset(
    X_test_text,
    torch.tensor(X_test_num, dtype=torch.float),
    torch.tensor(y_test.values, dtype=torch.long),
)

# DataLoader: only the training data is shuffled
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)


In [2]:
import torch.nn as nn

class CustomModel(nn.Module):
    """Late-fusion classifier combining a transformer text branch with numerical features.

    The text branch is a sequence-classification transformer whose logits
    (width ``num_labels``) are concatenated with a 64-dimensional linear
    projection of the numerical features; a final linear layer maps the
    concatenation to ``num_labels`` output logits.

    Args:
        transformer_model_name: Name or path passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        num_numerical_features: Width of the numerical feature vector.
        num_labels: Number of output classes.
    """

    def __init__(self, transformer_model_name, num_numerical_features, num_labels):
        super().__init__()
        self.transformer = AutoModelForSequenceClassification.from_pretrained(
            transformer_model_name, num_labels=num_labels
        )
        self.numerical_processor = nn.Linear(num_numerical_features, 64)  # Example layer
        # forward() concatenates the transformer's *logits* (num_labels wide), not its
        # hidden state, with the numerical projection, so the fusion layer must be
        # sized to num_labels + 64. Sizing it to hidden_size + 64 makes the first
        # forward pass fail with a matrix-shape mismatch.
        fused_width = num_labels + self.numerical_processor.out_features
        self.classifier = nn.Linear(fused_width, num_labels)

    def forward(self, text, numerical_features, attention_mask=None):
        """Return classification logits of shape ``(batch, num_labels)``.

        Args:
            text: Token id tensor passed to the transformer as its first argument.
            numerical_features: Float tensor of shape ``(batch, num_numerical_features)``.
            attention_mask: Optional mask forwarded to the transformer.
        """
        transformer_output = self.transformer(text, attention_mask=attention_mask).logits
        numerical_output = self.numerical_processor(numerical_features)
        combined = torch.cat((transformer_output, numerical_output), dim=1)
        logits = self.classifier(combined)
        return logits

# Instantiate model: text branch from Llama-2-7b, numerical width taken from the
# training feature matrix, two output classes
model = CustomModel("meta-llama/Llama-2-7b-hf", X_train_num.shape[1], 2)
# The sequence-classification head uses config.pad_token_id to locate the last
# non-padding token of each sequence, and refuses batches larger than 1 when it is
# unset. Keep it in sync with the pad token the tokenizer actually uses.
model.transformer.config.pad_token_id = tokenizer.pad_token_id
device = torch.device("cpu")
model.to(device)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# Training configuration: single source of truth for the epoch count, so the
# scheduler's total step count always matches the loop below
epochs = 3

# Optimizer and scheduler (linear decay over every optimisation step, no warmup)
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * epochs,
)

# The loss module is stateless, so build it once instead of on every step
loss_fn = nn.CrossEntropyLoss()

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch + 1}/{epochs}")

    for step, (texts, nums, labels) in progress_bar:
        # Rebuild the attention mask from the ids: every position holding the pad id is masked out
        attention_masks = (texts != tokenizer.pad_token_id).long()
        texts, attention_masks, nums, labels = texts.to(device), attention_masks.to(device), nums.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(texts, nums, attention_masks)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule is per step, not per epoch

        total_loss += loss.item()
        # Running mean of the loss over the steps seen so far in this epoch
        progress_bar.set_postfix({'loss': total_loss / (step + 1)})

    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} | Loss: {average_loss}")

# Add validation loop if needed
