In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence


In [16]:
# Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("meta-llama/Llama-2-7b-hf", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
#Preprocessing step
# Load the dataset
data = pd.read_csv('../Dataset_with_Features/dataset_420464.csv')
# Assuming all features except 'url' and 'label' are numerical
numerical_cols = [col for col in data.columns if col not in ['url', 'label']]
# Preprocessing for numerical data
scaler = StandardScaler()

# Separate the dataset into features and labels
X = data[numerical_cols]
y = data['label'].apply(lambda x: 1 if x == 'bad' else 0)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply scaling
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
train_dataset = TensorDataset(torch.tensor(X_train_scaled, dtype=torch.float), torch.tensor(y_train.values, dtype=torch.long))
test_dataset = TensorDataset(torch.tensor(X_test_scaled, dtype=torch.float), torch.tensor(y_test.values, dtype=torch.long))


In [18]:
def collate_batch(batch):
    inputs, labels = zip(*batch)
    inputs = [tokenizer.encode(text, add_special_tokens=True) for text in inputs]
    inputs = pad_sequence([torch.tensor(seq) for seq in inputs], batch_first=True, padding_value=tokenizer.pad_token_id)
    labels = torch.tensor(labels)
    return inputs, labels

In [19]:
# Reduce batch size
batch_size = 32  
epochs = 3

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [20]:
# Training settings
device = torch.device("cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader)*epochs)



In [21]:
# Training loop
# Ensure the correct retrieval of the model's embedding size
model_embedding_size = model.get_input_embeddings().num_embeddings

for epoch in range(epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch + 1}/{epochs}")

    for step, (batch_data, labels) in progress_bar:
        inputs = {k: v.to(device) for k, v in batch_data.items()}  # Move input data to the device
        labels = labels.to(device)

        model.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        progress_bar.set_postfix({'loss': total_loss / (step + 1)})

    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} | Loss: {average_loss}")


Epoch 1/3:   0%|          | 0/10512 [00:00<?, ?it/s]


TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]