In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
import torch

# Load your dataset
df = pd.read_excel('NLP.xlsx')  # Replace 'NLP.xlsx' with the path to your Excel dataset

# Drop unnecessary columns
df.drop(df.columns[df.columns.str.startswith('Unnamed:')], axis=1, inplace=True)

# Drop rows with missing values in description column
df.dropna(subset=['description'], inplace=True)

# Convert non-string values to strings in the 'description' column
df['description'] = df['description'].astype(str)

# Columns used as classification targets
label_columns = [
    'inclusion_criteria', 'incident_type', 'receiver_name', 'receiver_country',
    'receiver_category', 'initiator_name', 'initiator_category', 'attributing_actor',
    'attributed_initiator', 'zero_days', 'MITRE_initial_access', 'MITRE_impact',
    'user_interaction', 'has_disruption', 'data_theft', 'disruption', 'hijacking',
    'target_multiplier', 'impact_indicator',
]

# Coerce every label column to numeric. `apply` returns a new DataFrame, so no
# values are written into a slice of `df` (avoids SettingWithCopyWarning).
# Anything that cannot be parsed as a number becomes NaN.
y = df[label_columns].apply(pd.to_numeric, errors='coerce')

# Print original length of y
print("Original length of y:", len(y))

# Drop rows with missing values in labels
y = y.dropna()

# Print final length of y after dropping missing values
print("Final length of y after preprocessing:", len(y))

if y.empty:
    raise ValueError(
        "No rows left after coercing the label columns to numeric; "
        "check that the label columns contain numeric codes."
    )

# Keep only the descriptions whose label row survived. Selecting by y's index
# BEFORE resetting it is what keeps text and labels paired row-for-row.
X = df.loc[y.index, 'description']

# Reset indices to ensure alignment
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

# Print lengths of X and y after reset indices
print("Length of X:", len(X))
print("Length of y:", len(y))

# Ensure the number of samples in X matches the number of samples in y
if len(X) != len(y):
    raise ValueError("Number of samples in input text and labels do not match!")

# Convert labels to multi-hot encoding
# NOTE(review): MultiLabelBinarizer treats each row of y as one set of labels,
# so the classes are the distinct values found across ALL label columns.
# Confirm this is the intended target encoding.
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(y.values)

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=labels.shape[1],
    # Multi-hot float targets: make the model use a per-class sigmoid/BCE loss.
    problem_type="multi_label_classification",
)

# Tokenize input text
inputs = tokenizer(X.tolist(), padding=True, truncation=True, return_tensors="pt")

# Convert labels to tensors
labels = torch.tensor(labels, dtype=torch.float32)


class EncodedTextDataset(torch.utils.data.Dataset):
    """Expose a subset of the tokenized corpus as per-sample dicts for Trainer.

    Each item holds the tokenizer outputs for one sample plus a 'labels'
    entry, so the Trainer can forward the batch to the model and obtain a loss.
    """

    def __init__(self, encodings, targets, indices):
        self.encodings = encodings
        self.targets = targets
        self.indices = list(indices)

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, position):
        row = self.indices[position]
        item = {key: tensor[row] for key, tensor in self.encodings.items()}
        item['labels'] = self.targets[row]
        return item


# Split data into train and test sets.
# The split is done on row indices: the tokenizer output is a dict-like object
# whose length is its number of keys, so it cannot be split directly.
train_indices, test_indices = train_test_split(
    list(range(len(labels))), test_size=0.2, random_state=42
)
train_inputs = EncodedTextDataset(inputs, labels, train_indices)
test_inputs = EncodedTextDataset(inputs, labels, test_indices)
train_labels = labels[train_indices]
test_labels = labels[test_indices]

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',  # required by older transformers releases
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_inputs,
    eval_dataset=test_inputs,
)

# Train the model
trainer.train()

# Evaluate the model
predictions = trainer.predict(test_inputs)

# The model emits raw logits; squash them to probabilities before thresholding.
probabilities = torch.sigmoid(torch.from_numpy(predictions.predictions))
predicted_labels = (probabilities > 0.5).int().numpy()

# Print classification report (target names must be strings)
print(classification_report(
    test_labels.int().numpy(),
    predicted_labels,
    target_names=[str(cls) for cls in mlb.classes_],
))


In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
import torch
from torch.utils.data import TensorDataset, DataLoader

# Load your dataset
df = pd.read_excel('NLP.xlsx')  # Replace 'NLP.xlsx' with the path to your Excel dataset

# Drop unnecessary columns
df.drop(df.columns[df.columns.str.startswith('Unnamed:')], axis=1, inplace=True)

# Drop rows with missing values in description column
df.dropna(subset=['description'], inplace=True)

# Convert non-string values to strings in the 'description' column
df['description'] = df['description'].astype(str)

# Define input text and labels
X = df['description']

# Cast all label columns to strings in one step. `astype` returns a new
# DataFrame, so nothing is assigned into a slice of `df` (the column-by-column
# assignment triggered SettingWithCopyWarning).
# NOTE(review): missing cells become the literal string 'nan' and therefore
# end up as a label class of their own. Confirm this is intended.
y = df[['inclusion_criteria', 'incident_type', 'receiver_name', 'receiver_country']].astype(str)

# Convert labels to multi-hot encoding.
# Each row's values form one label set, so identical strings coming from
# different columns share a single class.
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(y.values)

# Tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode every description in a single call; padding/truncation yield
# rectangular PyTorch tensors
inputs = tokenizer(X.tolist(), padding=True, truncation=True, return_tensors="pt")

# Sanity check: one encoded row per label row
assert inputs.input_ids.shape[0] == labels.shape[0], "Number of samples in inputs and labels do not match!"

# 80/20 split performed on row positions so the same partition can be applied
# to every tokenizer output and to the label matrix
train_indices, test_indices = train_test_split(range(len(X)), test_size=0.2, random_state=42)

# Slice each tokenizer tensor once per partition
train_inputs, test_inputs = (
    {name: tensor[rows] for name, tensor in inputs.items()}
    for rows in (train_indices, test_indices)
)
train_labels, test_labels = labels[train_indices], labels[test_indices]

# Wrap the tensors as (input_ids, attention_mask, float targets) datasets
train_dataset = TensorDataset(
    train_inputs['input_ids'],
    train_inputs['attention_mask'],
    torch.tensor(train_labels, dtype=torch.float),
)
test_dataset = TensorDataset(
    test_inputs['input_ids'],
    test_inputs['attention_mask'],
    torch.tensor(test_labels, dtype=torch.float),
)

# Shuffle only the training batches; evaluation order stays fixed
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

# Load pre-trained BERT model with one output logit per multi-hot class
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(mlb.classes_),
    # Recorded explicitly because labels are no longer passed to forward(),
    # which is where the model would otherwise infer the problem type.
    problem_type="multi_label_classification",
)

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = torch.nn.BCEWithLogitsLoss()

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(3):  # Adjust number of epochs as needed
    model.train()
    running_loss = 0.0
    for input_ids, attention_mask, targets in train_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()

        # Only the logits are needed: the loss is computed once, by `criterion`.
        # Passing `labels=` here made the model compute a second, unused loss
        # on every step.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean batch loss over the epoch
    print(f"Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}")

# Evaluation
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for input_ids, attention_mask, targets in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Keep per-batch tensors and concatenate once afterwards; building a
        # tensor from a list of numpy arrays is slow and emits a warning.
        predictions.append(torch.sigmoid(outputs.logits).cpu())
        # Targets are only compared on the CPU, so they never need to move.
        true_labels.append(targets)

# Threshold the probabilities and hand scikit-learn plain integer arrays
predictions = (torch.cat(predictions) > 0.5).int().numpy()
true_labels = torch.cat(true_labels).int().numpy()

# Print classification report
print(classification_report(true_labels, predictions, target_names=mlb.classes_))


In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.metrics import classification_report
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Load your dataset
df = pd.read_excel('NLP.xlsx')  # Replace 'NLP.xlsx' with the path to your Excel dataset

# Drop unnecessary columns
df.drop(df.columns[df.columns.str.startswith('Unnamed:')], axis=1, inplace=True)

# Drop rows with missing values in description column
df.dropna(subset=['description'], inplace=True)

# Convert non-string values to strings in the 'description' column
df['description'] = df['description'].astype(str)

# Define input text and labels
X = df['description']

# Cast all label columns to strings in one step. `astype` returns a new
# DataFrame, so nothing is assigned into a slice of `df` (the column-by-column
# assignment triggered SettingWithCopyWarning).
# NOTE(review): missing cells become the literal string 'nan' and therefore
# end up as a label class of their own. Confirm this is intended.
y = df[['inclusion_criteria', 'incident_type', 'receiver_name', 'receiver_country']].astype(str)

# Convert labels to multi-hot encoding.
# Each row's values form one label set, so identical strings coming from
# different columns share a single class.
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(y.values)

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize input text (once, for the whole corpus)
inputs = tokenizer(X.tolist(), padding=True, truncation=True, return_tensors="pt")

# Ensure inputs and labels have the same number of samples
assert inputs.input_ids.shape[0] == labels.shape[0], "Number of samples in inputs and labels do not match!"

# Split data into train and test sets
train_indices, test_indices = train_test_split(range(len(X)), test_size=0.2, random_state=42)

# Split inputs and labels based on indices
train_inputs = {key: value[train_indices] for key, value in inputs.items()}
train_labels = labels[train_indices]
test_inputs = {key: value[test_indices] for key, value in inputs.items()}
test_labels = labels[test_indices]

# Load pre-trained BERT model.
# The output width must equal the number of multi-hot classes, not the number
# of label columns, otherwise logits and targets cannot be compared.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification",
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Create PyTorch DataLoaders.
# Training uses the multi-hot matrix for the training rows; the raw string
# label columns cannot be converted to a float tensor.
batch_size = 4  # Adjust batch_size as needed
dataset = TensorDataset(
    train_inputs['input_ids'],
    train_inputs['attention_mask'],
    torch.tensor(train_labels, dtype=torch.float),
)
sampler = RandomSampler(dataset)
dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)

# Held-out loader for the evaluation loop below
test_dataset = TensorDataset(
    test_inputs['input_ids'],
    test_inputs['attention_mask'],
    torch.tensor(test_labels, dtype=torch.float),
)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = torch.nn.BCEWithLogitsLoss()

# Training loop
model.train()
for epoch in range(3):  # Adjust number of epochs as needed
    for step, (input_ids, attention_mask, targets) in enumerate(dataloader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        # Only logits are requested; the loss is computed once by `criterion`.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, targets)
        loss.backward()
        optimizer.step()

        if step % 100 == 0:
            print(f"Epoch {epoch+1}, Step {step}, Loss: {loss.item()}")

# Evaluation
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for input_ids, attention_mask, targets in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Collect per-batch tensors and concatenate once afterwards
        predictions.append(torch.sigmoid(outputs.logits).cpu())
        true_labels.append(targets)

# Threshold the probabilities and hand scikit-learn plain integer arrays
predictions = (torch.cat(predictions) > 0.5).int().numpy()
true_labels = torch.cat(true_labels).int().numpy()

# Print classification report
print(classification_report(true_labels, predictions, target_names=mlb.classes_))
