In [None]:
!pip install transformers

In [None]:
import pandas as pd
import torch
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset

# Load the training dataset


In [None]:
import pandas as pd
import torch
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Load the training and validation splits. Both CSV files are read for a
# 'Text_data' column (input text) and a 'Label' column (target class).
train_dataset = pd.read_csv('train.csv')  # Replace 'train.csv' with the actual file name
val_dataset = pd.read_csv('dev_data.csv')  # Replace 'dev_data.csv' with the actual file name

# Separate the text and label columns.
# pandas reads an empty text cell as NaN (a float), which the tokenizer cannot
# encode. Replace missing texts with '' and coerce everything to str, in line
# with how the test split is sanitised later in this notebook.
train_texts = train_dataset['Text_data'].fillna('').astype(str).tolist()
train_labels = train_dataset['Label'].tolist()

val_texts = val_dataset['Text_data'].fillna('').astype(str).tolist()
val_labels = val_dataset['Label'].tolist()

# Encode the labels as consecutive integers. The encoder is fitted on the
# training labels only; `transform` raises ValueError if the validation split
# contains a label that never occurs in the training split.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
val_labels = label_encoder.transform(val_labels)

# Number of distinct classes seen in the training data
num_classes = len(label_encoder.classes_)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def _encode_texts(texts):
    """Tokenize texts into fixed-length BERT inputs.

    Each text is encoded on its own, truncated or padded to exactly 512
    tokens, and the per-text results are stacked along dimension 0.

    Args:
        texts: list of str to encode.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of tensors with one row per
        entry of ``texts``.
    """
    # Calling the tokenizer directly replaces the deprecated `encode_plus`.
    encoded = [
        tokenizer(
            text,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        for text in texts
    ]
    input_ids = torch.cat([item['input_ids'] for item in encoded], dim=0)
    attention_masks = torch.cat([item['attention_mask'] for item in encoded], dim=0)
    return input_ids, attention_masks


# Tokenize both splits with the same settings
train_input_ids, train_attention_masks = _encode_texts(train_texts)
val_input_ids, val_attention_masks = _encode_texts(val_texts)

# Convert the encoded labels to tensors
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

# Batch size used by the DataLoaders of this notebook
batch_size = 16

# Bundle (input ids, attention mask, label) per example for each split
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)

# Only the training split is shuffled; validation keeps its original order
train_dataloader = DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=batch_size,
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    shuffle=False,
    batch_size=batch_size,
)

# Fix the RNG seed before anything stochastic happens: the classification head
# is newly initialised, the training DataLoader shuffles, and dropout is
# active during training. Without a seed, two runs give different results.
SEED = 42
torch.manual_seed(SEED)

# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)

# Set the device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set up the optimizer.
# `transformers.AdamW` is deprecated in favour of `torch.optim.AdamW`.
# eps=1e-6 and weight_decay=0.0 are the defaults of the deprecated class,
# passed explicitly so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Training loop
num_epochs = 1

# Report every class known to the encoder, even if it is absent from the
# validation split and from the predictions. Passing `target_names` without
# `labels` raises when the number of classes present differs from the number
# of names. Names are cast to str because the report formats them as text.
report_labels = list(range(num_classes))
report_names = [str(name) for name in label_encoder.classes_]

for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    total_loss = 0

    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False)

    for batch in progress_bar:
        batch_input_ids = batch[0].to(device)
        batch_attention_masks = batch[1].to(device)
        batch_labels = batch[2].to(device)

        optimizer.zero_grad()

        # Passing `labels` makes the model compute the loss itself
        outputs = model(batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)

        loss = outputs.loss
        loss_value = loss.item()
        total_loss += loss_value

        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        progress_bar.set_postfix({'loss': loss_value})

    # Mean loss over the batches of this epoch (previously computed but never shown)
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch+1}/{num_epochs} - average training loss: {avg_train_loss:.4f}')

    # ---- Validation pass ----
    model.eval()
    val_predictions = []
    val_true_labels = []

    with torch.no_grad():
        for batch in val_dataloader:
            batch_input_ids = batch[0].to(device)
            batch_attention_masks = batch[1].to(device)

            outputs = model(batch_input_ids, attention_mask=batch_attention_masks)

            # Softmax is monotonic, so the arg-max of the logits is the
            # predicted class; no need to compute probabilities first.
            predicted_labels = outputs.logits.argmax(dim=1)

            val_predictions.extend(predicted_labels.cpu().tolist())
            # Labels are only compared on the host, so they never need to
            # be moved to the device.
            val_true_labels.extend(batch[2].tolist())

    # Generate classification report
    report = classification_report(
        val_true_labels,
        val_predictions,
        labels=report_labels,
        target_names=report_names,
    )
    print('Classification Report:\n', report)
    print('-----------------------------')

# Save the fine-tuned weights (state dict only, not the whole model object)
torch.save(model.state_dict(), 'bert_model.pth')

# Rebuild the architecture and restore the saved weights for prediction.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)
# `map_location` remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine can also be restored on a CPU-only one.
model.load_state_dict(torch.load('bert_model.pth', map_location=device))
model.to(device)

# Load the test dataset
test_dataset = pd.read_csv('test_data.csv')  # Replace 'test_data.csv' with the actual file name

# Extract the text column. Missing texts become '' and every entry is coerced
# to str, because the tokenizer cannot encode NaN or other non-string cells.
test_texts = test_dataset['Text_data'].fillna('').astype(str).tolist()

# Tokenize the texts in the test dataset
test_input_ids = []
test_attention_masks = []

for text in test_texts:
    # Calling the tokenizer directly replaces the deprecated `encode_plus`.
    # Each text is truncated or padded to exactly 512 tokens.
    encoded_dict = tokenizer(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    test_input_ids.append(encoded_dict['input_ids'])
    test_attention_masks.append(encoded_dict['attention_mask'])

# Stack the per-text tensors into one tensor per feature
test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)

# Batch the encoded test inputs (ids and masks only, no labels) in their
# original order so predictions line up with the rows of `test_dataset`
test_dataloader = DataLoader(
    TensorDataset(test_input_ids, test_attention_masks),
    batch_size=batch_size,
    shuffle=False,
)

# Inference only: switch the model to evaluation mode
model.eval()

# Collected class indices, one per test row
test_predictions = []

with torch.no_grad():
    for batch in test_dataloader:
        batch_input_ids, batch_attention_masks = (tensor.to(device) for tensor in batch)

        outputs = model(batch_input_ids, attention_mask=batch_attention_masks)

        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        # Index 1 of the (values, indices) pair is the most probable class
        predicted_labels = torch.max(probabilities, dim=1)[1]

        test_predictions.extend(predicted_labels.cpu().tolist())

# Map the integer class indices back to the original label values
test_predicted_labels = label_encoder.inverse_transform(test_predictions)

# Attach the decoded predictions to the test DataFrame as a new column
test_dataset['predicted_label'] = test_predicted_labels



In [None]:
# Preview the first rows instead of dumping the whole DataFrame
test_dataset.head()

In [None]:


# Write the test DataFrame (including any prediction column added to it) to
# CSV; the row index is not written
predictions_csv = 'test_predictions.csv'
test_dataset.to_csv(predictions_csv, index=False)


In [None]:
# Preview the first rows instead of dumping the whole DataFrame
test_dataset.head()

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

# Read the predicted CSV file
predicted_df = pd.read_csv('test_predictions.csv')

# Read the actual labels CSV file
actual_df = pd.read_csv('actual_test_data.csv')


def _keyed_by_text(frame, value_column):
    """Reduce `frame` to a merge key plus one value column.

    The key is the text together with an occurrence counter (0 for the first
    time a text appears, 1 for the second, ...). Merging on both pairs up
    repeated texts one-to-one instead of producing every combination of the
    duplicates, which would inflate the row count and skew the metrics.

    Args:
        frame: DataFrame with a 'Text_data' column and `value_column`.
        value_column: name of the column to keep next to the key.

    Returns:
        DataFrame with columns 'Text_data', 'occurrence' and `value_column`.
    """
    # An empty text is read back from CSV as NaN; normalise it to ''
    text = frame['Text_data'].fillna('')
    return pd.DataFrame({
        'Text_data': text,
        'occurrence': text.groupby(text).cumcount(),
        value_column: frame[value_column],
    })


# Merge the two dataframes on the text column. Only the needed column is taken
# from each side, so a 'Label' column present in both files is not renamed
# with merge suffixes.
merged_df = pd.merge(
    _keyed_by_text(predicted_df, 'predicted_label'),
    _keyed_by_text(actual_df, 'Label'),
    on=['Text_data', 'occurrence'],
)

if merged_df.empty:
    raise ValueError(
        "No rows of 'test_predictions.csv' matched 'actual_test_data.csv' on 'Text_data'"
    )

# Extract the predicted and actual labels
predicted_labels = merged_df['predicted_label']
actual_labels = merged_df['Label']

# Calculate accuracy
accuracy = accuracy_score(actual_labels, predicted_labels)
print('Accuracy:', accuracy)

# Generate classification report
report = classification_report(actual_labels, predicted_labels)
print('Classification Report:')
print(report)
