<a href="https://colab.research.google.com/github/Nikitha-Pillai/bert-ai-/blob/main/bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =============================================
# 🚀 BERT Model - Binary Text Classification
# =============================================

!pip install -q transformers==4.57.0 datasets pandas openpyxl torch scikit-learn sentence-transformers

# ===================== Imports =====================
import os
import re
import numpy as np
import pandas as pd
from random import random
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sentence_transformers import SentenceTransformer, util
import torch
from google.colab import drive

# ===================== Mount Google Drive =====================
drive.mount('/content/drive')

# ===================== Load Excel File =====================
file_path = "/content/drive/MyDrive/data2/combineddataset.xlsx"

if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found at {file_path}")
else:
    print(f"✅ File found: {file_path}")

df = pd.read_excel(file_path)

# ===================== Preprocess Text =====================
def preprocess_text(text):
    text = re.sub(r'[^\x00-\x7F]+', ' ', str(text))  # remove non-ASCII
    text = re.sub(r'\s+', ' ', text).strip()         # remove extra spaces
    return ' '.join(text.split()[:500])              # limit to 500 words

df['subject'] = df['subject'].astype(str).apply(preprocess_text)
df['body'] = df['body'].astype(str).apply(preprocess_text)
df['text'] = (df['subject'] + " " + df['body']).str.strip()

df = df.dropna(subset=['text', 'label'])
df = df[df['text'].str.strip() != '']
df = df[~df['text'].str.contains('#ERROR!', na=False)]
df = df.drop_duplicates(subset=['text'])

# Optional: Add small label noise (5%)
def add_label_noise(label, noise_rate=0.05):
    if random() < noise_rate:
        return 1 - label
    return label

df['label'] = df['label'].apply(lambda x: add_label_noise(int(x)))

# ===================== Remove Near-Duplicates =====================
model_embeddings = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model_embeddings.encode(df['text'].tolist(), convert_to_tensor=True, show_progress_bar=True)
cosine_scores = util.cos_sim(embeddings, embeddings)

threshold = 0.9
to_drop = set()
for i in range(len(df)):
    if i in to_drop:
        continue
    similar = torch.where(cosine_scores[i] > threshold)[0].tolist()
    for j in similar:
        if i != j:
            to_drop.add(j)
df = df.drop(df.index[list(to_drop)]).reset_index(drop=True)
print(f"✅ Remaining samples after duplicate removal: {len(df)}")

# ===================== Tokenization =====================
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)

# ===================== Metrics =====================
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}

# ===================== 5-Fold Cross-Validation =====================
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

num_epochs = 3  # Number of epochs per fold

for fold, (train_idx, val_idx) in enumerate(kfold.split(df)):
    print(f"\n📘 Fold {fold+1}/5")

    train_data = df.iloc[train_idx]
    val_data = df.iloc[val_idx]

    train_dataset = Dataset.from_pandas(train_data[['text', 'label']])
    val_dataset = Dataset.from_pandas(val_data[['text', 'label']])

    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)

    train_dataset = train_dataset.remove_columns(['text'])
    val_dataset = val_dataset.remove_columns(['text'])
    train_dataset.set_format('torch')
    val_dataset.set_format('torch')

    # Initialize BERT
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    # Freeze first 8 layers to speed up training
    for name, param in model.bert.encoder.layer[:8].named_parameters():
        param.requires_grad = False

    training_args = TrainingArguments(
        output_dir=f'/content/drive/MyDrive/data2/bert_fold_{fold+1}',
        num_train_epochs=1,  # Train one epoch at a time manually
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.1,
        logging_dir='./logs',
        logging_steps=100,
        save_strategy="epoch",
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    # Train manually for each epoch to print metrics
    for epoch in range(num_epochs):
        print(f"\n➡️ Fold {fold+1} - Epoch {epoch+1}/{num_epochs}")
        trainer.train(resume_from_checkpoint=False)
        eval_metrics = trainer.evaluate()
        print(f"Metrics after Epoch {epoch+1}: "
              f"Accuracy={eval_metrics['eval_accuracy']:.4f}, "
              f"Precision={eval_metrics['eval_precision']:.4f}, "
              f"Recall={eval_metrics['eval_recall']:.4f}, "
              f"F1={eval_metrics['eval_f1']:.4f}")

        # Save last epoch metrics for summary
        if epoch == num_epochs - 1:
            accuracy_list.append(eval_metrics['eval_accuracy'])
            precision_list.append(eval_metrics['eval_precision'])
            recall_list.append(eval_metrics['eval_recall'])
            f1_list.append(eval_metrics['eval_f1'])

print("\n✅ Cross-Validation Results (last epoch of each fold):")
print(f"Average Accuracy:  {np.mean(accuracy_list):.4f}")
print(f"Average Precision: {np.mean(precision_list):.4f}")
print(f"Average Recall:    {np.mean(recall_list):.4f}")
print(f"Average F1:        {np.mean(f1_list):.4f}")

# ===================== Final Model Training =====================
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_df[['text', 'label']])
val_dataset = Dataset.from_pandas(val_df[['text', 'label']])
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
train_dataset = train_dataset.remove_columns(['text'])
val_dataset = val_dataset.remove_columns(['text'])
train_dataset.set_format('torch')
val_dataset.set_format('torch')

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
for name, param in model.bert.encoder.layer[:8].named_parameters():
    param.requires_grad = False

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/data2/bert_final_model',
    num_train_epochs=1,  # manual loop
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.1,
    logging_dir='./logs',
    save_strategy="epoch",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

print("\n📘 Final Model Training")
num_epochs_final = 3
for epoch in range(num_epochs_final):
    print(f"\n➡️ Final Model - Epoch {epoch+1}/{num_epochs_final}")
    trainer.train(resume_from_checkpoint=False)
    eval_metrics = trainer.evaluate()
    print(f"Metrics after Epoch {epoch+1}: "
          f"Accuracy={eval_metrics['eval_accuracy']:.4f}, "
          f"Precision={eval_metrics['eval_precision']:.4f}, "
          f"Recall={eval_metrics['eval_recall']:.4f}, "
          f"F1={eval_metrics['eval_f1']:.4f}")

# ===================== Save Model =====================
save_path = '/content/drive/MyDrive/data2/bert_model'
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"✅ Model saved to {save_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ File found: /content/drive/MyDrive/data2/combineddataset.xlsx


Batches:   0%|          | 0/122 [00:00<?, ?it/s]

✅ Remaining samples after duplicate removal: 3431

📘 Fold 1/5


Map:   0%|          | 0/2744 [00:00<?, ? examples/s]

Map:   0%|          | 0/687 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



➡️ Fold 1 - Epoch 1/3


Step,Training Loss
100,0.5268
200,0.2005
300,0.2061


Metrics after Epoch 1: Accuracy=0.9461, Precision=0.9398, Recall=0.9483, F1=0.9440

➡️ Fold 1 - Epoch 2/3


Step,Training Loss
100,0.2497
200,0.1896
300,0.2009


Metrics after Epoch 2: Accuracy=0.9461, Precision=0.9398, Recall=0.9483, F1=0.9440

➡️ Fold 1 - Epoch 3/3


Step,Training Loss
100,0.2376
200,0.172
300,0.1954


Metrics after Epoch 3: Accuracy=0.9461, Precision=0.9424, Recall=0.9453, F1=0.9439

📘 Fold 2/5


Map:   0%|          | 0/2745 [00:00<?, ? examples/s]

Map:   0%|          | 0/686 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



➡️ Fold 2 - Epoch 1/3


Step,Training Loss
100,0.5125
200,0.1851
300,0.2362


Metrics after Epoch 1: Accuracy=0.9417, Precision=0.9383, Recall=0.9383, F1=0.9383

➡️ Fold 2 - Epoch 2/3


Step,Training Loss
100,0.1952
200,0.1777
300,0.2139


Metrics after Epoch 2: Accuracy=0.9431, Precision=0.9412, Recall=0.9383, F1=0.9397

➡️ Fold 2 - Epoch 3/3


Step,Training Loss
100,0.1921
200,0.1641
300,0.2052


Metrics after Epoch 3: Accuracy=0.9431, Precision=0.9412, Recall=0.9383, F1=0.9397

📘 Fold 3/5


Map:   0%|          | 0/2745 [00:00<?, ? examples/s]

Map:   0%|          | 0/686 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



➡️ Fold 3 - Epoch 1/3


Step,Training Loss
100,0.5083
200,0.1842
300,0.2582


Metrics after Epoch 1: Accuracy=0.9577, Precision=0.9700, Recall=0.9357, F1=0.9525

➡️ Fold 3 - Epoch 2/3


Step,Training Loss
100,0.2203
200,0.1817
300,0.2297


Metrics after Epoch 2: Accuracy=0.9577, Precision=0.9700, Recall=0.9357, F1=0.9525

➡️ Fold 3 - Epoch 3/3


Step,Training Loss
100,0.2186
200,0.1766
300,0.2153


Metrics after Epoch 3: Accuracy=0.9592, Precision=0.9670, Recall=0.9421, F1=0.9544

📘 Fold 4/5


Map:   0%|          | 0/2745 [00:00<?, ? examples/s]

Map:   0%|          | 0/686 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



➡️ Fold 4 - Epoch 1/3


Step,Training Loss
100,0.5245
200,0.1937
300,0.2674


Metrics after Epoch 1: Accuracy=0.9577, Precision=0.9513, Recall=0.9544, F1=0.9528

➡️ Fold 4 - Epoch 2/3


Step,Training Loss
100,0.2046
200,0.1921
300,0.253


Metrics after Epoch 2: Accuracy=0.9577, Precision=0.9513, Recall=0.9544, F1=0.9528

➡️ Fold 4 - Epoch 3/3


Step,Training Loss
100,0.2
200,0.1815
300,0.2233


Metrics after Epoch 3: Accuracy=0.9563, Precision=0.9511, Recall=0.9511, F1=0.9511

📘 Fold 5/5


Map:   0%|          | 0/2745 [00:00<?, ? examples/s]

Map:   0%|          | 0/686 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



➡️ Fold 5 - Epoch 1/3


Step,Training Loss
100,0.5205
200,0.1864
300,0.2168


Metrics after Epoch 1: Accuracy=0.9548, Precision=0.9673, Recall=0.9338, F1=0.9502

➡️ Fold 5 - Epoch 2/3


Step,Training Loss
100,0.1989
200,0.1671
300,0.2085


Metrics after Epoch 2: Accuracy=0.9519, Precision=0.9610, Recall=0.9338, F1=0.9472

➡️ Fold 5 - Epoch 3/3


Step,Training Loss
100,0.1993
200,0.163
300,0.1891


Metrics after Epoch 3: Accuracy=0.9504, Precision=0.9609, Recall=0.9306, F1=0.9455

✅ Cross-Validation Results (last epoch of each fold):
Average Accuracy:  0.9510
Average Precision: 0.9525
Average Recall:    0.9415
Average F1:        0.9469


Map:   0%|          | 0/2744 [00:00<?, ? examples/s]

Map:   0%|          | 0/687 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



📘 Final Model Training

➡️ Final Model - Epoch 1/3


Step,Training Loss


Metrics after Epoch 1: Accuracy=0.9461, Precision=0.9371, Recall=0.9514, F1=0.9442

➡️ Final Model - Epoch 2/3


Step,Training Loss


Metrics after Epoch 2: Accuracy=0.9447, Precision=0.9422, Recall=0.9422, F1=0.9422

➡️ Final Model - Epoch 3/3


Step,Training Loss


Metrics after Epoch 3: Accuracy=0.9461, Precision=0.9371, Recall=0.9514, F1=0.9442
✅ Model saved to /content/drive/MyDrive/data2/bert_model


In [None]:
# ==============================
#  Install required libraries
# ==============================
!pip install transformers torch

# ==============================
#  Import libraries
# ==============================
import re
import os
os.environ["WANDB_DISABLED"] = "true"  # Disable wandb
from google.colab import drive
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# ==============================
#  Mount Google Drive
# ==============================
drive.mount('/content/drive')

# ==============================
#  Verify model folder
# ==============================
!ls "/content/drive/MyDrive/data2/bert_model/"

# ==============================
#  Load the trained BERT model and tokenizer
# ==============================
model_path = '/content/drive/MyDrive/data2/bert_model'
try:
    model = BertForSequenceClassification.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    print("✅ BERT model and tokenizer loaded successfully!")
except Exception as e:
    print(f"❌ Error: Failed to load model or tokenizer from {model_path}. Please check the path.")
    print("To locate the model folder, run: !find '/content/drive/MyDrive/' -name 'bert_model'")
    raise e

# ==============================
#  Text preprocessing (same as training)
# ==============================
def preprocess_text(text):
    if isinstance(text, str):
        # Remove non-ASCII characters
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)
        # Collapse multiple spaces
        text = re.sub(r'\s+', ' ', text).strip()
        # Limit to first 200 words
        words = text.split()[:200]
        return ' '.join(words)
    return ''

# ==============================
#  Prediction function
# ==============================
def predict_email(text, subject=""):
    # Combine subject + body
    full_text = preprocess_text(subject + ' ' + text)

    # Tokenize
    inputs = tokenizer(
        full_text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512
    )

    # Move model and tensors to GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Predict
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1).cpu().numpy()[0]
    prediction = torch.argmax(logits, dim=1).cpu().numpy()[0]

    label = "Phishing" if prediction == 1 else "Legitimate"
    confidence = probabilities[prediction]

    return label, confidence, probabilities

# ==============================
#  Manual testing loop
# ==============================
print("\n📧 Manual Email Testing (press Enter on body to stop):")
while True:
    subject = input("Enter email subject (or leave blank): ")
    body = input("Enter email body: ")
    if not body.strip():
        print("🛑 No email body provided. Stopping manual testing.")
        break

    label, confidence, probabilities = predict_email(body, subject)

    print("\n===============================")
    print(f"Subject: {subject[:60] if subject else '(no subject)'}")
    print(f"Body: {body[:60]}...")
    print(f"Prediction: {label}")
    print(f"Confidence: {confidence:.4f}")
    print(f"Probabilities → Phishing: {probabilities[1]:.4f}, Legitimate: {probabilities[0]:.4f}")
    print("===============================")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
config.json	   special_tokens_map.json  vocab.txt
model.safetensors  tokenizer_config.json
✅ BERT model and tokenizer loaded successfully!

📧 Manual Email Testing (press Enter on body to stop):

Subject: Dear Students,
Body: The lecture videos for Week 12 have been uploaded for the co...
Prediction: Legitimate
Confidence: 0.8958
Probabilities → Phishing: 0.1042, Legitimate: 0.8958

Subject: Dear, nandanaanandp@gmail.com!
Body: Dear, nandanaanandp@gmail.com!  We are pleased to inform you...
Prediction: Phishing
Confidence: 0.9908
Probabilities → Phishing: 0.9908, Legitimate: 0.0092

Subject: PayPal account unusual sign-in activity
Body: PayPal account Unusual sign-in activity We detected somethin...
Prediction: Phishing
Confidence: 0.9956
Probabilities → Phishing: 0.9956, Legitimate: 0.0044

Subject: job posting - apple-iss research center
Body: FAX: +27 837 6

In [18]:
import nbformat

path = "/content/drive/MyDrive/Colab Notebooks/bert.ipynb"  # 👈 change if needed

nb = nbformat.read(path, as_version=nbformat.NO_CONVERT)
if 'widgets' in nb['metadata']:
    del nb['metadata']['widgets']
nbformat.write(nb, path)

print("✅ Cleaned metadata in", path)


✅ Cleaned metadata in /content/drive/MyDrive/Colab Notebooks/bert.ipynb


In [None]:
!ls /content/drive/MyDrive/


'Colab Notebooks'				     'model (1).safetensors'
 data2						      Models
 Datasets					      model.safetensors
 events.out.tfevents.1759671799.244238003db4.1231.3
