# URDU_PHISHGUARD_AI_TRAIN

**Developed by Sibghat Ullah**

This notebook prepares and fine-tunes a multilingual BERT model for Urdu phishing message detection. It is Colab-ready and structured for GitHub and NCCS submission.

## 1) Set Up Google Drive & Check GPU

In [None]:
import os

import torch
from google.colab import drive

# Mount Google Drive so the dataset and model artefacts persist across Colab sessions.
drive.mount('/content/drive')

# Root folder on Drive that every later cell builds its paths from.
PROJECT_DIR = '/content/drive/MyDrive/Urdu-PhishGuard-AI'
os.makedirs(PROJECT_DIR, exist_ok=True)

# Probe CUDA once and reuse the answer for both report lines.
has_cuda = torch.cuda.is_available()
device_name = torch.cuda.get_device_name(0) if has_cuda else 'CPU'

print('Torch version:', torch.__version__)
print('CUDA available:', has_cuda)
print('Device:', device_name)

## 2) Install Required Libraries

In [None]:
# Install the Hugging Face libraries used by the cells below via a notebook
# shell escape; `-q` keeps pip's output quiet.
!pip install -q transformers datasets accelerate sentencepiece
# Print the torch version again after the install — presumably to confirm that
# pip did not replace the preinstalled torch build (TODO confirm intent).
import torch
print('Torch version:', torch.__version__)

## 3) Load & Merge Dataset

In [None]:
import os

import pandas as pd

DATA_DIR = os.path.join(PROJECT_DIR, 'data')
os.makedirs(DATA_DIR, exist_ok=True)

base_csv = os.path.join(DATA_DIR, 'phishing_samples_expanded.csv')
extra_csv = os.path.join(DATA_DIR, 'extra_samples.csv')

# Columns every input CSV must provide; later cells read df['text'] and df['label'].
REQUIRED_COLUMNS = ('text', 'label')


def _load_samples(csv_path):
    """Read one sample CSV, or return an empty frame if the file is absent.

    A missing file is tolerated (either input may be omitted) but reported,
    so a mistyped path does not silently shrink the training set.

    Raises:
        ValueError: if the file exists but lacks a required column.
    """
    if not os.path.exists(csv_path):
        print('WARNING: dataset file not found, skipping:', csv_path)
        return pd.DataFrame({'text': [], 'label': []})
    frame = pd.read_csv(csv_path)
    missing = [col for col in REQUIRED_COLUMNS if col not in frame.columns]
    if missing:
        raise ValueError(f'{csv_path} is missing required column(s): {missing}')
    return frame


df_base = _load_samples(base_csv)
df_extra = _load_samples(extra_csv)

# Concatenate only frames that actually hold rows: empty placeholder frames
# add nothing and can disturb column dtypes during concatenation.
frames_with_rows = [frame for frame in (df_base, df_extra) if not frame.empty]
if not frames_with_rows:
    # Fail here with a clear message instead of much later in the train/test split.
    raise FileNotFoundError(
        f'No training samples found. Expected at least one non-empty CSV at '
        f'{base_csv} or {extra_csv}.'
    )

# Shuffle with a fixed seed so the merged file is reproducible between runs.
df = (
    pd.concat(frames_with_rows, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
merged_csv = os.path.join(DATA_DIR, 'merged_training.csv')
df.to_csv(merged_csv, index=False)

print('Dataset merged. Total samples:', len(df))
print(df['label'].value_counts())

## 4) Quick Dataset Preview

In [None]:
# Character count per message (non-string values are stringified first),
# stored on the frame as a new column.
text_as_str = df['text'].astype(str)
df['text_length'] = text_as_str.map(len)

# Summary statistics of message length, then the first rows as the cell output.
print(df['text_length'].describe())
df.head()

## 5) Tokenize and Prepare HuggingFace Dataset

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

MODEL_NAME = 'bert-base-multilingual-cased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

label_map = {'SAFE':0, 'PHISHING':1}

# Normalise label spelling so variants such as 'phishing' or ' Safe ' still map.
normalized_labels = df['label'].astype(str).str.strip().str.upper()
label_ids = normalized_labels.map(label_map)

# A row is usable only if it has text and a recognised label. Unrecognised
# labels must NOT default to SAFE: that would train the model to treat
# possibly-phishing messages as safe.
usable_rows = label_ids.notna() & df['text'].notna()
dropped_count = int((~usable_rows).sum())
if dropped_count:
    unknown_labels = sorted(set(normalized_labels[label_ids.isna()]))
    print(
        f'WARNING: dropping {dropped_count} row(s) with missing text or an '
        f'unrecognised label. Unrecognised label values: {unknown_labels}'
    )

df = df.loc[usable_rows].copy()
df['label_num'] = label_ids[usable_rows].astype(int)
# The tokenizer only accepts strings; cast in case pandas parsed a cell as a number.
df['text'] = df['text'].astype(str)
df = df.reset_index(drop=True)

if df.empty:
    raise ValueError('No usable training rows remain after cleaning text and labels.')

# preserve_index=False keeps the pandas index out of the dataset columns.
hf_frame = df[['text','label_num']].rename(columns={'label_num':'label'})
ds = Dataset.from_pandas(hf_frame, preserve_index=False)
ds = ds.train_test_split(test_size=0.12, seed=42)

def tokenize_fn(examples):
    """Tokenize a batch of messages, truncating/padding each to 128 tokens."""
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

ds = ds.map(tokenize_fn, batched=True)
# The Trainer reads the target from a column named 'labels'.
ds = ds.rename_column('label','labels')
ds.set_format(type='torch', columns=['input_ids','attention_mask','labels'])
print('Tokenization complete.')

## 6) Fine-tune BERT Model

In [None]:
import inspect

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np

# Binary classifier head on top of the pretrained multilingual BERT encoder.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old keyword. The notebook installs an unpinned
# transformers, so pick whichever name the installed version accepts.
_accepted_args = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _accepted_args else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir=os.path.join(PROJECT_DIR, 'model_output'),
    save_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
    # Evaluate once per epoch, matching the checkpoint cadence above.
    **{_eval_strategy_key: 'epoch'},
)

def compute_metrics(eval_pred):
    """Compute evaluation metrics from model logits and reference labels.

    Accuracy alone can look good on an imbalanced dataset while most phishing
    messages are missed, so precision, recall and F1 for the positive class
    (label 1) are reported as well.

    Args:
        eval_pred: a ``(logits, labels)`` pair; logits have one score per
            class on the last axis.

    Returns:
        dict with float values for 'accuracy', 'precision', 'recall', 'f1'.
        A ratio whose denominator is zero is reported as 0.0.
    """
    logits, labels = eval_pred
    # Some models return a tuple of outputs; the class scores come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    true_pos = int(np.sum((preds == 1) & (labels == 1)))
    false_pos = int(np.sum((preds == 1) & (labels != 1)))
    false_neg = int(np.sum((preds != 1) & (labels == 1)))

    precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) else 0.0
    recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

    return {
        'accuracy': float((preds == labels).mean()),
        'precision': float(precision),
        'recall': float(recall),
        'f1': float(f1),
    }

# Name the two splits, then hand the model, arguments, data and metric
# callback to the Trainer.
train_split = ds['train']
eval_split = ds['test']

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; left as the final expression so the notebook shows the result.
trainer.train()

## 7) Export Trained Model

In [None]:
# Destination folder under the project directory for the exported artefacts.
export_dir = os.path.join(PROJECT_DIR, 'model')
os.makedirs(export_dir, exist_ok=True)

# Save the model first, then the tokenizer, into the same folder so it can be
# loaded from a single path.
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to(export_dir)

print('Model exported to:', export_dir)

## 8) Quick Model Test

In [None]:
from transformers import pipeline

# Device index 0 is used when CUDA is available; -1 otherwise.
device_index = 0 if torch.cuda.is_available() else -1
pipe = pipeline(
    'text-classification',
    model=export_dir,
    tokenizer=export_dir,
    device=device_index,
)

# Hand-written smoke-test messages.
test_samples = [
    'Apka bank account block ho gaya hai, abhi verify karein',
    'Meeting kal 10 bajay SID Labs mein hogi',
    'Click here to win free balance now',
    'Please visit https://uet.edu.pk for details',
]

# Classify one message at a time and print the first result for each.
for sample in test_samples:
    top_prediction = pipe(sample)[0]
    print(sample, ' -> ', top_prediction)

## ✅ Done — Model ready for Web App Deployment
Copy the exported `model/` folder (saved under `Urdu-PhishGuard-AI/model` on your Google Drive) into your Flask app's `./model` directory.