In [None]:
import pandas as pd
from difflib import SequenceMatcher
from sklearn.model_selection import train_test_split

# Load the raw training set into `df`.
# A missing file is re-raised with upload guidance instead of calling exit():
# in a notebook, exit() does not cleanly stop "Run All", and later cells would
# then fail on an undefined `df` rather than on the real cause.
try:
    df = pd.read_csv("PS02_Training_set.csv")
except FileNotFoundError as err:
    raise FileNotFoundError(
        "PS02_Training_set.csv not found. Please upload it to your Colab environment."
    ) from err





In [2]:
# Similarity features between a flagged domain and the genuine CSE domain
def calculate_typo_features(row):
    """Compute ``(ratio, len_diff)`` for a single record.

    ``row`` must support item access for the keys
    'Identified Phishing/Suspected Domain Name' and
    'Corresponding CSE Domain Name'. Each value is converted with ``str``,
    lower-cased, and truncated at its first dot, so only the leading label
    (e.g. ``airtel`` from ``airtel.in``) takes part in the comparison.

    Returns:
        tuple: ``ratio`` is ``difflib.SequenceMatcher(...).ratio()`` of the two
        leading labels (higher means more similar; note this is the
        SequenceMatcher measure, not a true Levenshtein ratio), and
        ``len_diff`` is the absolute difference of their lengths.
    """
    # Identified domain first, genuine CSE domain second: the argument order
    # passed to SequenceMatcher is kept the same on purpose.
    suspect_label, genuine_label = (
        str(row[column]).lower().split('.')[0]
        for column in (
            'Identified Phishing/Suspected Domain Name',
            'Corresponding CSE Domain Name',
        )
    )

    similarity = SequenceMatcher(None, suspect_label, genuine_label).ratio()
    length_gap = abs(len(suspect_label) - len(genuine_label))
    return similarity, length_gap

In [3]:
# 1. Typo-squatting features: one (ratio, length gap) pair per row
typo_features = df.apply(calculate_typo_features, axis=1, result_type='expand')
df[['Levenshtein_Ratio', 'Length_Difference']] = typo_features

# 2. Lexical features of the flagged domain string itself
suspect_domains = df['Identified Phishing/Suspected Domain Name']
df['Domain_Length'] = suspect_domains.map(len)
df['Num_Dots'] = suspect_domains.map(lambda name: name.count('.'))
df['Num_Hyphens'] = suspect_domains.map(lambda name: name.count('-'))

# 3. Numeric target: Phishing (2) ranks above Suspected (1).
# Any class label missing from the mapping becomes NaN in 'Label'.
label_mapping = {'Phishing': 2, 'Suspected': 1}
df['Label'] = df['Phishing/Suspected Domains (i.e. Class Label)'].map(label_mapping)

# Quick summary of the result
preview_columns = ['Identified Phishing/Suspected Domain Name', 'Levenshtein_Ratio', 'Domain_Length', 'Label']
print("--- Feature Engineering Complete ---")
print(f"Data shape after adding features: {df.shape}")
print(df[preview_columns].head())

# Persist the enriched frame so later cells can reload it from disk
df.to_csv('processed_training_data.csv', index=False)

--- Feature Engineering Complete ---
Data shape after adding features: (1043, 13)
  Identified Phishing/Suspected Domain Name  Levenshtein_Ratio  Domain_Length  \
0                       airtel-merchants.in           0.545455             19   
1                      airtelrecharge.co.in           0.600000             20   
2                         airtelmerchant.in           0.600000             17   
3       airtelinternetserviceprovider.co.in           0.342857             35   
4                           airtelpoint.top           0.705882             15   

   Label  
0      2  
1      2  
2      2  
3      2  
4      2  


In [4]:
# Preview the first rows of `df`, including the engineered feature and label columns
df.head()

Unnamed: 0,S. No,Critical Sector Entity Name,Corresponding CSE Domain Name,Identified Phishing/Suspected Domain Name,Phishing/Suspected Domains (i.e. Class Label),Evidence file name,Source of detection,Levenshtein_Ratio,Length_Difference,Domain_Length,Num_Dots,Num_Hyphens,Label
0,1,Airtel,airtel.in,airtel-merchants.in,Phishing,airtel-merchants.in.pdf,,0.545455,10.0,19,1,1,2
1,2,Airtel,airtel.in,airtelrecharge.co.in,Phishing,airtelrecharge.co.in.pdf,,0.6,8.0,20,2,0,2
2,3,Airtel,airtel.in,airtelmerchant.in,Phishing,airtelmerchant.in.pdf,,0.6,8.0,17,1,0,2
3,4,Airtel,airtel.in,airtelinternetserviceprovider.co.in,Phishing,airtelinternetserviceprovider.co.in.pdf,,0.342857,23.0,35,2,0,2
4,5,Airtel,airtel.in,airtelpoint.top,Phishing,airtelpoint.top.pdf,,0.705882,5.0,15,1,0,2


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

# ----------------------------------------------------------------------
# B. Model Training
# ----------------------------------------------------------------------

# Fall back to the saved CSV when `df` is not already in the namespace
# (e.g. this cell is run without the feature-engineering cell).
if 'df' not in locals():
    df = pd.read_csv('processed_training_data.csv')


# Column roles
target = 'Label'
categorical_features = ['Critical Sector Entity Name']
numerical_features = ['Levenshtein_Ratio', 'Length_Difference', 'Domain_Length', 'Num_Dots', 'Num_Hyphens']

y = df[target]
X = df[[*numerical_features, *categorical_features]]

# Numeric columns are passed through unchanged; the CSE name is one-hot
# encoded, with categories unseen during fit ignored at predict time.
keep_numeric = ('num', 'passthrough', numerical_features)
encode_cse = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(
    transformers=[keep_numeric, encode_cse],
    remainder='drop',
)

# Preprocessing followed by a Random Forest, bundled as one estimator
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# 70% train / 30% test, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Fit on the training part, predict the held-out part
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report held-out performance
print("\n--- Model Training Results ---")
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy on Test Set: {test_accuracy:.4f}")

# Phishing (2) is the primary threat, so high precision here is key.
class_names = ['Suspected (1)', 'Phishing (2)']
print("\nClassification Report (Phishing=2, Suspected=1):")
print(classification_report(y_test, y_pred, target_names=class_names))


--- Model Training Results ---
Model Accuracy on Test Set: 0.8978

Classification Report (Phishing=2, Suspected=1):
               precision    recall  f1-score   support

Suspected (1)       0.92      0.93      0.92       208
 Phishing (2)       0.86      0.83      0.84       105

     accuracy                           0.90       313
    macro avg       0.89      0.88      0.88       313
 weighted avg       0.90      0.90      0.90       313



## 1. Data Augmentation: Integrating Legitimate Data (Label 0)

This code creates dummy 'Legitimate' entries for existing CSEs and merges them into the dataset.

In [6]:
import pandas as pd
from difflib import SequenceMatcher

# Reload the processed 2-class data, which already carries the engineered feature columns
df = pd.read_csv('processed_training_data.csv')

# --- SYNTHESIZE LEGITIMATE DATA ---

# 1. One row per distinct (CSE name, genuine domain) pair
cse_map = df[['Critical Sector Entity Name', 'Corresponding CSE Domain Name']].drop_duplicates()

# 2. Ten identical 'Legitimate' records per pair, with the genuine CSE domain
#    standing in as the "identified" domain.
# NOTE(review): these are exact duplicates, so copies of the same row can end up
# on both sides of a later train/test split; scores for this class may be
# optimistic. Worth confirming before relying on them.
legitimate_data = [
    {
        'Critical Sector Entity Name': entity_name,
        'Corresponding CSE Domain Name': genuine_domain,
        'Identified Phishing/Suspected Domain Name': genuine_domain,
        'Phishing/Suspected Domains (i.e. Class Label)': 'Legitimate',
    }
    for entity_name, genuine_domain in zip(
        cse_map['Critical Sector Entity Name'],
        cse_map['Corresponding CSE Domain Name'],
    )
    for _ in range(10)
]

df_legit = pd.DataFrame(legitimate_data)

# 3. Stack the synthetic rows under the original ones, after dropping the
#    bookkeeping columns and the old 2-class 'Label'
original_rows = df.drop(columns=['S. No', 'Evidence file name', 'Source of detection', 'Label'])
df_combined = pd.concat([original_rows, df_legit], ignore_index=True)

# 4. Re-calculate Features for ALL rows (original + synthetic)
def calculate_typo_features_full(row):
    """Return ``(ratio, len_diff)`` comparing a row's two domain names.

    Reads 'Identified Phishing/Suspected Domain Name' and
    'Corresponding CSE Domain Name' from ``row``, stringifies and lower-cases
    both, and keeps only the text before the first dot. ``ratio`` is the
    ``difflib.SequenceMatcher`` similarity of those two labels and
    ``len_diff`` the absolute difference of their lengths.

    NOTE(review): same logic as ``calculate_typo_features`` defined earlier in
    this notebook; the two could be consolidated.
    """
    labels = []
    for column in ('Identified Phishing/Suspected Domain Name', 'Corresponding CSE Domain Name'):
        # Leading label only: everything from the first dot onwards is discarded
        labels.append(str(row[column]).lower().split('.')[0])
    identified_label, cse_label = labels

    similarity = SequenceMatcher(None, identified_label, cse_label).ratio()
    return similarity, abs(len(identified_label) - len(cse_label))

# Similarity features, recomputed for every row (overwrites the values loaded from the CSV)
typo_features_combined = df_combined.apply(
    calculate_typo_features_full, axis=1, result_type='expand'
)
df_combined[['Levenshtein_Ratio', 'Length_Difference']] = typo_features_combined

# Lexical features of the identified domain string
identified_domains = df_combined['Identified Phishing/Suspected Domain Name']
df_combined['Domain_Length'] = identified_domains.map(len)
df_combined['Num_Dots'] = identified_domains.map(lambda name: name.count('.'))
df_combined['Num_Hyphens'] = identified_domains.map(lambda name: name.count('-'))

# 5. Final 3-Class Label Mapping
label_mapping_3class = {'Legitimate': 0, 'Suspected': 1, 'Phishing': 2}
class_labels = df_combined['Phishing/Suspected Domains (i.e. Class Label)']
df_combined['Label'] = class_labels.map(label_mapping_3class)

print("--- Data Augmentation Complete (Synthetic Legitimate Data Added) ---")
print(f"New total rows: {len(df_combined)}")
print("Label Counts after integration:")
print(df_combined['Label'].value_counts())

# Write the 3-class data to disk for the retraining cell
df_combined.to_csv('3_class_training_data.csv', index=False)

--- Data Augmentation Complete (Synthetic Legitimate Data Added) ---
New total rows: 1293
Label Counts after integration:
Label
1    692
2    351
0    250
Name: count, dtype: int64


## 2. Retrain the 3-Class Classifier

Now we retrain the model on the dataset that includes the synthetic Legitimate rows (Label 0).

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

# Replace `df` with the 3-class dataset written by the augmentation cell
df = pd.read_csv('3_class_training_data.csv')

# Column roles (same features as the 2-class model)
target = 'Label'
categorical_features = ['Critical Sector Entity Name']
numerical_features = ['Levenshtein_Ratio', 'Length_Difference', 'Domain_Length', 'Num_Dots', 'Num_Hyphens']

y = df[target]
X = df[[*numerical_features, *categorical_features]]

# Numeric columns pass through; the CSE name is one-hot encoded
keep_numeric = ('num', 'passthrough', numerical_features)
encode_cse = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(
    transformers=[keep_numeric, encode_cse],
    remainder='drop',
)
final_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Stratified 70/30 split, then fit on the training part
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
final_model.fit(X_train, y_train)

# Predict the held-out part and report
y_pred = final_model.predict(X_test)

print("\n--- Final 3-Class Model Training Results ---")
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy on Test Set: {test_accuracy:.4f}")

# The classification report is now the crucial check for FALSE POSITIVES (Precision on Label 0)
class_names = ['Legitimate (0)', 'Suspected (1)', 'Phishing (2)']
print("\nClassification Report (Legitimate=0, Suspected=1, Phishing=2):")
print(classification_report(y_test, y_pred, target_names=class_names))


--- Final 3-Class Model Training Results ---
Model Accuracy on Test Set: 0.8969

Classification Report (Legitimate=0, Suspected=1, Phishing=2):
                precision    recall  f1-score   support

Legitimate (0)       0.90      0.96      0.93        75
 Suspected (1)       0.90      0.91      0.91       208
  Phishing (2)       0.88      0.82      0.85       105

      accuracy                           0.90       388
     macro avg       0.89      0.90      0.90       388
  weighted avg       0.90      0.90      0.90       388



## 3. Suspected Monitoring Engine (Conceptual Code)

This code block sets up the structure for continuous monitoring and re-classification of domains initially flagged as Suspected (1).

In [8]:
import time
import random

# NOTE: For a real project, this would involve Web Scraping libraries (like BeautifulSoup/Selenium)
# and image hashing libraries (like imagehash)

class SuspectedDomainMonitor:
    """In-memory queue of 'Suspected' domains that are periodically re-checked.

    Each queued domain is re-checked by ``run_monitoring_cycle`` until either
    the (simulated) content check flags it, in which case an alert with
    ``new_label`` 2 is produced, or its monitoring window of
    ``monitor_duration_days`` has elapsed. In both cases it leaves the queue.

    The content check is a simulation driven by a random draw; no network
    access takes place.
    """

    # Seconds in one day, used to convert elapsed seconds to days.
    SECONDS_PER_DAY = 60 * 60 * 24
    # Simulated chance per check that a monitored domain turns malicious.
    ACTIVATION_PROBABILITY = 0.05
    # Similarity scores reported by the simulated check.
    MALICIOUS_SCORE = 0.95
    BENIGN_SCORE = 0.10

    def __init__(self, monitor_duration_days=90, rng=None, clock=None):
        """Create an empty monitor.

        Args:
            monitor_duration_days: Length of the monitoring window in days.
            rng: Optional object with a ``random()`` method returning a float
                in [0, 1). Defaults to the global ``random`` module. Pass
                ``random.Random(seed)`` for reproducible simulations.
            clock: Optional zero-argument callable returning the current time
                in seconds. Defaults to ``time.time``. Pass a fake clock to
                simulate the passage of days.
        """
        # Maps domain -> {'cse', 'start_time', 'status'}
        self.monitoring_queue = {}
        self.duration = monitor_duration_days
        self._rng = rng
        self._clock = clock

    def _now(self):
        """Current time in seconds from the injected clock, else ``time.time()``."""
        return time.time() if self._clock is None else self._clock()

    def _draw(self):
        """One random float from the injected rng, else ``random.random()``."""
        return random.random() if self._rng is None else self._rng.random()

    def add_to_queue(self, domain, cse_domain):
        """Adds a domain classified as 'Suspected' to the monitoring queue.

        A domain that is already queued is left untouched, so its original
        start time and CSE are kept.
        """
        if domain in self.monitoring_queue:
            return

        self.monitoring_queue[domain] = {
            'cse': cse_domain,
            'start_time': self._now(),
            'status': 'Monitoring'
        }
        print(f"-> Added {domain} to monitoring queue for {self.duration} days.")

    def check_domain_content(self, domain, cse_domain):
        """
        SIMULATION: In the real world, this function would:
        1. Resolve the DNS and fetch the webpage content/screenshot.
        2. Calculate visual/textual similarity (e.g., SSIM, text hashing).
        3. Check for malicious files/binaries.

        Here ``domain`` and ``cse_domain`` are not inspected. A single random
        draw decides the outcome.

        Returns:
            tuple: ``(True, 0.95)`` with probability 0.05 (content resembling
            the CSE was "found"), otherwise ``(False, 0.10)`` (still parked or
            hosting non-CSE content).
        """
        if self._draw() < self.ACTIVATION_PROBABILITY:
            return True, self.MALICIOUS_SCORE
        return False, self.BENIGN_SCORE

    def run_monitoring_cycle(self):
        """Iterates through the queue, performs checks, and re-classifies.

        Domains whose monitoring window has elapsed are dropped without a
        check. Domains flagged by the check produce an alert and are dropped.
        All others stay queued.

        Returns:
            list[dict]: One alert per re-classified domain with the keys
            'domain', 'cse', 'new_label', 'reason' and 'day_of_monitoring'.
        """
        print("\n--- Running Daily Monitoring Cycle ---")

        reclassified_alerts = []
        domains_to_remove = []

        # Iterate over a snapshot so the queue itself is only changed afterwards
        for domain, data in list(self.monitoring_queue.items()):

            # Days elapsed since the domain was queued
            time_elapsed = (self._now() - data['start_time']) / self.SECONDS_PER_DAY
            if time_elapsed >= self.duration:
                domains_to_remove.append(domain)
                continue

            is_malicious_now, score = self.check_domain_content(domain, data['cse'])
            if not is_malicious_now:
                continue

            # Re-classify as Phishing and stop monitoring this domain
            reclassified_alerts.append({
                'domain': domain,
                'cse': data['cse'],
                'new_label': 2, # Phishing
                'reason': f"Lookalike content detected (Similarity: {score:.2f})",
                # 1-based day count: a hit within the first 24 h is day 1
                'day_of_monitoring': int(time_elapsed) + 1
            })
            domains_to_remove.append(domain)

        for domain in domains_to_remove:
            self.monitoring_queue.pop(domain, None)

        return reclassified_alerts

# --- DEMONSTRATION ---
monitor = SuspectedDomainMonitor(monitor_duration_days=3) # Short duration for demo

# First five rows labelled Suspected (1) in the current `df`
suspected_domains = df[df['Label'] == 1].head(5).to_dict('records')

# Add them to the queue
for item in suspected_domains:
    monitor.add_to_queue(item['Identified Phishing/Suspected Domain Name'], item['Corresponding CSE Domain Name'])

# Run three monitoring cycles back to back.
# The monitor measures real elapsed time, and only ~0.1 s passes between
# cycles, so every alert reports "Day 1" regardless of the simulated day.
for day in range(1, 4):
    print(f"\n--- SIMULATION DAY {day} ---")
    alerts = monitor.run_monitoring_cycle()

    if alerts:
        print(f"!!! URGENT ALERTS ({len(alerts)} Detected) !!!")
        for alert in alerts:
            # The emoji was previously stored as mojibake (UTF-8 bytes decoded as cp1252)
            print(f"  🚨 RE-CLASSIFIED: {alert['domain']} (Target: {alert['cse']})")
            print(f"  -> Reason: {alert['reason']} on Day {alert['day_of_monitoring']}")
            print("  -> Action: Trigger Takedown/Reporting (Label 2)")

    # Pause for simulation purposes
    time.sleep(0.1)

print(f"\nMonitoring Queue remaining: {len(monitor.monitoring_queue)} domains.")

-> Added airtel365.com to monitoring queue for 3 days.
-> Added airtel-merchants.site to monitoring queue for 3 days.
-> Added airtelmerchnat.in to monitoring queue for 3 days.
-> Added airtel.club to monitoring queue for 3 days.
-> Added airtel121co.in to monitoring queue for 3 days.

--- SIMULATION DAY 1 ---

--- Running Daily Monitoring Cycle ---

--- SIMULATION DAY 2 ---

--- Running Daily Monitoring Cycle ---

--- SIMULATION DAY 3 ---

--- Running Daily Monitoring Cycle ---
!!! URGENT ALERTS (1 Detected) !!!
  🚨 RE-CLASSIFIED: airtel-merchants.site (Target: airtel.in)
  -> Reason: Lookalike content detected (Similarity: 0.95) on Day 1
  -> Action: Trigger Takedown/Reporting (Label 2)

Monitoring Queue remaining: 4 domains.
