In [13]:
import pandas as pd
from difflib import SequenceMatcher
from sklearn.model_selection import train_test_split

# Load the raw training set into `df`, which every later cell builds on.
# If the file is missing, fail with an exception instead of calling exit():
# in a notebook, exit() asks the kernel/runtime to shut down, whereas an
# exception simply stops execution at this cell and shows the reason.
try:
    df = pd.read_csv("PS02_Training_set.csv")
except FileNotFoundError as err:
    raise FileNotFoundError(
        "Error: PS02_Training_set.csv not found. Please upload it to your Colab environment."
    ) from err





In [None]:
import pandas as pd
from difflib import SequenceMatcher
from sklearn.model_selection import train_test_split
from urllib.parse import urlparse
import re
from datetime import datetime, timedelta
import random

def mock_whois_lookup(domain):
    """Return a fake WHOIS record containing a randomly drawn domain age.

    Args:
        domain: Domain name as a string (matched case-insensitively).

    Returns:
        dict with a single key, ``"domain_age_days"``: an integer drawn with
        ``random.randint`` from 1-90 when the name contains 'airtel' or 'sbi',
        and from 500-2000 otherwise.

    Note:
        The draw uses the global ``random`` state, so results vary between
        calls unless the caller seeds it.
    """
    lowered = domain.lower()
    mentions_brand = any(brand in lowered for brand in ('airtel', 'sbi'))

    # Brand-bearing names are treated as recently registered, the rest as old.
    min_days, max_days = (1, 90) if mentions_brand else (500, 2000)

    now = datetime.now()
    registered_on = now - timedelta(days=random.randint(min_days, max_days))
    return {"domain_age_days": (now - registered_on).days}


def extract_all_features(row):
    """
    Build the positional feature vector for one dataset row.

    Reads ``row['Identified Phishing/Suspected Domain Name']`` and
    ``row['Corresponding CSE Domain Name']`` (both passed through ``str`` and
    lower-cased) and returns an unlabeled ``pd.Series`` of 15 values, in the
    same order as NEW_FEATURE_NAMES:

        similarity ratio, length difference, domain age (days), URL length,
        slashes, underscores, question marks, equal signs, special
        characters, path length, has-query flag, subdomain count,
        domain length, dots, hyphens.

    The URL is simulated as ``'https://' + <identified domain>``. The
    similarity ratio and length difference compare only the text before the
    first '.' of each domain. Domain age comes from ``mock_whois_lookup``.
    """
    suspect_host = str(row['Identified Phishing/Suspected Domain Name']).lower()
    brand_host = str(row['Corresponding CSE Domain Name']).lower()
    full_url = 'https://' + suspect_host

    # Compare only the leading label (text before the first dot) of each name.
    suspect_label = suspect_host.split('.', 1)[0]
    brand_label = brand_host.split('.', 1)[0]
    similarity = SequenceMatcher(None, suspect_label, brand_label).ratio()
    length_gap = abs(len(suspect_label) - len(brand_label))

    parts = urlparse(full_url)
    netloc = parts.netloc

    age_days = mock_whois_lookup(suspect_host)['domain_age_days']

    values = [
        similarity,
        length_gap,
        age_days,
        len(full_url),                                        # URL_Length
        full_url.count('/') - 2,                              # Num_Slashes (minus the two in 'https://')
        full_url.count('_'),                                  # Num_Underscores
        full_url.count('?'),                                  # Num_Question_Marks
        full_url.count('='),                                  # Num_Equal_Signs
        len(re.findall(r'[!@#$%^&*()|]', full_url)),          # Special_Chars_Count
        len(parts.path),                                      # Path_Length
        int(bool(parts.query)),                               # Has_Query
        len(netloc.split('.')) - 2,                           # Num_Subdomains
        len(netloc),                                          # Domain_Length
        netloc.count('.'),                                    # Num_Dots
        netloc.count('-'),                                    # Num_Hyphens
    ]
    return pd.Series(values)

In [None]:


# NOTE(review): this import rebinds the name `mock_whois_lookup`, replacing the
# function of the same name defined earlier in this notebook. From here on,
# extract_all_features / calculate_full_features call whatever
# `external_lookups` provides; that module is not shown here, so confirm it
# exists on the path and returns a dict with a 'domain_age_days' key.
from external_lookups import mock_whois_lookup
# 1. Simulate Full URL (required for new URL-based features)
# The training CSV only has bare domain names, so a URL is fabricated by
# prefixing 'https://'. The result therefore never has a path or query part
# unless the domain string itself contains one.
# NOTE: This is a hack for training data; your app uses the real URL.
df['Simulated_URL'] = 'https://' + df['Identified Phishing/Suspected Domain Name']

# 2. Calculate New Features for all rows
# Re-implementing the logic from feature_engineer.py

def calculate_full_features(row):
    """Compute a labeled feature Series for one row from its 'Simulated_URL'.

    Unlike extract_all_features, this variant does not lower-case its inputs,
    takes the domain from ``urlparse(row['Simulated_URL']).netloc``, and
    returns a Series whose index carries the feature names, with
    'Domain_Age_Days' as the LAST entry.

    NOTE(review): no visible cell calls this function; the ``df.apply`` calls
    in this notebook use extract_all_features instead.
    """
    url = row['Simulated_URL']
    brand_domain = str(row['Corresponding CSE Domain Name'])

    parts = urlparse(url)
    host = parts.netloc

    # Similarity is measured on the text before the first dot of each name.
    host_label = host.split('.')[0]
    brand_label = brand_domain.split('.')[0]

    features = {
        'Levenshtein_Ratio': SequenceMatcher(None, host_label, brand_label).ratio(),
        'Length_Difference': abs(len(host_label) - len(brand_label)),
        'URL_Length': len(url),
        'Num_Slashes': url.count('/') - 2,
        'Num_Underscores': url.count('_'),
        'Num_Question_Marks': url.count('?'),
        'Num_Equal_Signs': url.count('='),
        'Special_Chars_Count': len(re.findall(r'[!@#$%^&*()|]', url)),
        'Path_Length': len(parts.path),
        'Has_Query': int(bool(parts.query)),
        'Num_Subdomains': len(host.split('.')) - 2,
        'Domain_Length': len(host),
        'Num_Dots': host.count('.'),
        'Num_Hyphens': host.count('-'),
    }
    features['Domain_Age_Days'] = mock_whois_lookup(host)['domain_age_days']

    return pd.Series(list(features.values()), index=list(features.keys()))


# Column names for the values returned by extract_all_features, listed in the
# same positional order as that function's unlabeled Series (15 entries).
NEW_FEATURE_NAMES = [
    'Levenshtein_Ratio', 'Length_Difference', 'Domain_Age_Days',
    'URL_Length', 'Num_Slashes', 'Num_Underscores',
    'Num_Question_Marks', 'Num_Equal_Signs', 'Special_Chars_Count',
    'Path_Length', 'Has_Query', 'Num_Subdomains', 'Domain_Length',
    'Num_Dots', 'Num_Hyphens'
]

# The mocked WHOIS lookup defined in this notebook draws from the global
# `random` state, so seed it to make Domain_Age_Days (and the saved CSV)
# reproducible across runs.
import random  # repeated here so the cell does not rely on an earlier import
random.seed(42)

# Row-wise feature extraction; columns are matched to names by position.
df[NEW_FEATURE_NAMES] = df.apply(
    extract_all_features, axis=1
)

# Numeric target for the 2-class setup. Any class label not listed here maps
# to NaN.
label_mapping = {
    'Phishing': 2,
    'Suspected': 1
}
df['Label'] = df['Phishing/Suspected Domains (i.e. Class Label)'].map(label_mapping)

# Report the real feature count instead of a hard-coded (and stale) number.
print(f"--- Feature Engineering Complete with {len(NEW_FEATURE_NAMES)} Features ---")
print(f"Data shape after adding features: {df.shape}")
print(df[['Identified Phishing/Suspected Domain Name', 'Levenshtein_Ratio', 'Domain_Age_Days', 'URL_Length', 'Label']].head())

# Persist the engineered frame; later cells reload it from this file.
df.to_csv('processed_training_data.csv', index=False)

--- Feature Engineering Complete with 14 Features ---
Data shape after adding features: (1043, 24)
  Identified Phishing/Suspected Domain Name  Levenshtein_Ratio  \
0                       airtel-merchants.in           0.545455   
1                      airtelrecharge.co.in           0.600000   
2                         airtelmerchant.in           0.600000   
3       airtelinternetserviceprovider.co.in           0.342857   
4                           airtelpoint.top           0.705882   

   Domain_Age_Days  URL_Length  Label  
0              5.0        27.0      2  
1             14.0        28.0      2  
2             12.0        25.0      2  
3             13.0        43.0      2  
4             12.0        23.0      2  


In [16]:
df.head()

Unnamed: 0,S. No,Critical Sector Entity Name,Corresponding CSE Domain Name,Identified Phishing/Suspected Domain Name,Phishing/Suspected Domains (i.e. Class Label),Evidence file name,Source of detection,Simulated_URL,Levenshtein_Ratio,Length_Difference,...,Num_Question_Marks,Num_Equal_Signs,Special_Chars_Count,Path_Length,Has_Query,Num_Subdomains,Domain_Length,Num_Dots,Num_Hyphens,Label
0,1,Airtel,airtel.in,airtel-merchants.in,Phishing,airtel-merchants.in.pdf,,https://airtel-merchants.in,0.545455,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,19.0,1.0,1.0,2
1,2,Airtel,airtel.in,airtelrecharge.co.in,Phishing,airtelrecharge.co.in.pdf,,https://airtelrecharge.co.in,0.6,8.0,...,0.0,0.0,0.0,0.0,0.0,1.0,20.0,2.0,0.0,2
2,3,Airtel,airtel.in,airtelmerchant.in,Phishing,airtelmerchant.in.pdf,,https://airtelmerchant.in,0.6,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,17.0,1.0,0.0,2
3,4,Airtel,airtel.in,airtelinternetserviceprovider.co.in,Phishing,airtelinternetserviceprovider.co.in.pdf,,https://airtelinternetserviceprovider.co.in,0.342857,23.0,...,0.0,0.0,0.0,0.0,0.0,1.0,35.0,2.0,0.0,2
4,5,Airtel,airtel.in,airtelpoint.top,Phishing,airtelpoint.top.pdf,,https://airtelpoint.top,0.705882,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,15.0,1.0,0.0,2


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd


# Baseline 2-class model (Suspected=1 vs Phishing=2).
# Reuse the in-memory frame when this cell runs after feature engineering;
# otherwise fall back to the CSV written by that step.
if 'df' not in locals():
    df = pd.read_csv('processed_training_data.csv')


# Only five of the engineered numeric columns are used in this baseline.
numerical_features = ['Levenshtein_Ratio', 'Length_Difference', 'Domain_Length', 'Num_Dots', 'Num_Hyphens']
categorical_features = ['Critical Sector Entity Name']
target = 'Label'

X = df[numerical_features + categorical_features]
y = df[target]


# Numeric columns pass through unchanged; the entity name is one-hot encoded.
# handle_unknown='ignore' lets prediction proceed for entity names that were
# not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        # categorical branch
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='drop'
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100))
])

# NOTE: train_test_split is not imported in this cell; it comes from the
# imports of an earlier cell, so that cell must have been run first.
# 70/30 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


model.fit(X_train, y_train)


y_pred = model.predict(X_test)


print("\n--- Model Training Results ---")
print(f"Model Accuracy on Test Set: {accuracy_score(y_test, y_pred):.4f}")


# target_names are given in ascending label order (1, then 2), which is the
# order classification_report uses when `labels` is not passed.
print("\nClassification Report (Phishing=2, Suspected=1):")
print(classification_report(y_test, y_pred, target_names=['Suspected (1)', 'Phishing (2)']))


--- Model Training Results ---
Model Accuracy on Test Set: 0.9105

Classification Report (Phishing=2, Suspected=1):
               precision    recall  f1-score   support

Suspected (1)       0.92      0.94      0.93       208
 Phishing (2)       0.88      0.85      0.86       105

     accuracy                           0.91       313
    macro avg       0.90      0.89      0.90       313
 weighted avg       0.91      0.91      0.91       313



## 1. Data Augmentation: Integrating Legitimate Data (Label 0)

This code creates dummy 'Legitimate' entries for existing CSEs and merges them into the dataset.

In [None]:
import pandas as pd
from difflib import SequenceMatcher

# Number of synthetic 'Legitimate' rows created per (entity, CSE domain) pair.
LEGIT_COPIES_PER_CSE = 10

df = pd.read_csv('processed_training_data.csv')

# One row per distinct (entity name, official domain) pair.
cse_map = df[['Critical Sector Entity Name', 'Corresponding CSE Domain Name']].drop_duplicates()

# For each pair, emit rows whose "identified" domain IS the official domain.
# NOTE(review): the copies for a given CSE are identical apart from the
# randomly mocked Domain_Age_Days, so a random train/test split will place
# near-duplicates of the same row on both sides; treat the Legitimate-class
# scores downstream with that in mind.
legitimate_data = [
    {
        'Critical Sector Entity Name': entity_name,
        'Corresponding CSE Domain Name': cse_domain,
        'Identified Phishing/Suspected Domain Name': cse_domain,
        'Phishing/Suspected Domains (i.e. Class Label)': 'Legitimate'
    }
    for entity_name, cse_domain in cse_map.itertuples(index=False, name=None)
    for _ in range(LEGIT_COPIES_PER_CSE)
]

df_legit = pd.DataFrame(legitimate_data)

# Drop bookkeeping columns and the old 2-class label before merging;
# errors='ignore' tolerates any of them being absent from the CSV.
columns_to_drop = ['S. No', 'Evidence file name', 'Source of detection', 'Label']
df_combined = pd.concat([df.drop(columns=columns_to_drop, errors='ignore'), df_legit], ignore_index=True)

# The synthetic rows have no 'Simulated_URL'; fill it the same way the
# original rows were built so the column has no gaps for the new rows.
if 'Simulated_URL' in df_combined.columns:
    url_missing = df_combined['Simulated_URL'].isna()
    df_combined.loc[url_missing, 'Simulated_URL'] = (
        'https://' + df_combined.loc[url_missing, 'Identified Phishing/Suspected Domain Name']
    )

# Features are recomputed for ALL rows (existing and synthetic). The mocked
# WHOIS lookup defined in this notebook uses the global `random` state, so
# seed it to keep Domain_Age_Days and the saved CSV reproducible.
import random  # repeated here so the cell does not rely on an earlier import
random.seed(42)

df_combined[NEW_FEATURE_NAMES] = df_combined.apply(
    extract_all_features, axis=1
)

# 3-class target; any class label not listed here maps to NaN.
label_mapping_3class = {
    'Legitimate': 0,
    'Suspected': 1,
    'Phishing': 2
}
df_combined['Label'] = df_combined['Phishing/Suspected Domains (i.e. Class Label)'].map(label_mapping_3class)

print("--- Data Augmentation Complete (Synthetic Legitimate Data Added) ---")
print(f"New total rows: {df_combined.shape[0]}")
print("Label Counts after integration:")
print(df_combined['Label'].value_counts())

# Persist for the 3-class training step.
df_combined.to_csv('3_class_training_data.csv', index=False)

--- Data Augmentation Complete (Synthetic Legitimate Data Added) ---
New total rows: 1293
Label Counts after integration:
Label
1    692
2    351
0    250
Name: count, dtype: int64


## 2. Retrain the 3-Class Classifier

Now we retrain the model with the integrated Legitimate data (Label 0)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd


# Train the 3-class model (Legitimate=0, Suspected=1, Phishing=2) on the
# augmented data set. Note that this rebinds the global `df`.
df = pd.read_csv('3_class_training_data.csv')


# All engineered numeric columns (15 of them) plus the entity name.
numerical_features = [
    'Levenshtein_Ratio', 'Length_Difference', 'Domain_Age_Days',
    'URL_Length', 'Num_Slashes', 'Num_Underscores',
    'Num_Question_Marks', 'Num_Equal_Signs', 'Special_Chars_Count',
    'Path_Length', 'Has_Query', 'Num_Subdomains', 'Domain_Length',
    'Num_Dots', 'Num_Hyphens'
]
categorical_features = ['Critical Sector Entity Name']
target = 'Label'

X = df[numerical_features + categorical_features]
y = df[target]


# Numeric columns pass through unchanged; the entity name is one-hot encoded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='drop'
)
final_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100))
])


# NOTE: train_test_split is not imported in this cell; it comes from the
# imports of an earlier cell, so that cell must have been run first.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
final_model.fit(X_train, y_train)


y_pred = final_model.predict(X_test)

# Report the real number of numeric features rather than a hard-coded count
# (the previous message said 14 while 15 columns are listed above).
print(f"\n--- Final 3-Class Model Training Results ({len(numerical_features)} Features) ---")
print(f"Model Accuracy on Test Set: {accuracy_score(y_test, y_pred):.4f}")

# target_names are listed in ascending label order (0, 1, 2).
print("\nClassification Report (Legitimate=0, Suspected=1, Phishing=2):")
print(classification_report(y_test, y_pred, target_names=['Legitimate (0)', 'Suspected (1)', 'Phishing (2)']))



--- Final 3-Class Model Training Results (14 Features) ---
Model Accuracy on Test Set: 0.9124

Classification Report (Legitimate=0, Suspected=1, Phishing=2):
                precision    recall  f1-score   support

Legitimate (0)       0.91      1.00      0.96        75
 Suspected (1)       0.91      0.94      0.92       208
  Phishing (2)       0.91      0.80      0.85       105

      accuracy                           0.91       388
     macro avg       0.91      0.91      0.91       388
  weighted avg       0.91      0.91      0.91       388



## 3. Suspected Monitoring Engine (Conceptual Code)

This code block sets up the structure for continuous monitoring and re-classification of domains initially flagged as Suspected (1).

In [None]:
import time
import random



class SuspectedDomainMonitor:
    """Keeps a queue of 'Suspected' domains and re-checks them each cycle.

    A domain leaves the queue either when a check flags it (an alert is
    returned) or when it has been queued for at least ``duration`` days,
    measured with the wall clock.
    """

    def __init__(self, monitor_duration_days=90):
        # domain -> {'cse': ..., 'start_time': epoch seconds, 'status': ...}
        self.monitoring_queue = {}
        # Length of the monitoring window, in days.
        self.duration = monitor_duration_days

    def add_to_queue(self, domain, cse_domain):
        """Start monitoring ``domain``; a domain already queued is left untouched."""
        if domain in self.monitoring_queue:
            return

        entry = {
            'cse': cse_domain,
            'start_time': time.time(),
            'status': 'Monitoring'
        }
        self.monitoring_queue[domain] = entry
        print(f"-> Added {domain} to monitoring queue for {self.duration} days.")

    def check_domain_content(self, domain, cse_domain):
        """
        SIMULATION of a content check; both arguments are currently unused.

        A real implementation would resolve DNS and fetch the page, compare
        it visually/textually with the CSE site, and scan for malicious
        files. Here a 5% random draw decides the outcome.

        Returns:
            (True, 0.95) when the draw flags the domain, else (False, 0.10).
        """
        flagged = random.random() < 0.05
        return (True, 0.95) if flagged else (False, 0.10)

    def run_monitoring_cycle(self):
        """Check every queued domain once and return the list of alert dicts.

        Expired and flagged domains are removed from the queue after the
        whole queue has been scanned.
        """
        print("\n--- Running Daily Monitoring Cycle ---")

        seconds_per_day = 60 * 60 * 24
        alerts = []
        finished = []

        # Iterate over a snapshot so the queue can be edited afterwards.
        for name, entry in list(self.monitoring_queue.items()):
            days_elapsed = (time.time() - entry['start_time']) / seconds_per_day
            expired = days_elapsed >= self.duration

            if not expired:
                flagged, score = self.check_domain_content(name, entry['cse'])
                if not flagged:
                    continue
                alerts.append({
                    'domain': name,
                    'cse': entry['cse'],
                    'new_label': 2,
                    'reason': f"Lookalike content detected (Similarity: {score:.2f})",
                    'day_of_monitoring': int(days_elapsed) + 1
                })

            # Reached for both expired and flagged domains.
            finished.append(name)

        for name in finished:
            self.monitoring_queue.pop(name, None)

        return alerts


# Demo run of the monitor: a 3-day window and three simulated "days".
monitor = SuspectedDomainMonitor(monitor_duration_days=3)

# Seed the queue with the first five rows labeled Suspected (1) in `df`.
suspected_domains = df[df['Label'] == 1].head(5).to_dict('records')

for item in suspected_domains:
    monitor.add_to_queue(item['Identified Phishing/Suspected Domain Name'], item['Corresponding CSE Domain Name'])

# Each loop iteration stands in for one day. The monitor measures elapsed
# time with the real clock and this loop only sleeps 0.1 s per iteration, so
# every alert reports "Day 1" and no domain expires during the demo.
for day in range(1, 4):
    print(f"\n--- SIMULATION DAY {day} ---")
    alerts = monitor.run_monitoring_cycle()

    if alerts:
        print(f"!!! URGENT ALERTS ({len(alerts)} Detected) !!!")
        for alert in alerts:
            # \U0001F6A8 is the police-car-light emoji; written as an escape
            # so the source cannot be corrupted by a wrong file encoding
            # (the literal had degraded into mojibake).
            print(f"  \U0001F6A8 RE-CLASSIFIED: {alert['domain']} (Target: {alert['cse']})")
            print(f"  -> Reason: {alert['reason']} on Day {alert['day_of_monitoring']}")
            print("  -> Action: Trigger Takedown/Reporting (Label 2)")

    # Short pause between simulated days.
    time.sleep(0.1)

print(f"\nMonitoring Queue remaining: {len(monitor.monitoring_queue)} domains.")

-> Added airtel365.com to monitoring queue for 3 days.
-> Added airtel-merchants.site to monitoring queue for 3 days.
-> Added airtelmerchnat.in to monitoring queue for 3 days.
-> Added airtel.club to monitoring queue for 3 days.
-> Added airtel121co.in to monitoring queue for 3 days.

--- SIMULATION DAY 1 ---

--- Running Daily Monitoring Cycle ---
!!! URGENT ALERTS (1 Detected) !!!
  ðŸš¨ RE-CLASSIFIED: airtelmerchnat.in (Target: airtel.in)
  -> Reason: Lookalike content detected (Similarity: 0.95) on Day 1
  -> Action: Trigger Takedown/Reporting (Label 2)

--- SIMULATION DAY 2 ---

--- Running Daily Monitoring Cycle ---
!!! URGENT ALERTS (2 Detected) !!!
  ðŸš¨ RE-CLASSIFIED: airtel.club (Target: airtel.in)
  -> Reason: Lookalike content detected (Similarity: 0.95) on Day 1
  -> Action: Trigger Takedown/Reporting (Label 2)
  ðŸš¨ RE-CLASSIFIED: airtel121co.in (Target: airtel.in)
  -> Reason: Lookalike content detected (Similarity: 0.95) on Day 1
  -> Action: Trigger Takedown/Reporti