In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
data = pd.read_csv("/content/Phishing_Legitimate_full.csv")

In [None]:
data.head()

# EDA

In [None]:
data.shape

In [None]:
data.columns

In [None]:
data.dtypes

In [None]:
data.isna().sum()

In [None]:
data.describe().T

In [None]:
data.CLASS_LABEL.value_counts()

In [None]:
data['CLASS_LABEL'].value_counts().plot(kind='bar')
plt.title('Class Label vs Count')
plt.xlabel('Class Label')
plt.ylabel('Count')

In [None]:
data.rename(columns={'CLASS_LABEL':'Label'},inplace=True)

In [None]:
data['Label'].value_counts().plot(kind='pie')

In [None]:
def discrete_univariate_analysis(data, feature):
    """Draw a count plot of one column with a percentage label on every bar.

    Parameters
    ----------
    data : DataFrame holding the column to plot.
    feature : name of the column in ``data``.

    Bars are ordered by category value, descending. Each label is the bar
    height as a percentage of the number of rows in the column.
    """
    column = data[feature]
    n_rows = len(column)
    n_levels = column.nunique()

    # Figure width grows with the number of distinct values.
    plt.figure(figsize=(n_levels + 1, 5))
    plt.xticks(rotation=90, fontsize=15)

    descending_levels = column.value_counts().index.sort_values(ascending=False)
    ax = sns.countplot(data=data, x=feature, palette='flare', order=descending_levels)

    for bar in ax.patches:
        height = bar.get_height()
        centre_x = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            "{:.1f}%".format(100 * height / n_rows),
            (centre_x, height),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),  # lift the label 5 points above the bar top
            textcoords="offset points",
        )
    plt.show()

In [None]:
discrete_univariate_analysis(data, 'Label')

In [None]:
data.corr()

In [None]:
def heatmap(data, start_index, end_index):
    """Plot an annotated correlation heatmap for a positional slice of columns.

    Columns from ``start_index`` (inclusive) to ``end_index`` (exclusive) are
    selected by position; their pairwise correlation matrix is drawn on a new
    10x8 figure with each cell annotated to two decimals.
    """
    column_slice = pd.DataFrame(data.iloc[:, start_index:end_index])
    correlations = column_slice.corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(correlations, annot=True, fmt='.2f')

In [None]:
heatmap(data,0,10)

In [None]:
heatmap(data,10,20)

In [None]:
heatmap(data,20,30)

In [None]:
heatmap(data,30,40)

In [None]:
heatmap(data,40,50)

In [None]:
x=data.drop(columns=['id','Label'],axis=1)
y=data['Label']

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

In [None]:
x_train
x_train.columns

In [None]:
x_test

In [None]:
y_train

In [None]:
y_test

We will now use the mutual information classifier (`mutual_info_classif`) to measure both linear and non-linear dependence between each feature and "Label".

In [None]:
from sklearn.feature_selection import mutual_info_classif

In [None]:
x=data.drop(['id','Label'],axis=1)

In [None]:
y=data['Label']

In [None]:
# Boolean mask (one entry per column of x) marking integer-typed columns as discrete.
# is_integer_dtype is used instead of `dtype == int` because the latter compares
# against NumPy's platform-dependent default integer and can miss int64 columns.
discrete_features = x.dtypes.apply(pd.api.types.is_integer_dtype).astype(bool)

In [None]:
# Mutual information between every feature and the label, highest first.
# random_state is fixed so the ranking (and every "top N features" selection
# derived from it) is the same on each run of the notebook.
mi_scores = mutual_info_classif(x, y, discrete_features=discrete_features, random_state=42)
mi_scores = pd.Series(mi_scores, name='MI Scores', index=x.columns)
mi_scores = mi_scores.sort_values(ascending=False)
mi_scores

In [None]:
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of ``scores`` on the current figure.

    ``scores`` is sorted ascending before plotting and its index values are
    used as the y-axis tick labels.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mi Scores")

In [None]:
plt.figure(dpi=100, figsize=(12,12))
plot_mi_scores(mi_scores)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier as Rfc
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
def train_logistic(data, top_n, random_state=None):
    """Fit logistic regression on the ``top_n`` highest-MI features and score it.

    Parameters
    ----------
    data : DataFrame containing the feature columns and a ``'Label'`` column.
    top_n : number of features to keep, taken from the notebook-level
        ``mi_scores`` series in descending score order.
    random_state : optional seed forwarded to ``train_test_split`` so the
        80/20 split can be reproduced. ``None`` (default) keeps the original
        unseeded behaviour.

    Returns
    -------
    tuple ``(precision, recall, f1, accuracy)`` measured on the 20% test split.

    Note: ``mi_scores`` is computed in this notebook on the full dataset, so
    the feature ranking has already seen the rows that end up in the test split.
    """
    top_n_features = mi_scores.sort_values(ascending=False).head(top_n).index.tolist()
    x = data[top_n_features]
    y = data['Label']

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, shuffle=True, random_state=random_state
    )

    LR = LogisticRegression(max_iter=10000)
    LR.fit(x_train, y_train)

    y_pred = LR.predict(x_test)

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    return precision, recall, f1, accuracy

In [None]:
# Sweep the number of selected features from 15 to 50 and collect one
# [n_features, precision, recall, f1, accuracy] row per logistic-regression run.
arr = []
for i in range(15, 51):
    precision, recall, f1, accuracy = train_logistic(data, i)
    row = [i, precision, recall, f1, accuracy]
    print("performance for logistic Model with Top {} features is precision :{}, recall :{}, f1 score :{}, accuracy :{}".format(*row))
    arr.append(row)

In [None]:
df = pd.DataFrame(arr,columns=['num_of_features','precision','recall','f1_score','accuracy'])
df

In [None]:
sns.lineplot(x='num_of_features', y='precision', data=df, label='Precision Score')
sns.lineplot(x='num_of_features', y='recall', data=df, label='Recall Score')
sns.lineplot(x='num_of_features', y='f1_score', data=df, label='F1 Score')
sns.lineplot(x='num_of_features', y='accuracy', data=df, label='Acc Score')

# Random Forest Classification

In [None]:
def train_rfc(data, top_n, random_state=None):
    """Fit a random forest on the ``top_n`` highest-MI features and score it.

    Parameters
    ----------
    data : DataFrame containing the feature columns and a ``'Label'`` column.
    top_n : number of features to keep, taken from the notebook-level
        ``mi_scores`` series in descending score order.
    random_state : optional seed forwarded to both ``train_test_split`` and
        the forest so a run can be reproduced. ``None`` (default) leaves both
        unseeded.

    Returns
    -------
    tuple ``(precision, recall, f1, accuracy)`` measured on the 20% test split.
    """
    top_n_features = mi_scores.sort_values(ascending=False).head(top_n).index.tolist()
    # These must be the lowercase ASCII names passed to train_test_split below;
    # otherwise the split silently falls back to the notebook-level x / y and
    # top_n has no effect.
    x = data[top_n_features]
    y = data['Label']

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, shuffle=True, random_state=random_state
    )

    rfc = Rfc(
        n_estimators=500,
        max_depth=32,
        max_features=1.0,
        random_state=random_state,
    )

    rfc.fit(x_train, y_train)

    y_pred = rfc.predict(x_test)

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    return precision, recall, f1, accuracy

In [None]:
# Sweep the number of selected features from 15 to 50 and collect one
# [n_features, precision, recall, f1, accuracy] row per random-forest run.
arr = []
for i in range(15, 51):
    precision, recall, f1, accuracy = train_rfc(data, i)
    # The message names the model actually being evaluated in this loop.
    print("performance for Random Forest Model with Top {} features is precision :{}, recall :{}, f1 score :{}, accuracy :{}".format(i, precision, recall, f1, accuracy))
    arr.append([i, precision, recall, f1, accuracy])

In [None]:
df = pd.DataFrame(arr,columns=['num_of_features','precision','recall','f1_score','accuracy'])
df

In [None]:
sns.lineplot(x='num_of_features', y='precision', data=df, label='Precision Score')
sns.lineplot(x='num_of_features', y='recall', data=df, label='Recall Score')
sns.lineplot(x='num_of_features', y='f1_score', data=df, label='F1 Score')
sns.lineplot(x='num_of_features', y='accuracy', data=df, label='Acc Score')

In [None]:
# Final random forest trained on the 27 highest-MI features.
# The x_train / x_test / y_train / y_test produced here are reused by the
# KNN, Naive Bayes and model-selection cells further down.
top_n_features = mi_scores.sort_values(ascending=False).head(27).index.tolist()
x = data[top_n_features]
y = data['Label']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True)

rfc = Rfc(
    n_estimators=500,
    max_depth=32,
    max_features=1.0,
)

rfc.fit(x_train, y_train)

y_pred = rfc.predict(x_test)

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# The message names the model actually trained in this cell.
print("performance for Random Forest Model with Top {} features is precision :{}, recall :{}, f1 score :{}, accuracy :{}".format(27, precision, recall, f1, accuracy))

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
k_near = KNeighborsClassifier(n_neighbors=1)

In [None]:
k_near.fit(x_train,y_train)

Y_pre_test = k_near.predict(x_test)
Y_pre_train = k_near.predict(x_train)

In [None]:
# Accuracy as a percentage with two decimals ('%.2f'), the same format the
# Naive Bayes report uses. accuracy_score takes (y_true, y_pred).
train_accuracy_KNN = accuracy_score(y_train, Y_pre_train) * 100
print('Accuracy for train dataset for K-neariest :%.2f ' % train_accuracy_KNN, '%')

test_accuracy_KNN = accuracy_score(y_test, Y_pre_test) * 100
print('Accuracy for test dataset for K-neariest :%.2f ' % test_accuracy_KNN, '%')

In [None]:
print(classification_report(y_test, Y_pre_test))

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [None]:
gnb.fit(x_train,y_train)

Y_pre_test = gnb.predict(x_test)
Y_pre_train = gnb.predict(x_train)

In [None]:
train_accuracy_NB = (accuracy_score(Y_pre_train, y_train))*100
print('Accuracy for train dataset for Naive Bayes : %.2f ' % train_accuracy_NB ,'%')

test_accuracy_NB = (accuracy_score(Y_pre_test, y_test))*100
print('Accuracy for test dataset for Naive Bayes : %.2f ' % test_accuracy_NB ,'%')

In [None]:
print(classification_report(y_test, Y_pre_test))

In [None]:
# Refit four classifiers on the current x_train / y_train, score each on
# x_test / y_test with F1, keep the highest scorer and write it to disk.
# NOTE(review): the winner is chosen using the same test split its score is
# reported on, so the printed F1 is an optimistic estimate.
import joblib

# Maps model name -> (F1 score on the test split, fitted estimator)
model_scores = {}

# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=10000)
lr.fit(x_train, y_train)
lr_f1 = f1_score(y_test, lr.predict(x_test))
model_scores['LogisticRegression'] = (lr_f1, lr)

# Random Forest (rebinds the name `rfc` to a freshly fitted forest)
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=500, max_depth=32, max_features=1.0)
rfc.fit(x_train, y_train)
rfc_f1 = f1_score(y_test, rfc.predict(x_test))
model_scores['RandomForest'] = (rfc_f1, rfc)

# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(x_train, y_train)
knn_f1 = f1_score(y_test, knn.predict(x_test))
model_scores['KNN'] = (knn_f1, knn)

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(x_train, y_train)
nb_f1 = f1_score(y_test, nb.predict(x_test))
model_scores['NaiveBayes'] = (nb_f1, nb)

# Select the entry whose stored F1 (tuple element 0) is largest
best_model_name = max(model_scores, key=lambda k: model_scores[k][0])
best_f1_score, best_model = model_scores[best_model_name]

print(f"\n✅ Best Model: {best_model_name} with F1 Score: {best_f1_score:.4f}")
# Relative path: the file is written to the current working directory.
joblib.dump(best_model, "best_phishing_model.pkl")
print("✅ Model saved as 'best_phishing_model.pkl'")


In [78]:
import re
from urllib.parse import urlparse
import pandas as pd
import joblib

# Feature extraction function with specified features
def extract_features_from_url(url):
    """Build the model's feature dictionary from a URL string alone.

    Parameters
    ----------
    url : str
        The URL to describe.

    Returns
    -------
    dict
        Maps each of the 27 feature names to a number. Features that can be
        derived from the URL text (lengths, character counts, IP-address
        flag, ...) are computed; every entry marked "Placeholder" is fixed
        at 0 because it would need data this function does not have.
    """
    parsed = urlparse(url)
    hostname = parsed.hostname or ""
    path = parsed.path or ""
    query = parsed.query or ""

    # fullmatch: the *entire* hostname must be a dotted quad. A prefix match
    # would wrongly flag hosts such as "1.2.3.4.example.com" as IP addresses.
    is_ip_address = re.fullmatch(r'\d+\.\d+\.\d+\.\d+', hostname) is not None

    features = {
        "PctExtHyperlinks": 0,  # Placeholder, requires link analysis
        "PctExtResourceUrls": 0,  # Placeholder, requires resource URL analysis
        "PctNullSelfRedirectHyperlinks": 0,  # Placeholder
        "PctExtNullSelfRedirectHyperlinksRT": 0,  # Placeholder
        "NumNumericChars": sum(c.isdigit() for c in url),
        "FrequentDomainNameMismatch": 0,  # Placeholder
        "ExtMetaScriptLinkRT": 0,  # Placeholder
        "NumDash": url.count('-'),
        "SubmitInfoToEmail": 0,  # Placeholder
        "NumDots": url.count('.'),
        "PathLength": len(path),
        "QueryLength": len(query),
        "PathLevel": path.count('/'),
        "InsecureForms": 0,  # Placeholder
        "UrlLength": len(url),
        "NumSensitiveWords": 0,  # Placeholder
        "NumQueryComponents": len(query.split('&')) if query else 0,
        "PctExtResourceUrlsRT": 0,  # Placeholder
        "IframeOrFrame": 0,  # Placeholder
        "HostnameLength": len(hostname),
        "NumAmpersand": url.count('&'),
        "AbnormalExtFormActionR": 0,  # Placeholder
        "UrlLengthRT": 0,  # Placeholder
        "NumDashInHostname": hostname.count('-'),
        "IpAddress": int(is_ip_address),
        "AbnormalFormAction": 0,  # Placeholder
        "EmbeddedBrandName": 0  # Placeholder
    }

    return features

# Prediction function
def predict_phishing(url, model=None, model_path="/content/best_phishing_model.pkl"):
    """Classify a URL as ``"Phishing"`` or ``"Legitimate"``.

    Parameters
    ----------
    url : str
        URL to classify.
    model : fitted estimator, optional
        Already-loaded model exposing ``predict``. Pass it when classifying
        many URLs to avoid re-reading the file on every call.
    model_path : str, optional
        File to load with ``joblib`` when ``model`` is not given.

    Returns
    -------
    str
        ``"Phishing"`` when the model predicts 1, otherwise ``"Legitimate"``.

    Raises
    ------
    ValueError
        If the model records its training feature names and some of them are
        not produced by ``extract_features_from_url``.
    """
    # Extract features
    features = extract_features_from_url(url)

    if model is None:
        # Whichever model was saved to model_path is loaded (not necessarily a
        # random forest). joblib.load unpickles arbitrary objects, so only
        # load files you produced yourself.
        model = joblib.load(model_path)

    feature_df = pd.DataFrame([features])

    # Put the columns in the exact order the model was fitted with, when the
    # estimator recorded it; the training order is not guaranteed to match
    # the order of the extracted feature dictionary.
    expected = getattr(model, "feature_names_in_", None)
    if expected is not None:
        missing = [name for name in expected if name not in feature_df.columns]
        if missing:
            raise ValueError(
                "Model expects features that are not extracted from the URL: {}".format(missing)
            )
        feature_df = feature_df[list(expected)]

    # Predict
    prediction = model.predict(feature_df)[0]
    label = "Phishing" if prediction == 1 else "Legitimate"

    return label

# Test examples including the difficult phishing example
# One URL per entry. Every line needs its trailing comma: adjacent string
# literals without one are silently concatenated into a single element.
test_urls = [
    "https://www.google.com",  # Genuine
    "https://www.paypal.com/signin",  # Genuine
    "https://www.amazon.com/account",  # Genuine
    "http://secure-login-paypal.com.login.phishingsite.org/update.php?user=123",  # Fake
    "https://www.g00gle.com-login.verify-account.com",  # Fake
    "http://bankofamerica.security-update2025.com/login?verify=now&id=456",  # Fake
    "https://login.accounts.paypal.com-secure.verify-service.co.uk/update/profile?session=abc123&action=verify",  # Fake
    "https://www.microsoft.com/en-us/account",  # Genuine
    "https://www.ebay.com/sell",  # Genuine
    "https://www.apple.com/support",  # Genuine
    "https://secure.update.account-paypal.co.validate-service.net/login?token=x7k9p",  # Fake
    "https://login.bankofamerica.online-security.check.co/login-page.html?ref=secure",  # Fake
    "https://www.faceb00k.com-verify.user.auth.network/profile/setup?step=2&id=789",  # Fake
    "https://www.nationalgeographic.org/expeditions/arctic-2025",  # Genuine
    "https://www.redcross.org.uk/donate/emergency-appeal-2025",  # Genuine
    "https://www.tesla.com.cn/model-y/design-studio",  # Genuine
    "https://auth.userpanel.gitlab.io-secure.backup-sys.net/reset-password?code=xyz789",  # Fake
    "https://secure.vpn-service.nordvpn.com-login.checkpoint.world/verify-email.html?track=secureid",  # Fake
    "https://www.shopify.store-admin.update-qr2025.com/access?token=jkl456&mode=admin",  # Fake
    "https://auth.clientarea.paypal.com.security.verification.global-services.co/login-process?session-id=7k9p2m&verify=step1",  # Fake
    "https://secure.account-management.wellsfargo.com-verify.network.intl/login?auth-token=xyz-123-pqrs&redirect=secure",  # Fake
    "https://api.userportal.netflix.com-premium.support.global/access-control?profile=update&session=abc987&lang=en-us",  # Fake
]


# Run predictions: classify every URL in test_urls and print one line per URL.
for url in test_urls:
    try:
        result = predict_phishing(url)
        print(f"URL: {url} → {result}")
    except Exception as e:
        # Any failure for one URL is printed inline; the loop then continues
        # with the next URL.
        print(f"URL: {url} → Error: {e}")

URL: https://www.google.com → Legitimate
URL: https://www.paypal.com/signin → Legitimate
URL: https://www.amazon.com/account → Legitimate
URL: http://secure-login-paypal.com.login.phishingsite.org/update.php?user=123 → Phishing
URL: https://www.g00gle.com-login.verify-account.com → Phishing
URL: http://bankofamerica.security-update2025.com/login?verify=now&id=456 → Legitimate
URL: https://login.accounts.paypal.com-secure.verify-service.co.uk/update/profile?session=abc123&action=verify → Phishing
URL: https://www.microsoft.com/en-us/account → Phishing
URL: https://www.ebay.com/sell → Legitimate
URL: https://www.apple.com/support → Legitimate
URL: https://secure.update.account-paypal.co.validate-service.net/login?token=x7k9p → Phishing
URL: https://login.bankofamerica.online-security.check.co/login-page.html?ref=secure → Phishing
URL: https://www.faceb00k.com-verify.user.auth.network/profile/setup?step=2&id=789 → Phishing
URL: https://www.nationalgeographic.org/expeditions/arctic-2025 → 