In [1]:
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd

In [2]:
import pandas as pd
data = pd.read_csv('phishing.csv')


data.head()


Unnamed: 0,Index,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,...,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,0,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
1,1,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
2,2,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
3,3,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1
4,4,-1,0,-1,1,-1,-1,1,1,-1,...,1,1,1,1,1,-1,1,-1,-1,1


In [3]:
print("Dataset has rows {} and columns {}".format(*data.shape))

Dataset has rows 11054 and columns 32


In [4]:
#Listing the features of the dataset

data.columns

Index(['Index', 'UsingIP', 'LongURL', 'ShortURL', 'Symbol@', 'Redirecting//',
       'PrefixSuffix-', 'SubDomains', 'HTTPS', 'DomainRegLen', 'Favicon',
       'NonStdPort', 'HTTPSDomainURL', 'RequestURL', 'AnchorURL',
       'LinksInScriptTags', 'ServerFormHandler', 'InfoEmail', 'AbnormalURL',
       'WebsiteForwarding', 'StatusBarCust', 'DisableRightClick',
       'UsingPopupWindow', 'IframeRedirection', 'AgeofDomain', 'DNSRecording',
       'WebsiteTraffic', 'PageRank', 'GoogleIndex', 'LinksPointingToPage',
       'StatsReport', 'class'],
      dtype='object')

In [5]:
# Splitting the dataset into independent (X) and dependent (y) features
X = data.drop(["Index", "class"], axis=1)  # Drop 'Index' and the target column 'class'
y = data["class"]  # Target variable


In [6]:
# Splitting the dataset into train and test sets: 80-20 split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
base_learners = [
    ('gb', GradientBoostingClassifier(learning_rate=0.7, max_depth=4)),
    ('svc', SVC(probability=True)),
    ('rf', RandomForestClassifier(n_estimators=100)),
    ('knn', KNeighborsClassifier()),
    ('mlp', MLPClassifier(max_iter=1000)),
    ('xgb', XGBClassifier(use_label_encoder=False)),
    ('dt', DecisionTreeClassifier()),  # Add Decision Tree
    ('lr', LogisticRegression())
]


In [8]:
meta_learner = MLPClassifier()

In [9]:
stacking_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=10,
    passthrough=False,
    n_jobs=-1
)


In [10]:

import pandas as pd
from sklearn.ensemble import StackingClassifier
from category_encoders import TargetEncoder

# Create a TargetEncoder object
encoder = TargetEncoder()

# Fit the encoder on the training data and transform both training and testing data
X_train_encoded = encoder.fit_transform(X_train, y_train)
X_test_encoded = encoder.transform(X_test)

# Now, use the encoded data for training the model
stacking_model.fit(X_train_encoded, y_train)



In [11]:
X_train_stack = stacking_model.transform(X_train)
X_test_stack = stacking_model.transform(X_test)

In [12]:
mlp_meta = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=1000, random_state=42)
mlp_meta.fit(X_train_stack, y_train)
y_pred_mlp_stack = mlp_meta.predict(X_test_stack)

In [13]:
#_pred = stacking_model.predict(X_test_encoded)

In [14]:
results = {
     'Model': 'MLP on Stacking Output',
    'Accuracy': accuracy_score(y_test, y_pred_mlp_stack),
    'F1 Score': f1_score(y_test, y_pred_mlp_stack),
    'Precision': precision_score(y_test, y_pred_mlp_stack),
    'Recall': recall_score(y_test, y_pred_mlp_stack)
}

In [15]:
results_data = pd.DataFrame([results])
print(results_data)

                    Model  Accuracy  F1 Score  Precision    Recall
0  MLP on Stacking Output  0.960199  0.964372   0.964372  0.964372
