In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the phishing dataset from CSV.
# Per the original notes: each sample has 30 website parameters plus a class
# label (1 or -1) identifying it as a phishing website or not.
DATASET_PATH = 'phishing.csv'
dataset = pd.read_csv(DATASET_PATH)

In [3]:
# Preview the first five rows to sanity-check the columns and value encoding.
dataset.head(n=5)

Unnamed: 0,Index,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,...,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,0,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
1,1,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
2,2,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
3,3,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1
4,4,-1,0,-1,1,-1,-1,1,1,-1,...,1,1,1,1,1,-1,1,-1,-1,1


In [4]:
# Split the frame into the label vector (y) and the feature matrix (X).
# NOTE(review): the head() preview lists an `Index` column; it stays in X here
# and is therefore fed to the model as a feature. It looks like a row
# identifier — confirm whether it should be dropped (the same change would then
# be needed wherever new data is prepared for prediction).
y = dataset['class']
X = dataset.drop(columns=['class'])

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Standardize the features. The scaler is fitted on the training split only and
# the same fitted statistics are then applied to both splits, so nothing about
# the test rows influences the scaling.
# (Optional step, but it can be beneficial for some algorithms.)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
# Create a Decision Tree classifier (other classifiers, e.g. a random forest, can be swapped in here).
# A fixed random_state makes the fitted tree, and any metrics computed from it,
# reproducible across runs; 42 matches the seed used for the train/test split.
model = DecisionTreeClassifier(random_state=42)

In [8]:
# Alternative model (disabled). Enabling it also requires
# `from sklearn.ensemble import RandomForestClassifier`, which is not among the
# imports in the first cell.
# model = RandomForestClassifier(n_estimators=100)  # Example with RandomForest

# Train the model on the scaled training split
model.fit(X_train, y_train)

In [9]:
# Make predictions on the held-out test set; these are compared against y_test during evaluation
y_pred = model.predict(X_test)

In [10]:
# Evaluate the model on the held-out test split.
# The metrics compare y_test with y_pred, so they describe test performance,
# not training performance; the printed label says so explicitly.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f'Test Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{classification_rep}')

Training Accuracy: 0.9547715965626413
Confusion Matrix:
[[ 926   50]
 [  50 1185]]
Classification Report:
              precision    recall  f1-score   support

          -1       0.95      0.95      0.95       976
           1       0.96      0.96      0.96      1235

    accuracy                           0.95      2211
   macro avg       0.95      0.95      0.95      2211
weighted avg       0.95      0.95      0.95      2211



In [11]:
# Load the websites to be scored by the trained model.
NEW_DATA_PATH = 'new_data.csv'
new_data = pd.read_csv(NEW_DATA_PATH)

In [12]:
# Build the feature matrix for the new data, scaled with the already-fitted scaler.
# Data to be scored normally carries no label, so `class` is dropped only if it
# is present (errors='ignore') instead of raising KeyError on unlabeled input.
X_new = scaler.transform(new_data.drop(columns=['class'], errors='ignore'))

In [13]:
# Make predictions on the new data (one predicted class label per row of X_new)
new_predictions = model.predict(X_new)

In [14]:
# Report one human-readable line per website, numbered from 1.
# NOTE(review): this assumes label 1 means phishing and -1 means legitimate —
# TODO confirm against the dataset's documentation.
for site_number, predicted_label in enumerate(new_predictions, start=1):
    if predicted_label == 1:
        print(f"Website {site_number} is predicted to be phishing.")
    elif predicted_label == -1:
        print(f"Website {site_number} is predicted to be legitimate or not phishing.")

Website 1 is predicted to be legitimate or not phishing.
Website 2 is predicted to be phishing.
Website 3 is predicted to be phishing.
Website 4 is predicted to be legitimate or not phishing.
Website 5 is predicted to be phishing.
Website 6 is predicted to be legitimate or not phishing.
Website 7 is predicted to be phishing.
Website 8 is predicted to be legitimate or not phishing.
Website 9 is predicted to be legitimate or not phishing.
Website 10 is predicted to be phishing.
Website 11 is predicted to be legitimate or not phishing.
Website 12 is predicted to be phishing.
Website 13 is predicted to be legitimate or not phishing.
Website 14 is predicted to be legitimate or not phishing.
Website 15 is predicted to be phishing.
Website 16 is predicted to be legitimate or not phishing.
Website 17 is predicted to be phishing.
Website 18 is predicted to be phishing.
Website 19 is predicted to be legitimate or not phishing.
Website 20 is predicted to be phishing.
Website 21 is predicted to be