In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
dataset = pd.read_csv('data/phishing_site_urls.csv')

In [3]:
dataset.head(3)

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad


In [4]:
dataset['Label'].unique()

array(['bad', 'good'], dtype=object)

In [5]:
dataset['Label'].replace({'good':0, 'bad':1}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['Label'].replace({'good':0, 'bad':1}, inplace=True)
  dataset['Label'].replace({'good':0, 'bad':1}, inplace=True)


In [6]:
dataset['Label'].unique()

array([1, 0], dtype=int64)

In [7]:
x = dataset['URL']
y = dataset['Label']

In [8]:
x_train ,x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [24]:
feature_extraction = TfidfVectorizer(min_df=1)

In [25]:
x_train_feature = feature_extraction.fit_transform(x_train)
x_test_feature = feature_extraction.transform(x_test)

In [26]:
phishing_model = LogisticRegression()

In [27]:
phishing_model.fit(x_train_feature, y_train)

In [28]:
y_pred = phishing_model.predict(x_test_feature)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     78670
           1       0.97      0.87      0.92     31200

    accuracy                           0.96    109870
   macro avg       0.96      0.93      0.94    109870
weighted avg       0.96      0.96      0.96    109870



In [29]:
phishing_model.score(x_train_feature, y_train)*100, phishing_model.score(x_test_feature, y_test)*100

(96.83509452165761, 95.62938017657231)

In [30]:
input = [dataset['URL'][1]]
print(input)
prediction = phishing_model.predict(feature_extraction.transform(input))
if prediction == 0:
    print('safe')
else:
    print('danger')

['www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrcmd=_home-customer&nav=1/loading.php']
danger


In [31]:
dataset['URL'][0]

'nobell.it/70ffb52d079109dca5664cce6f317373782/login.SkyPe.com/en/cgi-bin/verification/login/70ffb52d079109dca5664cce6f317373/index.php?cmd=_profile-ach&outdated_page_tmpl=p/gen/failed-to-load&nav=0.5.1&login_access=1322408526'

In [32]:
dataset['URL'][40]

'stthomasedu.ucoz.ua/microsoft.htm'