In [16]:
# basic imports for processing, graphing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyarrow.parquet
import pickle
import pkg_resources
#pkg_resources.require('pandas==1.3.5')

# machine learning models
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier

# optimiziation checks
import optuna
print("Completed.")

  import pkg_resources


Completed.


  from .autonotebook import tqdm as notebook_tqdm


In [17]:
# loading the necessary data
data = pd.read_parquet("Training.parquet")

# see what we have
data.head()
#data.info()

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,https://www.todayshomeowner.com/how-to-make-ho...,82,23,0,2,7,0,0,0,0,...,1,1,0,240,8892,67860,0,1,4,legitimate
1,http://thapthan.ac.th/information/confirmation...,93,14,1,2,0,0,0,0,0,...,1,0,1,0,2996,4189860,0,1,2,phishing
2,http://app.dialoginsight.com/T/OFC4/L2S/3888/B...,121,21,1,3,0,0,0,0,0,...,1,1,0,30,2527,346022,0,1,3,phishing
3,https://www.bedslide.com,24,16,0,2,0,0,0,0,0,...,0,0,0,139,7531,1059151,0,0,4,legitimate
4,https://tabs.ultimate-guitar.com/s/sex_pistols...,73,24,0,3,1,0,0,0,0,...,0,0,0,3002,7590,635,0,1,5,legitimate


In [18]:
# First step is preprocessing the data. URL isn't necessary because we have the characterizations of the others.
urls = data['url']
numeric_data = data.drop(['url'], axis=1)

# Now, we'll encode status (legitimate or phishing) as a boolean (0 or 1).
numeric_data['status'] = numeric_data['status'].replace({'phishing':1, 'legitimate':0})

x = numeric_data.drop('status', axis=1)
y = numeric_data['status']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
# This data has many dimensions. It might be worth performing Principal Component Analysis, but since
# their values don't seem to be particularly related it might not yield much benefit. That thought might
# be revisited later.

In [19]:
# implementing scaler so that we can maintain information while reducing spread
scaler = StandardScaler()
scaler.fit(x_train)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [20]:
# generating a neural network fitted with our training data.
model = MLPClassifier(hidden_layer_sizes=(15,15,15), max_iter=5000)
model.fit(x_train, y_train.values.ravel())

In [68]:
# test the accuracy of the generated model by generating and comparing the testing data to their verified results.
predictions = model.predict(x_test)
predictions_probability = model.predict_proba(x_test)
print(predictions[3])
print(predictions_probability[3][predictions[3]])
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

1
0.9999999910949474
[[901  50]
 [ 51 913]]
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       951
           1       0.95      0.95      0.95       964

    accuracy                           0.95      1915
   macro avg       0.95      0.95      0.95      1915
weighted avg       0.95      0.95      0.95      1915



Now we have a strong model to work with. 95% is an acceptable result for now. TODO is hyperparameter tuning with Optuna to create an optimized model, but for now we can use this as a starting spot. Now, we'll export our data.

In [22]:
# serializing the finished model so that it can be saved long-term
with open('phishing-model-v1.pkl', 'wb') as file:
    pickle.dump(model, file)
with open('model-scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [72]:
# handle posting the data directly to CockroachDB
import os
import psycopg2
from dotenv import load_dotenv
from sqlalchemy import create_engine

load_dotenv(override=True)

database_url = os.getenv('DATABASE_URL')


engine = create_engine(database_url)

con = engine.connect()

pred_data = pd.DataFrame(predictions, columns=['prediction'])
pred_prob_data = pd.DataFrame(predictions_probability, columns=['probability_legit', 'probability_phishy'])

def get_selected_prob(item):
    return max(item['probability_legit'], item['probability_phishy'])

pred_prob_data['probability'] = pred_prob_data.apply(lambda item: get_selected_prob(item), axis=1)

pred_data['probability'] = pred_prob_data['probability']

pred_data.to_sql('Prediction', con=con, if_exists='replace')


new_data = numeric_data
new_data['url'] = urls
new_data.to_sql("ModelData", con=con, if_exists='replace')
# data.to_sql('ModelData', con=con, if_exists='replace')


142

In [73]:
# testing model accuracy with sequestered data due to confusingly inaccurate responses
# discovered issue was due to scaler transformation not being done on training data. Exporting the trained scaler and
# using it to scale the testing data for testing accuracy. 

unloaded_scaler = pickle.load(open('model-scaler.pkl', 'rb'))
with open('phishing-model-v1.pkl', 'rb') as file:
    model = pickle.load(file)
    data = pd.read_parquet("Testing.circ")
    urls = data['url']
    numeric_data = data.drop(['url'], axis=1)
    numeric_data['status'] = numeric_data['status'].replace({'phishing':1, 'legitimate':0})
    with open('numeric_data.pkl', 'wb') as file:
        pickle.dump(numeric_data, file)
    x = numeric_data.drop('status', axis=1)
    y = numeric_data['status']
    x_trained = unloaded_scaler.transform(x)
    predictions = model.predict(x_trained)
    predictions_probability = model.predict_proba(x_trained)
    print(predictions[1])
    print(predictions_probability[1])
    print(confusion_matrix(y, predictions))
    print(classification_report(y, predictions))

# serializing all files for direct use in API, Vercel doesn't allow large functions and the models go above their guidelines. The code
# that runs it live will remain on the main branch.
with open('predictions.pkl', 'wb') as file:
    pickle.dump(predictions, file)
with open('predictions_probability.pkl', 'wb') as file:
    pickle.dump(predictions_probability, file)
with open('urls.pkl', 'wb') as file:
    pickle.dump(urls, file)

0
[9.99999713e-01 2.86982746e-07]
[[1775  111]
 [ 104 1782]]
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      1886
           1       0.94      0.94      0.94      1886

    accuracy                           0.94      3772
   macro avg       0.94      0.94      0.94      3772
weighted avg       0.94      0.94      0.94      3772



In [None]:
new_data = pd.DataFrame(x_test, columns=data.drop(['status'], axis=1).columns)
new_data['status'] = y_test.reset_index(drop=True)

ValueError: Shape of passed values is (1915, 87), indices imply (1915, 89)