-
Notifications
You must be signed in to change notification settings - Fork 1
/
setup.py
137 lines (123 loc) · 5.86 KB
/
setup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import classification_model as cm
import data_preprocessing as dp
import data_visualization as dv
import feature_extractor as fe
import pandas as pd
import time as t
from classification_model import classify_url, create_model, evaluate_model, find_best_hyperparameter, load_model, print_performance_metrics
# Define dataset paths.
# Raw labelled URL dataset consumed by the preprocessing step (step 1 below).
raw_data_filepath: str = 'datasets/raw_dataset.csv'
# Output of preprocessing; read by the visualization steps (step 2 below).
transformed_dataset_filepath: str = 'datasets/transformed_dataset.csv'
# Split used for model training and hyperparameter search (steps 3-5 below).
train_validation_dataset_filepath: str = "datasets/train_validation_dataset.csv"
# Held-out split used only for model evaluation.
testing_dataset_filepath: str = "datasets/testing_dataset.csv"
# Structural URL features.
# Column names of structural URL features kept during preprocessing (combined into
# `selected_features` below). Commented-out entries exist in the dataset but are
# currently excluded from the model.
structural_features = [
'ip',
'https_token',
# 'punycode',
# 'port',
# 'tld_in_path',
# 'tld_in_subdomain',
# 'abnormal_subdomain',
'prefix_suffix',
'shortening_service',
# 'domain_in_brand',
# 'brand_in_subdomain',
# 'brand_in_path',
'suspicious_tld',
'statistical_report'
]
# Statistical URL features.
# Column names of count/ratio-style URL features kept during preprocessing (combined
# into `selected_features` below). Commented-out entries exist in the dataset but are
# currently excluded from the model.
statistical_features = [
'length_url',
'length_hostname',
'nb_dots',
'nb_hyphens',
# 'nb_at',
'nb_qm',
# 'nb_and',
# 'nb_or',
# 'nb_eq',
# 'nb_underscore',
# 'nb_tilde',
'nb_percent',
'nb_slash',
# 'nb_star',
# 'nb_colon',
# 'nb_comma',
# 'nb_semicolon',
# 'nb_dollar',
# 'nb_space',
'nb_www',
# 'nb_com',
# 'nb_dslash',
# 'http_in_path',
'ratio_digits_url',
'ratio_digits_host',
# 'nb_subdomains',
# 'length_words_raw',
'char_repeat',
# 'shortest_words_raw',
# 'shortest_word_host',
# 'shortest_word_path',
# 'longest_words_raw',
# 'longest_word_host',
# 'longest_word_path',
'avg_words_raw',
'avg_word_host',
'avg_word_path',
'phish_hints'
]
# Full feature set fed to preprocessing and classification: structural + statistical.
selected_features = structural_features + statistical_features
# Name of the label column in the dataset CSVs.
target_header: str = 'status'
pos_label: str = 'legitimate' # Positive label used for the confusion matrix
# ====================================================================================================
# 1. Preprocess the data
# ====================================================================================================
# Reduce the raw dataset to the selected feature columns plus the target column,
# producing the transformed/split datasets used by the steps below.
dp.preprocess_data(raw_data_filepath, selected_features, target_header)
# ====================================================================================================
# 2. Visualize the data
# ====================================================================================================
# Only plot a feature family when at least one of its features is enabled above.
# (Idiomatic truthiness check instead of `len(...) > 0`; the scraped source had also
# lost the indentation of these bodies, which is restored here.)
if structural_features:
    dv.visualize_structural_feature_distribution(transformed_dataset_filepath, structural_features, target_header)
    dv.visualize_feature_correlation(transformed_dataset_filepath, 'Structural', structural_features, target_header)
if statistical_features:
    dv.visualize_statistical_feature_distribution(transformed_dataset_filepath, statistical_features, target_header)
    dv.visualize_feature_correlation(transformed_dataset_filepath, 'Statistical', statistical_features, target_header)
# ====================================================================================================
# 3. Create an untuned model
# ====================================================================================================
# Baseline: train with default hyperparameters (None), then evaluate on the test split.
print('untuned model:')
untuned_trained_model = create_model(train_validation_dataset_filepath, target_header, None)
baseline_metrics = evaluate_model(
    untuned_trained_model,
    testing_dataset_filepath,
    target_header,
    pos_label,
    'models/untuned_model_evaluation_results.txt',
)
print_performance_metrics(*baseline_metrics)
# ====================================================================================================
# 4. Find best hyperparameter settings
# ====================================================================================================
# Search the train/validation split for the best hyperparameter configuration.
best_hp_settings = find_best_hyperparameter(train_validation_dataset_filepath, target_header)
print(f'best hyperparameter settings:{best_hp_settings}\n')
# ====================================================================================================
# 5. Create a tuned model
# ====================================================================================================
# Retrain with the tuned hyperparameters and evaluate on the same test split.
print('tuned model:')
tuned_trained_model = create_model(train_validation_dataset_filepath, target_header, best_hp_settings)
tuned_metrics = evaluate_model(
    tuned_trained_model,
    testing_dataset_filepath,
    target_header,
    pos_label,
    'models/tuned_model_evaluation_results.txt',
)
print_performance_metrics(*tuned_metrics)
# ====================================================================================================
# 6. Visualize feature importance
# ====================================================================================================
# Reload the tuned model from its persisted artifact so this step exercises the
# saved file rather than the in-memory object from step 5.
tuned_trained_model = load_model('models/tuned_model.joblib')
dv.visualize_feature_importance(tuned_trained_model, train_validation_dataset_filepath, target_header)
# ====================================================================================================
# 7. Classify random URL
# ====================================================================================================
# The model loaded above is reused here; the original's second, identical
# load_model('models/tuned_model.joblib') call was redundant and has been removed.
phishing_url = "http://shadetreetechnology.com/V4/validation/a111aedc8ae390eabcfa130e041a10a4" # Retrieved from /datasets/raw_dataset.csv
print('Classifying Phishing URL: ' + phishing_url)
result = classify_url(tuned_trained_model, structural_features, statistical_features, phishing_url)
print('Result: ' + result + '\n')
# ====================================================================================================
# Setup Completion
# ====================================================================================================
print('Setup complete.')
print('End of program.')