In [75]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, accuracy_score
import joblib

In [76]:
df = pd.read_csv("Phishing.csv")

In [77]:
df.head()

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,tld,charcompvowels,charcompace,ldl_url,...,SymbolCount_FileName,SymbolCount_Extension,SymbolCount_Afterpath,Entropy_URL,Entropy_Domain,Entropy_DirectoryName,Entropy_Filename,Entropy_Extension,Entropy_Afterpath,URL_Type_obf_Type
0,0,2,12,5.5,8,4.083334,2,15,7,0,...,-1,-1,-1,0.676804,0.860529,-1.0,-1.0,-1.0,-1.0,benign
1,0,3,12,5.0,10,3.583333,3,12,8,2,...,1,0,-1,0.715629,0.776796,0.693127,0.738315,1.0,-1.0,benign
2,2,2,11,4.0,5,4.75,2,16,11,0,...,2,0,1,0.677701,1.0,0.677704,0.916667,0.0,0.898227,benign
3,0,2,7,4.5,7,5.714286,2,15,10,0,...,0,0,-1,0.696067,0.879588,0.818007,0.753585,0.0,-1.0,benign
4,19,2,10,6.0,9,2.25,2,9,5,0,...,5,4,3,0.747202,0.8337,0.655459,0.829535,0.83615,0.823008,benign


In [78]:
df_small = df[['Querylength', "domain_token_count", "URL_Type_obf_Type"]].sample(len(df))

In [79]:
df_small["URL_Type_obf_Type"] = df_small.URL_Type_obf_Type.replace({"benign":0, "phishing":1})

In [80]:
X = df_small.drop("URL_Type_obf_Type", axis=1)
y = df_small.URL_Type_obf_Type

In [81]:
X

Unnamed: 0,Querylength,domain_token_count
6697,0,2
5220,0,2
12128,0,2
13566,0,3
10018,0,3
...,...,...
3621,5,2
10822,4,7
3158,0,2
6741,0,2


In [82]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=23)

In [83]:
l = LogisticRegression()
l.fit(X_train, y_train)

In [84]:
df_small.URL_Type_obf_Type.value_counts(normalize=True)

URL_Type_obf_Type
0    0.506345
1    0.493655
Name: proportion, dtype: float64

In [85]:
accuracy_score(y_test, l.predict(X_test)), accuracy_score(y_train, l.predict(X_train))

(0.8158750813272609, 0.8130643455625153)

In [86]:
f1_score(y_test, l.predict(X_test)), f1_score(y_train, l.predict(X_train))

(0.7884902840059791, 0.7895989745467863)

In [87]:
roc_auc_score(y_test, l.predict_proba(X_test)[:,1]), roc_auc_score(y_train, l.predict_proba(X_train)[:,1])

(0.8329452174296014, 0.8349283018628173)

In [88]:
confusion_matrix(y_test, l.predict(X_test))

array([[1453,  115],
       [ 451, 1055]])

In [89]:
import pickle
pickle.dump(l, open("model_small.pkl", "wb"))

In [90]:
# Write a function to extract from a url:

from urllib.parse import urlparse, parse_qs

def extract_features(url):
    parsed_url = urlparse(url)

    # Extract query params
    query_length = len(parse_qs(parsed_url.query))

    # Extract domain
    domain_tokens = parsed_url.netloc.split(".")
    domain_token_count = len(domain_tokens)

    return [query_length, domain_token_count]

In [91]:
results = extract_features("http://www.cultivateyourlife.com/data/santander.co.uk/retail.php?http://www.santander.co.uk/csgs/Satellite?appID=abbey.internet.Abbeycom&amp")

In [92]:
results_df = pd.DataFrame([results], columns=["Querylength", "domain_token_count"])

In [93]:
new_pred = l.predict(results_df)

In [94]:
new_pred

array([1])

In [95]:
result_2 = extract_features("https://www.google.com/search?q=hello+world&amp")

In [96]:
result_2_df = pd.DataFrame([result_2], columns=["Querylength", "domain_token_count"])

In [97]:
new_pred_2 = l.predict(result_2_df)

In [98]:
new_pred_2

array([1])

In [165]:
phishing_df = pd.read_csv("phishing_dataset.csv")

In [178]:
phishing_df.to_csv('new_phishing.csv')

In [185]:
phishing_df = phishing_df.rename(columns={phishing_df.columns[0]: 'URLs'})

In [193]:
phishing_df.to_csv('new_phishing.csv', index=False)

In [195]:
_df = pd.read_csv('new_phishing.csv') ; _df

Unnamed: 0,URLs
0,http://bid.openx.net/json?amp;amp;amp;amp;cid;...
1,http://webmail2.centurytel.net/hwebmail/servic...
2,http://www.google.com.ng/imgres?imgurl=http://...
3,http://webmail2.centurytel.net/hwebmail/servic...
4,http://www.liceonuzzi.it/cmd=_Inf/connectionSt...
...,...
9959,http://highedgesolar.com/nw/includes/
9960,http://jimfangimporters.yolasite.com/
9961,http://liuheng.chengxuren.com/Images/
9962,http://mcnaotempreco.net/site/portal/


In [172]:
def train_multiple(df: pd.DataFrame):

    input_df = df.copy()
    features = []
    for url in df["URLs"]:
        feature_extracted = extract_features(url)
        features.append(feature_extracted)

    new_df = pd.DataFrame(features, columns=["Querylength", "domain_token_count"])

    pred = l.predict(new_df)

    input_df["Prediction"] = pred

    return input_df

    

In [173]:
feature_extracted = train_multiple(phishing_df)

In [174]:
feature_extracted

Unnamed: 0,URLs,Prediction
0,http://bid.openx.net/json?amp;amp;amp;amp;cid;...,1
1,http://webmail2.centurytel.net/hwebmail/servic...,1
2,http://www.google.com.ng/imgres?imgurl=http://...,1
3,http://webmail2.centurytel.net/hwebmail/servic...,1
4,http://www.liceonuzzi.it/cmd=_Inf/connectionSt...,1
...,...,...
9959,http://highedgesolar.com/nw/includes/,0
9960,http://jimfangimporters.yolasite.com/,1
9961,http://liuheng.chengxuren.com/Images/,1
9962,http://mcnaotempreco.net/site/portal/,0


In [175]:
feature_extracted.Prediction.value_counts()

Prediction
1    7594
0    2370
Name: count, dtype: int64

In [176]:
phishing_df

Unnamed: 0,URLs
0,http://bid.openx.net/json?amp;amp;amp;amp;cid;...
1,http://webmail2.centurytel.net/hwebmail/servic...
2,http://www.google.com.ng/imgres?imgurl=http://...
3,http://webmail2.centurytel.net/hwebmail/servic...
4,http://www.liceonuzzi.it/cmd=_Inf/connectionSt...
...,...
9959,http://highedgesolar.com/nw/includes/
9960,http://jimfangimporters.yolasite.com/
9961,http://liuheng.chengxuren.com/Images/
9962,http://mcnaotempreco.net/site/portal/


In [183]:
df = pd.read_csv("Phishing.csv")
# col_length = True if df.shape[1] == 1 else False
if not df.shape[1] == 1:
    print("CSV has more than one column")

CSV has more than one column


In [184]:
import streamlit as st

uploaded_file = st.sidebar.file_uploader('Upload your CSV file here.', type=['csv'])




2023-06-16 15:42:11.271 
  command:

    streamlit run /home/mubarak/Phishing/env/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]


NoneType

In [152]:
result_2_df = pd.DataFrame(feature_extracted, columns=["Querylength", "domain_token_count"])

In [153]:
result_2_df

Unnamed: 0,Querylength,domain_token_count
0,6,3
1,1,3
2,4,4
3,1,3
4,2,3
...,...,...
9959,0,2
9960,0,3
9961,0,3
9962,0,2


In [None]:
#Feature Selection
from mlxtend.feature_selection import SequentialFeatureSelector

In [None]:
sfs = SequentialFeatureSelector(LogisticRegression(),
                                k_features="best",
                                forward=True,
                                scoring='accuracy',
                                cv=5,
                                )
selected_features = sfs.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
selected_features

SequentialFeatureSelector(estimator=LogisticRegression(), k_features=(1, 2),
                          scoring='accuracy')