
#### Phishing is a method of trying to gather personal information like login credentials or credit card information using deceptive e-mails or  websites.

#### Phishing websites are created to dupe unsuspecting users into thinking they are on a legitimate site. The criminals will spend a lot of time making the site seem as credible as possible and many sites will appear almost indistinguishable from the real thing 

#### So the essence of this code is to determine which sites are used as phishing sites from a given dataset using natural language processing and logistic regression

In [None]:
# Importing some useful libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns  
import time 

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from nltk.tokenize import RegexpTokenizer  
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import pickle

In [None]:
# Loading the dataset
df = pd.read_csv(r"C:\Users\Olamzkid\Documents\Final Year Project\Dataset\phishing_site_urls Combined.csv")

# Drop rows with NaN values in 'URL' or 'Label'. The explicit .copy() makes the
# result an independent frame, so the column assignments below do not write
# into a filtered result (which pandas may flag with SettingWithCopyWarning).
df = df.dropna(subset=['URL', 'Label']).copy()

# Convert all URLs to strings to avoid TypeError
df['URL'] = df['URL'].astype(str)

# Tokenization: keep runs of ASCII letters only; blank URLs give an empty list
tokenizer = RegexpTokenizer(r'[A-Za-z]+')
df['text_tokenized'] = df.URL.map(lambda t: tokenizer.tokenize(t) if isinstance(t, str) and t.strip() else [])

# Stemming: reduce every token to its Snowball (English) stem
stemmer = SnowballStemmer("english")
df['text_stemmed'] = df['text_tokenized'].map(lambda l: [stemmer.stem(word) for word in l])

# Joining Stems back into one space-separated string per URL
df['text_sent'] = df['text_stemmed'].map(lambda l: ' '.join(l))

# Drop any rows where 'text_sent' is empty after preprocessing (URLs with no
# alphabetic characters). Copy again so later cells can add or overwrite
# columns on an independent frame. The original row labels are kept, so the
# index may contain gaps after this step.
df = df[df['text_sent'].str.strip() != ''].copy()

# Data summary and visualization
# NOTE: a notebook renders only the last expression of a cell, so of the four
# calls below only df.info() (which prints) produces output here.
df.head()
df.info()
df.shape
df.isnull().sum()

# Visualizing the distribution of labels
sns.countplot(x="Label", data=df)


In [None]:
# Preview the first rows of the cleaned frame, including the derived text columns
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the frame
df.info()

In [None]:
# (rows, columns) of the frame after cleaning
df.shape

### About data
#### The dataset was obtained from kaggle.com, an open machine learning research community, and from PhishTank, a verified source of cybersecurity data on phishing.
#### The two datasets combined consist of 629325 rows and 2 columns. The first column contains the website links and the second column states whether the site is good or bad (phishing).

In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# Bar chart of how many rows carry each value of the 'Label' column
sns.countplot(x="Label",data=df)

### PREPROCESSING

#### Now we can vectorize the URLs. We can gather words from the URLs using a tokenizer.
#### Tokenization is the process of breaking a piece of text down into smaller units called tokens, which makes it easier for the computer to understand and process. In this instance the tokenizer ignores numbers and special characters, extracting only sequences of alphabetic characters.
### RegexpTokenizer
#### we are able to extract the tokens from string by using regular expression with RegexpTokenizer() method.
#### [A-Za-z]+ is used to match sequences of one or more alphabetic characters (both uppercase and lowercase).

In [None]:

# Build the helpers first: a letters-only tokenizer and an English stemmer
tokenizer = RegexpTokenizer(r'[A-Za-z]+')
stemmer = SnowballStemmer("english")

# Step 1 - split each URL into alphabetic tokens; anything that is not a
# non-blank string produces an empty token list
df['text_tokenized'] = df['URL'].map(
    lambda url: tokenizer.tokenize(url) if isinstance(url, str) and url.strip() else []
)

# Step 2 - stem every token of every URL
df['text_stemmed'] = df['text_tokenized'].map(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)

# Step 3 - glue the stems back together with single spaces
df['text_sent'] = df['text_stemmed'].map(' '.join)


In [None]:
# Fetch all the words from the first URL. Use positional .iloc[0]: rows were
# dropped earlier without resetting the index, so the label 0 is not
# guaranteed to exist and df.URL[0] could raise KeyError.
tokenizer.tokenize(df.URL.iloc[0])

In [None]:
# Tokenize every URL and report how long the pass takes
print('Getting words tokenized ...')
t0 = time.perf_counter()
df['text_tokenized'] = df['URL'].map(tokenizer.tokenize)
t1 = time.perf_counter() - t0
print('Time taken', t1, 'sec')

In [None]:
# Show five randomly chosen rows to spot-check the tokenized column
df.sample(5)

### SnowballStemmer
#### Snowball is a small string processing language that gives the root words
#### It is essentially an algorithm for reducing words to their base or root form. In other words it shortens words (e.g. "running" becomes "run" and "jumping" becomes "jump"), so that different forms of the same word are treated as the same token.

In [None]:
# Snowball stemmer configured for English; used below to stem the URL tokens
stemmer = SnowballStemmer("english") # choose a language

In [None]:
# Stem each token list and report how long the pass takes
print('Getting words stemmed ...')
t0 = time.perf_counter()
df['text_stemmed'] = df['text_tokenized'].map(
    lambda tokens: list(map(stemmer.stem, tokens))
)
t1 = time.perf_counter() - t0
print('Time taken', t1, 'sec')

In [None]:
# Show five randomly chosen rows to spot-check the stemmed column
df.sample(5)

In [None]:
# Join each list of stems into one space-separated string and time the pass
print('Get joiningwords ...')
t0 = time.perf_counter()
df['text_sent'] = df['text_stemmed'].map(' '.join)
t1 = time.perf_counter() - t0
print('Time taken', t1, 'sec')

In [None]:
# Separate the rows by class so each group can be inspected on its own
bad_sites = df[df['Label'].eq('bad')]
good_sites = df[df['Label'].eq('good')]

In [None]:
# Preview the first rows labelled 'bad'
bad_sites.head()

In [None]:
# Preview the first rows labelled 'good'
good_sites.head()

In [None]:
# Preview the full frame again with the tokenized, stemmed and joined columns
df.head()

## Creating Model
### CountVectorizer- Convert a collection of text documents to a matrix of token counts

In [None]:
# Bag-of-words vectorizer with default settings
cv = CountVectorizer()

In [None]:
# Learn the vocabulary from the joined stems and build the token-count matrix
feature = cv.fit_transform(df.text_sent)  # transform all the text we tokenized and stemmed

In [None]:
# Densify only the first five rows so the transformed features can be displayed
feature[:5].toarray()  # convert sparse matrix into array to print transformed features

In [None]:
from sklearn.model_selection import train_test_split


In [None]:

# Exploration only: vectorize the joined stems with a default CountVectorizer
cv = CountVectorizer()
feature = cv.fit_transform(df['text_sent'])

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
trainX, testX, trainY, testY = train_test_split(
    feature,
    df['Label'],
    test_size=0.2,
    random_state=42,
)

# Baseline 1: logistic regression (fit() returns the fitted estimator)
lr = LogisticRegression().fit(trainX, trainY)
print('Training Accuracy :', lr.score(trainX, trainY))
print('Testing Accuracy :', lr.score(testX, testY))

# Baseline 2: multinomial naive Bayes on the same split
mnb = MultinomialNB().fit(trainX, trainY)
print('Training Accuracy :', mnb.score(trainX, trainY))
print('Testing Accuracy :', mnb.score(testX, testY))


### LogisticRegression
#### Logistic Regression is a machine learning classification algorithm that is used to predict the probability of a categorical dependent variable. In a binary classification task the outcome is coded as 1 (yes, success, etc.) or 0 (no, failure, etc.).
#### Logistic regression is a statistical model commonly used for binary classification, where the outcome is one of two possible classes (e.g., spam or not spam, phishing or not phishing).

When applied to text classification tasks:

Tokenization: This process involves breaking down the text into individual units, such as words or tokens.

Vectorization: After tokenization, the text data is converted into numerical format (vectors) 

Logistic Regression: The logistic regression model then takes these vectors as input features. It learns the relationship between the features (vectorized words) and the binary outcome (e.g., phishing or not phishing).

That's why I'm using logistic regression in this particular code.

In [None]:
from sklearn.linear_model import LogisticRegression


In [None]:
# Create a logistic regression classifier with default settings and fit it on
# the training split; the fitted estimator is the cell's displayed value
lr = LogisticRegression()
lr.fit(trainX,trainY)

In [None]:
# Mean accuracy of the fitted logistic regression on the held-out test split
lr.score(testX,testY)

#### Logistic Regression provides 96% accuracy. Now we will store the score in a dictionary so that we can find which model performs best.


In [None]:
# Scoreboard keyed by model name; seeded with the logistic regression test
# accuracy rounded to two decimals
Scores_ml = {
    'Logistic Regression': np.round(lr.score(testX, testY), 2),
}

In [None]:
# Evaluate the logistic regression model on the train/test split
print('Training Accuracy:', lr.score(trainX, trainY))
print('Testing Accuracy:', lr.score(testX, testY))

# Predict once and reuse the result for both the matrix and the report
y_pred = lr.predict(testX)

# Class labels come from the true test labels, in sorted order
labels = sorted(set(testY))
predicted_labels = [f'Predicted:{cls}' for cls in labels]
actual_labels = [f'Actual:{cls}' for cls in labels]

# Pass the label order explicitly so the matrix rows/columns are guaranteed to
# line up with the DataFrame headers built from the same list
con_mat = confusion_matrix(testY, y_pred, labels=labels)

# Convert the confusion matrix to a DataFrame
con_mat_df = pd.DataFrame(con_mat, columns=predicted_labels, index=actual_labels)

print('\nCLASSIFICATION REPORT\n')
# target_names must be strings, so convert in case the labels are not
print(classification_report(testY, y_pred, labels=labels,
                            target_names=[str(cls) for cls in labels]))

print('\nCONFUSION MATRIX')
plt.figure(figsize=(6, 4))
sns.heatmap(con_mat_df, annot=True, fmt='d', cmap="YlGnBu")
plt.show()


#### MultinomialNB

#### The multinomial Naive Bayes classifier is suited to text classification tasks, particularly when dealing with features that represent counts or frequencies of tokens, which is exactly the representation built above with CountVectorizer.

#### The Naive Bayes classifier calculates the probability of the document belonging to each class using the features (word frequencies). It then predicts the class with the highest probability.

In [None]:
from sklearn.naive_bayes import MultinomialNB 


In [None]:
# Create a multinomial naive Bayes classifier with default settings
mnb = MultinomialNB()

In [None]:
# Fit the naive Bayes classifier on the training split
mnb.fit(trainX,trainY)

In [None]:
# Mean accuracy of the fitted naive Bayes model on the held-out test split
mnb.score(testX,testY)

#### MultinomialNB provides 95% accuracy, so we can store the score in the dictionary.

In [None]:
# Record the naive Bayes test accuracy (rounded to two decimals) next to the
# other model scores
Scores_ml['MultinomialNB'] = np.round(mnb.score(testX,testY),2)

In [None]:
# Report accuracy of the naive Bayes model on both splits
print('Training Accuracy:', mnb.score(trainX, trainY))
print('Testing Accuracy:', mnb.score(testX, testY))

# True labels and model predictions for the test split
y_true = testY
y_pred = mnb.predict(testX)

# Confusion matrix over every class seen in either the truth or the predictions
con_mat = confusion_matrix(y_true, y_pred)
labels = sorted(set(y_true).union(y_pred))

# Row/column captions for the matrix, one per class
predicted_labels = [f'Predicted:{name}' for name in labels]
actual_labels = [f'Actual:{name}' for name in labels]

# Wrap the raw counts in a labelled DataFrame for plotting
con_mat_df = pd.DataFrame(
    con_mat,
    index=actual_labels,
    columns=predicted_labels,
)

print('\nCLASSIFICATION REPORT\n')
print(classification_report(y_true, y_pred, target_names=labels))

print('\nCONFUSION MATRIX')
plt.figure(figsize=(6, 4))
sns.heatmap(con_mat_df, annot=True, fmt='d', cmap="YlGnBu")
plt.show()


#### So, Logistic Regression is the best fit for this dataset. Now let's make an sklearn pipeline using Logistic Regression.

In [None]:

# Recompute the derived text columns from the raw URLs.

# Letters-only tokenizer; values that are not non-blank strings map to []
tokenizer = RegexpTokenizer(r'[A-Za-z]+')
df['text_tokenized'] = df['URL'].map(
    lambda url: tokenizer.tokenize(url) if isinstance(url, str) and url.strip() else []
)

# English Snowball stemmer applied token by token
stemmer = SnowballStemmer("english")
df['text_stemmed'] = df['text_tokenized'].map(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)

# One space-separated string of stems per URL
df['text_sent'] = df['text_stemmed'].map(' '.join)


#### The pipeline defined below works as follows:

#### It first tokenizes the input text using the specified regular expression tokenizer (RegexpTokenizer(r'[A-Za-z]+').tokenize).
#### It removes English stopwords.
#### It then converts the text data into a matrix of token counts (using CountVectorizer).
#### Finally, it applies logistic regression (LogisticRegression()) to classify the text based on the features generated by CountVectorizer.

In [None]:
# Split the raw URLs and labels with the same 80/20 ratio and fixed seed used
# by the other splits in this notebook, so the result is reproducible.
trainX, testX, trainY, testY = train_test_split(df.URL, df.Label, test_size=0.2, random_state=42)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import RegexpTokenizer

# Split the raw URLs (not the pre-stemmed text) 80/20 with a fixed seed; the
# pipeline does its own tokenization
trainX, testX, trainY, testY = train_test_split(
    df.URL,
    df.Label,
    test_size=0.2,
    random_state=42,
)

# Vectorizer (letters-only tokens, English stop words removed) feeding a
# default logistic regression
pipeline_ls = make_pipeline(
    CountVectorizer(
        tokenizer=RegexpTokenizer(r'[A-Za-z]+').tokenize,
        stop_words='english',
    ),
    LogisticRegression(),
)

# Fit vectorizer and classifier together on the training URLs
pipeline_ls.fit(trainX, trainY)

# Accuracy on both splits
print('Training Accuracy:', pipeline_ls.score(trainX, trainY))
print('Testing Accuracy:', pipeline_ls.score(testX, testY))


In [None]:
# Accuracy of the fitted pipeline on both splits
print('Training Accuracy :', pipeline_ls.score(trainX, trainY))
print('Testing Accuracy :', pipeline_ls.score(testX, testY))

# Test-set predictions and the resulting confusion matrix
predictions = pipeline_ls.predict(testX)
con_mat = confusion_matrix(testY, predictions)

# Classes present in the true labels, and the matrix dimension
class_labels = sorted(set(testY))
num_classes = con_mat.shape[0]

# Guard: the captions below only make sense if both counts agree
if len(class_labels) != num_classes:
    raise ValueError(f"Expected {len(class_labels)} classes but got {num_classes}.")

# Labelled view of the matrix for plotting
con_mat_df = pd.DataFrame(
    con_mat,
    index=[f'Actual:{label}' for label in class_labels],
    columns=[f'Predicted:{label}' for label in class_labels],
)

# Per-class precision / recall / f1
print('\nCLASSIFICATION REPORT\n')
print(classification_report(testY, predictions, target_names=class_labels))

# Heatmap of the confusion matrix
print('\nCONFUSION MATRIX')
plt.figure(figsize=(8, 6))
sns.heatmap(con_mat_df, annot=True, fmt='d', cmap="YlGnBu")
plt.show()

#### Now we dump the model in pickle format
#### pickle files are a convenient way to store and retrieve Python objects, including machine learning models and data, which helps streamline the development and deployment of machine learning applications

We can use it to store models for future use without having to retrain the model every time.

In [None]:
# Serialize the fitted pipeline to 'phishingApp.pkl' in the working directory;
# the with-block closes the file once the dump finishes.
with open('phishingApp.pkl', 'wb') as model_file:
    pickle.dump(pipeline_ls, model_file)



In [None]:
# Reload the pickled pipeline and confirm it still scores the test split.
# The with-block guarantees the file handle is closed after loading.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles written by this notebook or another trusted source.
with open('phishingApp.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(testX, testY)
print(result)