<a href="https://colab.research.google.com/github/Vieira-Marola/Spam_Detection/blob/main/Phishing_Detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install openml
import openml
import pandas as pd

# Load the dataset using its data_id
dataset = openml.datasets.get_dataset(46099)

# Get the data in a pandas DataFrame format
X, y, _, _ = dataset.get_data(dataset_format="dataframe")

df = pd.concat([X, y], axis=1)

# Display the first few rows of the DataFrame
print("Dataset loaded successfully. Displaying the first 5 rows:")
print(df.head())

Collecting openml
  Downloading openml-0.15.1-py3-none-any.whl.metadata (10 kB)
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-1.0.2-py3-none-any.whl.metadata (15 kB)
Collecting minio (from openml)
  Downloading minio-7.2.20-py3-none-any.whl.metadata (6.5 kB)
Collecting pycryptodome (from minio->openml)
  Downloading pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading openml-0.15.1-py3-none-any.whl (160 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.4/160.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading minio-7.2.20-py3-none-any.whl (93 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.8/93.8 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xmltodict-1.0.2-py3-none-any.whl (13 kB)
Downloading pycryptodome-3.23

In [3]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() only appended a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82486 entries, 0 to 82485
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   text_combined  82486 non-null  object  
 1   label          82486 non-null  category
dtypes: category(1), object(1)
memory usage: 725.2+ KB
None


In [4]:
# Summary statistics for every column, whatever its dtype.
summary_stats = df.describe(include='all')
print(summary_stats)

                                            text_combined  label
count                                               82486  82486
unique                                              82078      2
top     charity sees need cost dear friend read want f...      1
freq                                                    3  42891


In [5]:
# Count the null entries in each column before reporting them.
missing_per_column = df.isnull().sum()
print("Missing values in each column:")
print(missing_per_column)

Missing values in each column:
text_combined    0
label            0
dtype: int64


In [6]:
# Fold the text to lowercase, then write it back over the original column.
lowercased_text = df['text_combined'].str.lower()
df['text_combined'] = lowercased_text
print("Converted 'text_combined' column to lowercase.")
print(df.head())

Converted 'text_combined' column to lowercase.
                                       text_combined label
0  hpl nom may 25 2001 see attached file hplno 52...     0
1  nom actual vols 24 th forwarded sabrae zajac h...     0
2  enron actuals march 30 april 1 201 estimated a...     0
3  hpl nom may 30 2001 see attached file hplno 53...     0
4  hpl nom june 1 2001 see attached file hplno 60...     0


In [7]:
import string

# Deletion table for str.translate, built once instead of on every call.
# With three arguments, str.maketrans maps each character of the third
# argument to None, which str.translate treats as "remove this character".
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text: str) -> str:
    """Return ``text`` with every ``string.punctuation`` character removed.

    Only the ASCII punctuation characters listed in ``string.punctuation``
    are stripped; all other characters (letters, digits, whitespace,
    non-ASCII symbols) are kept unchanged and in their original order.

    Args:
        text: The string to clean.

    Returns:
        A new string without ASCII punctuation.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from each document, then store the cleaned text back in
# the same column.
cleaned_text = df['text_combined'].map(remove_punctuation)
df['text_combined'] = cleaned_text
print("Removed punctuation from 'text_combined' column.")
print(df.head())

Removed punctuation from 'text_combined' column.
                                       text_combined label
0  hpl nom may 25 2001 see attached file hplno 52...     0
1  nom actual vols 24 th forwarded sabrae zajac h...     0
2  enron actuals march 30 april 1 201 estimated a...     0
3  hpl nom may 30 2001 see attached file hplno 53...     0
4  hpl nom june 1 2001 see attached file hplno 60...     0


In [8]:
# Class balance of the target, first as raw counts and then as percentages.
label_column = df['label']

print("Distribution of the target variable 'label':")
print(label_column.value_counts())

print("\nPercentage distribution of the target variable 'label':")
print(label_column.value_counts(normalize=True) * 100)

Distribution of the target variable 'label':
label
1    42891
0    39595
Name: count, dtype: int64

Percentage distribution of the target variable 'label':
label
1    51.997915
0    48.002085
Name: proportion, dtype: float64


In [9]:
from sklearn.model_selection import train_test_split

# Features are the cleaned text; the target is the label column.
X, y = df['text_combined'], df['label']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# repeatable.
# NOTE(review): the describe() output above reports 82078 unique texts among
# 82486 rows, so identical texts may end up in both splits; consider
# de-duplicating before splitting.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Data split into training and testing sets.")
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

Data split into training and testing sets.
Shape of X_train: (65988,)
Shape of X_test: (16498,)
Shape of y_train: (65988,)
Shape of y_test: (16498,)


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap the vocabulary at 5000 features to keep the matrix size manageable.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# The vectorizer is fitted on the training text only; the test text is then
# transformed with that already-fitted vectorizer.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("TF-IDF Vectorization completed.")
print(f"Shape of X_train_tfidf: {X_train_tfidf.shape}")
print(f"Shape of X_test_tfidf: {X_test_tfidf.shape}")

TF-IDF Vectorization completed.
Shape of X_train_tfidf: (65988, 5000)
Shape of X_test_tfidf: (16498, 5000)


In [11]:
from sklearn.linear_model import LogisticRegression

# Build the logistic-regression classifier and fit it on the TF-IDF training
# matrix in one step; fit() returns the estimator itself, so `model` is the
# trained classifier.
model = LogisticRegression(
    solver='liblinear',
    random_state=42,
    max_iter=1000,
).fit(X_train_tfidf, y_train)

print("Logistic Regression model trained successfully.")

Logistic Regression model trained successfully.


In [12]:
# Predict a label for every held-out document from its TF-IDF features.
y_pred = model.predict(X_test_tfidf)
print("Predictions on the test set completed.")

Predictions on the test set completed.


In [13]:
from sklearn.metrics import accuracy_score, f1_score

# Score the held-out predictions against the true labels.
# average='weighted' computes F1 per class and averages the results weighted
# by each class's support in y_test.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Model Accuracy: {accuracy:.4f}")
print(f"Model F1-score: {f1:.4f}")

Model Accuracy: 0.9824
Model F1-score: 0.9824
