# In [62]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
import pickle

# Train a linear SVM on word-level TF-IDF features extracted from URLs and
# persist it together with the fitted vectorizer.
#
# Steps: load CSV -> drop duplicate URLs -> tokenize + lemmatize ->
# TF-IDF unigram features -> 0/1 labels -> 80/20 split -> LinearSVC -> pickle.

# Read the CSV file (the code below relies on its 'URL' and 'Label' columns)
df = pd.read_csv("phishing_site_urls.csv")

# Remove duplicate URLs (the first occurrence of each URL is kept)
df.drop_duplicates(subset='URL', inplace=True)
df.reset_index(drop=True, inplace=True)

# Tokenize the URLs into runs of ASCII letters/digits; every other character
# acts as a separator. str() guards against non-string cells such as NaN.
tokenizer = RegexpTokenizer(r'[A-Za-z0-9]+')
df['clean_url'] = df['URL'].apply(lambda url: tokenizer.tokenize(str(url)))

# Lemmatize the tokens. WordNetLemmatizer loads the WordNet corpus lazily and
# raises LookupError on first use when it is not installed, so probe it once
# and fetch the data if needed instead of failing halfway through the frame.
lemmatizer = WordNetLemmatizer()
try:
    lemmatizer.lemmatize('urls')
except LookupError:
    nltk.download('wordnet')
    # Some NLTK releases also require the Open Multilingual WordNet data.
    nltk.download('omw-1.4')
df['lem_url'] = df['clean_url'].apply(
    lambda tokens: [lemmatizer.lemmatize(word) for word in tokens]
)

# Convert text to feature vectors: the lemmatized tokens are re-joined with
# spaces and the vectorizer keeps at most 1000 unigram features.
word_vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=1000, binary=True)
unigram_data = word_vectorizer.fit_transform(df['lem_url'].apply(' '.join))
vocab = word_vectorizer.get_feature_names_out()
# NOTE(review): x is a sparse DataFrame whose columns are the vocabulary, so
# the classifier is fitted on named features. Predicting on the bare matrix
# returned by word_vectorizer.transform() will presumably make scikit-learn
# warn about missing feature names -- confirm how the consumer builds its input.
x = pd.DataFrame.sparse.from_spmatrix(unigram_data, columns=vocab)

# Convert labels to numerical values: 'bad' -> 0, every other value -> 1
df['Label'] = np.where(df['Label'] == 'bad', 0, 1)
y = df['Label']

# Split the data into training and testing sets (80% / 20%, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2, shuffle=True
)

# Train the LinearSVC model with default hyper-parameters
trained_clf_svc = LinearSVC().fit(x_train, y_train)

# Save the trained model and vectorizer as one (model, vectorizer) tuple.
# Whoever loads it must unpack in that order and apply the same
# tokenize -> lemmatize -> ' '.join preprocessing before calling
# word_vectorizer.transform(), because that is the text it was fitted on.
with open('url_model.pkl', 'wb') as f:
    pickle.dump((trained_clf_svc, word_vectorizer), f)
