In [8]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
import pickle

# Load the labelled URL dataset and keep only the first row seen for each
# distinct value in the 'URL' column.
df = pd.read_csv("phishing_site_urls.csv").drop_duplicates(subset='URL')

# Shared text-processing helpers used by process_url: the tokenizer extracts
# runs of ASCII letters and digits, and the lemmatizer is applied to each
# extracted token.
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'[A-Za-z0-9]+')

def process_url(url):
    """Return *url* as a space-separated string of lemmatized tokens.

    The argument is converted with ``str()``, split into tokens by the
    module-level ``tokenizer``, and every token is passed through the
    module-level ``lemmatizer`` before the pieces are joined with single
    spaces.
    """
    text = str(url)
    lemmas = map(lemmatizer.lemmatize, tokenizer.tokenize(text))
    return ' '.join(lemmas)

# Normalise every URL into a space-separated string of lemmatized tokens
df['processed_url'] = df['URL'].apply(process_url)

# Convert labels to numerical values: 'bad' -> 0, every other label -> 1
y = np.where(df['Label'] == 'bad', 0, 1)

# Split the processed text BEFORE vectorizing. Fitting TF-IDF on the full
# dataset would let the held-out rows influence the vocabulary and the IDF
# weights (data leakage), which makes any score on X_test optimistic.
# The same test_size/random_state yields the same row partition as before,
# because the shuffle depends only on the number of samples and the seed.
urls_train, urls_test, y_train, y_test = train_test_split(
    df['processed_url'], y, test_size=0.2, random_state=42
)

# Convert text to feature vectors; vocabulary and IDF come from training rows only
word_vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=1000, binary=True)
X_train = word_vectorizer.fit_transform(urls_train)
X_test = word_vectorizer.transform(urls_test)

# Feature matrix for the whole dataset, produced by the train-fitted
# vectorizer. Retained so the name `X` stays defined for any code that runs
# after this section.
X = word_vectorizer.transform(df['processed_url'])

# Train the linear SVM on the training split
model = LinearSVC(dual=False).fit(X_train, y_train)

# Report mean accuracy on the held-out split so it is actually used
print(f"Held-out accuracy: {model.score(X_test, y_test):.4f}")

# Save the trained model and vectorizer together as one (model, vectorizer)
# tuple. NOTE: pickle files execute code on load; only unpickle this file
# from a trusted location.
with open('url_model.pkl', 'wb') as f:
    pickle.dump((model, word_vectorizer), f)