In [147]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
import wordninja



In [81]:
pd.set_option('display.max_colwidth', None)  


In [131]:
df = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv")


In [125]:
df.head(16)

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubscribe,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True
5,https://www.brookings.edu/interactives/reopening-america-and-the-world/,False
6,https://www.reuters.com/investigates/special-report/health-coronavirus-britain-pub/,False
7,https://www.theatlantic.com/magazine/archive/2020/07/supermarkets-are-a-miracle/612244/,False
8,https://www.vox.com/2020/6/17/21294680/john-bolton-book-excerpts-trump-ukraine-china,False
9,https://www.theguardian.com/travel/2020/jun/18/end-of-tourism-coronavirus-pandemic-travel-industry,False


In [126]:
df['is_spam'] = df['is_spam'].astype(int)


In [136]:
#!pip install wordninja

Collecting wordninja
  Downloading wordninja-2.0.0.tar.gz (541 kB)
     ---------------------------------------- 0.0/541.6 kB ? eta -:--:--
     ------------------------- ------------ 358.4/541.6 kB 5.5 MB/s eta 0:00:01
     -------------------------------------- 541.6/541.6 kB 4.3 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: wordninja
  Building wheel for wordninja (setup.py): started
  Building wheel for wordninja (setup.py): finished with status 'done'
  Created wheel for wordninja: filename=wordninja-2.0.0-py3-none-any.whl size=541535 sha256=3c28598eaf2c61d78bcb4276390d120f8bc754591670264dbd679f7c84768f2e
  Stored in directory: c:\users\rashid\appdata\local\packages\pythonsoftwarefoundation.python.3.11_qbz5n2kfra8p0\localcache\local\pip\cache\wheels\e6\66\9c\712044a983337f5d44f90abcd244bd4b8ad28ee64750404b50
Successfully built wordninja
Installing collected packages: wo

In [138]:
def preprocess_url(url):
    # Remove "http://" and "www."
    url = re.sub(r'https?://(?:www\.)?', '', url)

    # Remove specific patterns, split words at hyphens and periods
    url = re.sub(r'[/=_#]|com|net|org|io|co|us|www', ' ', url, flags=re.IGNORECASE)
    url = re.sub(r'[.-]+', ' ', url)  # Replace multiple hyphens or periods with a single space

    # Use wordninja to separate CamelCase words
    url = ' '.join(wordninja.split(url))

    # Use regular expression to split the URL at spaces
    url_parts = re.split(r'\s+', url)

    # Tokenization
    tokens = [part.strip() for part in url_parts if part.strip()]

    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Reassemble the URL
    preprocessed_url = " ".join(tokens)
    return preprocessed_url


In [139]:
df['url'] = df['url'].apply(preprocess_url)

In [140]:
df['url']

0                                                  briefing day 8 list manage unsubscribe
1                                                                                  hv per
2                                                              briefing day v 4 n 3 4 f 3
3                                                       briefing day n 20200618 ment form
4                                                                        briefing day fan
                                              ...                                        
2994    smart city world news news deep fake technology ed advance auto nomo vehicle 5408
2995                                                                        youtube watch
2996                                     tech crunch 2019 07 04 optimistic view deep fake
2997          technology review 2019 12 20 131462 startup claim deep fake protect privacy
2998                                                         bbc news technology 51018758
Name: url,

In [142]:
X_train, X_test, y_train, y_test = train_test_split(
    df['url'], df['is_spam'], test_size=0.2, random_state=42
)

In [143]:
# SVM
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

svm_model = SVC()
svm_model.fit(X_train_tfidf, y_train)

In [151]:
y_train_pred_svm = svm_model.predict(X_train_tfidf)
in_sample_accuracy_svm = accuracy_score(y_train, y_train_pred_svm)
in_sample_precision_svm = precision_score(y_train, y_train_pred_svm)
in_sample_recall_svm = recall_score(y_train, y_train_pred_svm)
in_sample_f1_svm = f1_score(y_train, y_train_pred_svm)

y_pred_svm = svm_model.predict(X_test_tfidf)
out_of_sample_accuracy_svm = accuracy_score(y_test, y_pred_svm)
out_of_sample_precision_svm = precision_score(y_test, y_pred_svm)
out_of_sample_recall_svm = recall_score(y_test, y_pred_svm)
out_of_sample_f1_svm = f1_score(y_test, y_pred_svm)

print("Metrics Comparison (SVM):")
print(f"  In-Sample Accuracy: {in_sample_accuracy_svm:.2%} | Out-of-Sample Accuracy: {out_of_sample_accuracy_svm:.2%}")
print(f"  In-Sample Precision: {in_sample_precision_svm:.2%} | Out-of-Sample Precision: {out_of_sample_precision_svm:.2%}")
print(f"  In-Sample Recall: {in_sample_recall_svm:.2%} | Out-of-Sample Recall: {out_of_sample_recall_svm:.2%}")
print(f"  In-Sample F1 Score: {in_sample_f1_svm:.2%} | Out-of-Sample F1 Score: {out_of_sample_f1_svm:.2%}")

Metrics Comparison (SVM):
  In-Sample Accuracy: 98.92% | Out-of-Sample Accuracy: 96.50%
  In-Sample Precision: 99.62% | Out-of-Sample Precision: 99.21%
  In-Sample Recall: 95.64% | Out-of-Sample Recall: 86.21%
  In-Sample F1 Score: 97.59% | Out-of-Sample F1 Score: 92.25%

Classification Report (SVM):
               precision    recall  f1-score   support

       False       0.96      1.00      0.98       455
        True       0.99      0.86      0.92       145

    accuracy                           0.96       600
   macro avg       0.97      0.93      0.95       600
weighted avg       0.97      0.96      0.96       600



In [145]:
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['linear', 'rbf']}
grid_search = GridSearchCV(SVC(), param_grid, refit=True, verbose=3, cv=3)
grid_search.fit(X_train_tfidf, y_train)

print("Best Parameters:", grid_search.best_params_)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV 1/3] END .....C=0.1, gamma=1, kernel=linear;, score=0.856 total time=   1.5s
[CV 2/3] END .....C=0.1, gamma=1, kernel=linear;, score=0.864 total time=   1.5s
[CV 3/3] END .....C=0.1, gamma=1, kernel=linear;, score=0.892 total time=   1.5s
[CV 1/3] END ........C=0.1, gamma=1, kernel=rbf;, score=0.815 total time=   2.1s
[CV 2/3] END ........C=0.1, gamma=1, kernel=rbf;, score=0.830 total time=   2.2s
[CV 3/3] END ........C=0.1, gamma=1, kernel=rbf;, score=0.824 total time=   2.5s
[CV 1/3] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.856 total time=   1.8s
[CV 2/3] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.864 total time=   1.8s
[CV 3/3] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.892 total time=   1.3s
[CV 1/3] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.770 total time=   1.8s
[CV 2/3] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.770 total time=   2.1s
[CV 3/3] END ......C=0.1, gamma=0.1, kernel=rbf;

In [None]:
best_svm_model = grid_search.best_estimator_

In [148]:
optimized_model = grid_search.best_estimator_
y_pred = optimized_model.predict(X_test_tfidf)


Accuracy: 0.975
Precision: 0.9577464788732394
Recall: 0.9379310344827586
F1 Score: 0.9477351916376306


In [152]:
y_train_pred = optimized_model.predict(X_train_tfidf)

in_sample_accuracy = accuracy_score(y_train, y_train_pred)
in_sample_precision = precision_score(y_train, y_train_pred)
in_sample_recall = recall_score(y_train, y_train_pred)
in_sample_f1 = f1_score(y_train, y_train_pred)

y_pred = optimized_model.predict(X_test_tfidf)

out_of_sample_accuracy = accuracy_score(y_test, y_pred)
out_of_sample_precision = precision_score(y_test, y_pred)
out_of_sample_recall = recall_score(y_test, y_pred)
out_of_sample_f1 = f1_score(y_test, y_pred)

print("Metrics Comparison (Optimized Model):")
print(f"  In-Sample Accuracy: {in_sample_accuracy:.2%} | Out-of-Sample Accuracy: {out_of_sample_accuracy:.2%}")
print(f"  In-Sample Precision: {in_sample_precision:.2%} | Out-of-Sample Precision: {out_of_sample_precision:.2%}")
print(f"  In-Sample Recall: {in_sample_recall:.2%} | Out-of-Sample Recall: {out_of_sample_recall:.2%}")
print(f"  In-Sample F1 Score: {in_sample_f1:.2%} | Out-of-Sample F1 Score: {out_of_sample_f1:.2%}")



Metrics Comparison (Optimized Model):
  In-Sample Accuracy: 99.96% | Out-of-Sample Accuracy: 97.50%
  In-Sample Precision: 99.82% | Out-of-Sample Precision: 95.77%
  In-Sample Recall: 100.00% | Out-of-Sample Recall: 93.79%
  In-Sample F1 Score: 99.91% | Out-of-Sample F1 Score: 94.77%


In [153]:
best_svm_model = grid_search.best_estimator_
joblib.dump(best_svm_model, 'spam_detection_model.joblib')

['spam_detection_model.joblib']