# Explore here

Sistema de detección de enlaces spam  


Queremos implementar un sistema que sea capaz de detectar automáticamente si una página web contiene spam o no basándonos en su URL.

In [10]:
# librerias 
import os
import re

import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import joblib
# Download NLTK resources (stopword list, punkt tokenizer data, WordNet).
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...


Paso 1: Cargar datos

In [11]:
# Load the labelled URL dataset (columns: url, is_spam) from the course repository.
url = "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv"
df = pd.read_csv(url)

print("Datos cargados:")
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the summary.
df.info()

Datos cargados:
                                                 url  is_spam
0  https://briefingday.us8.list-manage.com/unsubs...     True
1                             https://www.hvper.com/     True
2                 https://briefingday.com/m/v4n3i4f3     True
3   https://briefingday.com/n/20200618/m#commentform    False
4                        https://briefingday.com/fan     True
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      2999 non-null   object
 1   is_spam  2999 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 26.5+ KB
None


Paso 2: Preprocesamiento de URLs

In [12]:
# English stopwords, held in a set for fast membership tests while filtering tokens.
stop_words = set(stopwords.words('english'))
# One shared WordNet lemmatizer instance, reused for every URL.
lemmatizer = WordNetLemmatizer()

def preprocess_url(url) -> str:
    """Normalize a URL into a space-separated string of lemmatized tokens.

    The URL is lowercased, split on runs of non-word characters, stripped of
    empty tokens and English stopwords, and each remaining token is
    lemmatized with the module-level ``lemmatizer``.

    Args:
        url: The raw URL. Anything that is not a ``str`` (for example ``None``
            or the ``NaN`` pandas uses for missing values) is treated as
            having no tokens.

    Returns:
        The cleaned tokens joined by single spaces; an empty string when the
        input is not a string or yields no tokens.
    """
    # Missing values would otherwise crash on ``.lower()``; map them to an
    # empty document so the function can be applied to incomplete columns.
    if not isinstance(url, str):
        return ''
    # Lowercase, then split on non-word runs. Note that \W does not match
    # underscores or digits, so those remain inside tokens.
    tokens = re.split(r'\W+', url.lower())
    # Drop empty strings and stopwords, lemmatize what is left.
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token and token not in stop_words
    )

# Add the normalized form of every URL as a new text column for vectorization.
df['clean_url'] = df['url'].map(preprocess_url)

In [16]:
# Sanity check: list the columns to confirm `clean_url` was added.
print(df.columns)


Index(['url', 'is_spam', 'clean_url'], dtype='object')


In [17]:

# Hold out 20% of the rows for evaluation; stratifying on the label preserves
# the spam / non-spam proportions in both partitions.
labels = df['is_spam']
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_url'],
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

Paso 3: Creamos un modelo SVM básico

In [None]:

# Baseline: TF-IDF features fed into an SVC with default hyperparameters.
baseline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC()),
]
pipeline = Pipeline(steps=baseline_steps)

# Pipeline.fit returns the fitted pipeline, so prediction can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out set.
print(
    "Resultados del modelo SVM inicial:",
    classification_report(y_test, y_pred),
    sep="\n",
)

Resultados del modelo SVM inicial:
              precision    recall  f1-score   support

       False       0.96      0.96      0.96       461
        True       0.86      0.88      0.87       139

    accuracy                           0.94       600
   macro avg       0.91      0.92      0.91       600
weighted avg       0.94      0.94      0.94       600



Paso 4: Optimización con GridSearch

In [None]:


# Hyperparameter search space; keys use the "<step>__<param>" convention to
# address parameters of the named pipeline steps.
param_grid = dict(
    tfidf__ngram_range=[(1, 1), (1, 2)],
    svm__C=[0.1, 1, 10],
    svm__kernel=['linear', 'rbf'],
)

# Exhaustive search over the grid with 3-fold cross-validation, run in parallel.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train, y_train)

print("Mejores parámetros:", grid.best_params_)

# Evaluate the best configuration found by the search on the held-out set.
y_pred_best = grid.predict(X_test)
print("Resultados tras optimización:")
print(classification_report(y_test, y_pred_best))


Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END svm__C=0.1, svm__kernel=linear, tfidf__ngram_range=(1, 1); total time=   0.3s
[CV] END svm__C=0.1, svm__kernel=linear, tfidf__ngram_range=(1, 1); total time=   0.3s
[CV] END svm__C=0.1, svm__kernel=linear, tfidf__ngram_range=(1, 1); total time=   0.3s
[CV] END svm__C=0.1, svm__kernel=linear, tfidf__ngram_range=(1, 2); total time=   0.5s
[CV] END svm__C=0.1, svm__kernel=linear, tfidf__ngram_range=(1, 2); total time=   0.4s
[CV] END svm__C=0.1, svm__kernel=linear, tfidf__ngram_range=(1, 2); total time=   0.5s
[CV] END svm__C=0.1, svm__kernel=rbf, tfidf__ngram_range=(1, 1); total time=   0.4s
[CV] END svm__C=0.1, svm__kernel=rbf, tfidf__ngram_range=(1, 1); total time=   0.4s
[CV] END svm__C=0.1, svm__kernel=rbf, tfidf__ngram_range=(1, 1); total time=   0.4s
[CV] END svm__C=0.1, svm__kernel=rbf, tfidf__ngram_range=(1, 2); total time=   0.6s
[CV] END svm__C=0.1, svm__kernel=rbf, tfidf__ngram_range=(1, 2); total time=   0.

Paso 5: Guardamos el modelo

In [21]:
# Destination of the serialized pipeline; defined once so the file that is
# written and the path that is reported cannot drift apart.
MODEL_DIR = "models"
MODEL_PATH = f"{MODEL_DIR}/url_spam_svm.pkl"

# Create the output folder if it does not exist yet.
os.makedirs(MODEL_DIR, exist_ok=True)

# Persist the best pipeline found by the grid search.
joblib.dump(grid.best_estimator_, MODEL_PATH)
print(f"Modelo guardado en {MODEL_PATH}")


Modelo guardado en models/url_spam_svm.pkl
