# Explore here

In [None]:
!pip install wordcloud unidecode nltk

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from pickle import dump
import os
import zipfile
import tensorflow as tf
from pathlib import Path
import shutil
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from unidecode import unidecode
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [None]:
# Fetch the NLTK data packages this notebook relies on (skipped by NLTK
# when they are already present locally).
nltk.download('stopwords')  # stop-word lists backing nltk.corpus.stopwords
nltk.download('punkt')  # tokenizer models used by word_tokenize
nltk.download('punkt_tab')  # presumably needed by newer NLTK releases for word_tokenize — confirm
nltk.download('wordnet')  # lexical database used by WordNetLemmatizer

In [None]:
# Download the URL spam dataset and keep a local CSV copy of it.
url = "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv"
# Bound the wait and fail loudly on HTTP errors, so an error page is never
# saved to disk as if it were the dataset.
http_response = requests.get(url, timeout=30)
http_response.raise_for_status()
response = http_response.content.decode('utf-8')

file_name = '/content/gdrive/MyDrive/Colab Notebooks/Data/url_spam.csv'

# `response` is a single string, so write() is the right call (writelines()
# would iterate it character by character). The encoding matches the decode
# above, and newline='' keeps the original line endings untouched.
with open(file_name, 'w', encoding='utf-8', newline='') as temp_file:
    temp_file.write(response)

In [None]:
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Read the locally stored copy of the dataset and preview its first rows.
file_name = '/content/gdrive/MyDrive/Colab Notebooks/Data/url_spam.csv'
df = pd.read_csv(file_name)
df.head()

In [None]:
def preprocess_url(text):
    """Turn a raw URL into a space-separated string of normalized tokens.

    The URL is lowercased, its scheme and ``www.`` prefix are removed, the
    ``.com`` that closes a host name is dropped, every non-word character
    becomes a space, and the remaining words are tokenized, transliterated
    to ASCII and lemmatized.

    Args:
        text: The raw URL as a string.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Lowercase so matching and the resulting vocabulary are case-insensitive.
    text = text.lower()
    # Remove the scheme and the "www." prefix. The earlier pattern
    # (http\S|www\S|https\S+) deleted "http"/"www" plus ONE arbitrary
    # character anywhere in the string (e.g. "httpbin" lost "httpb"), and
    # its "https" branch could never match.
    text = re.sub(r"\bhttps?://|\bwww\.", '', text)
    # Remove the ".com" that ends the host, whether or not a path follows.
    # The dot is escaped so words such as "telecom/" are left intact.
    text = re.sub(r"\.com(?:/|$)", ' ', text)
    # Keep word characters only; everything else becomes a separator.
    text = re.sub(r'[^\w]', ' ', text)
    # Collapse runs of whitespace and trim both ends.
    text = re.sub(r'\s+', ' ', text).strip()
    # Split the cleaned text into tokens.
    tokens = word_tokenize(text)
    # Transliterate accented / non-ASCII characters.
    tokens = [unidecode(token) for token in tokens]
    # Reduce each token to its lemma.
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

In [None]:
# Example: run the preprocessing function on a single URL and show the
# text before and after. The messages previously said "Tweet", a leftover
# from another project; the data here are URLs.
url_example = df.url[6]
print(f'URL antes de ser preprocesada:\n {url_example}')
url_example_prepro = preprocess_url(url_example)
print('-'*50)
print(f'URL tras ser preprocesada:\n {url_example_prepro}')

In [None]:
# Preprocess every URL and store the result in a new column.

df['url_prepro'] = df['url'].map(preprocess_url)

In [None]:
# Number of rows whose preprocessed URL repeats an earlier row.
df['url_prepro'].duplicated().sum()

In [None]:
# Drop rows whose preprocessed URL is identical to an earlier one, keeping
# the first occurrence of each.
df.drop_duplicates(subset=['url_prepro'], keep='first', inplace=True)

In [None]:
# Relative frequency of each value of the target column.
df['is_spam'].value_counts(normalize=True)

El dataset está claramente desbalanceado: predominan las URLs que no son spam.

In [None]:
# Build one word cloud per value of the spam label.

spam = df['is_spam'].unique()

for sp in spam:
    # Concatenate every preprocessed URL that carries the current label.
    urls_with_label = df.loc[df['is_spam'] == sp, 'url_prepro']
    text = " ".join(urls_with_label.tolist())
    wordcloud = WordCloud(
        background_color="white",
        max_words=50,
        contour_color="steelblue",
        collocations=True,
    ).generate(text)
    # Render the word cloud for the current label.
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(f"Nube de palabras - spam: {sp}")
    plt.show()

In [None]:
# Hold out 20% of the data for testing. The split is stratified on the
# target so both partitions keep the same spam / non-spam ratio, which
# matters because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    df.url_prepro,
    df.is_spam,
    test_size=0.2,
    random_state=42,
    stratify=df.is_spam,
)
# Fit the TF-IDF vocabulary on the training split only, then reuse it to
# transform the test split so no test information leaks into the features.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# Inspect the vocabulary (feature names) learned by the TF-IDF vectorizer.
vectorizer.get_feature_names_out()

In [None]:
# Baseline: logistic regression with default settings on the TF-IDF features.
clf = LogisticRegression()
clf.fit(X_train_vec, y_train)
y_pred = clf.predict(X_test_vec)

In [None]:
# Metrics of the baseline model on the training split.
y_pred_train = clf.predict(X_train_vec)
print(classification_report(y_true=y_train, y_pred=y_pred_train))

In [None]:
# Metrics of the baseline model on the held-out test split.
y_pred_test = clf.predict(X_test_vec)
print(classification_report(y_true=y_test, y_pred=y_pred_test))

El recall muestra que el modelo siempre acierta en la clase `False` (no spam) y falla mucho en la clase `True` (spam). Esto se debe al desbalanceo del dataset, por lo que a continuación lo compensamos ponderando las clases con `class_weight="balanced"`.

In [None]:
# Logistic regression again, this time with class_weight="balanced".
clf2 = LogisticRegression(class_weight="balanced")
clf2.fit(X_train_vec, y_train)
y_pred = clf2.predict(X_test_vec)

In [None]:
# Metrics of the class-weighted model on the training split.
y_pred_train2 = clf2.predict(X_train_vec)
print(classification_report(y_true=y_train, y_pred=y_pred_train2))

In [None]:
# Metrics of the class-weighted model on the held-out test split.
y_pred_test2 = clf2.predict(X_test_vec)
print(classification_report(y_true=y_test, y_pred=y_pred_test2))

Podemos observar la mejora de la regresión logística al ponderar las clases, aunque sigue lejos de ser óptima.

In [None]:
# First SVM attempt: RBF kernel with class weighting and hand-picked
# C / gamma values.
model = SVC(
    kernel="rbf",
    C=1.0,
    gamma=0.5,
    class_weight="balanced",
).fit(X_train_vec, y_train)

y_pred_svc = model.predict(X_test_vec)

In [None]:
# Test-split metrics for the SVC predictions stored in y_pred_svc.
print(classification_report(y_test, y_pred_svc))

Podemos observar que las métricas mejoran algo con respecto a la regresión logística.

In [None]:
# Hyperparameter grid for the RBF-kernel SVM.
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.5, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}

# Search with the same class weighting as the final model, and score with
# macro-averaged F1 instead of the default accuracy: on an imbalanced
# dataset, accuracy rewards always predicting the majority class.
grid = GridSearchCV(
    SVC(class_weight="balanced"),
    param_grid,
    scoring="f1_macro",
    refit=True,
    verbose=3,
)

In [None]:
# Run the cross-validated search over param_grid on the training split.
grid.fit(X_train_vec, y_train)

In [None]:
# Show the winning hyperparameters and the estimator built from them.
print(grid.best_params_, grid.best_estimator_, sep='\n')

In [None]:
# Final SVM: reuse the hyperparameters selected by the grid search instead
# of hard-coding them, so this cell always matches the search result.
# Requires `grid` to have been fitted in the current session.
model = SVC(class_weight="balanced", **grid.best_params_)
model.fit(X_train_vec, y_train)

y_pred_svc = model.predict(X_test_vec)

In [None]:
# Test-split metrics for the SVC predictions stored in y_pred_svc.
print(classification_report(y_test, y_pred_svc))

Finalmente conseguimos de nuevo una ligera mejora en el recall, obteniendo el mejor modelo hasta el momento.