# Explore here

In [1]:
import pandas as pd
import numpy as np
import regex as re
from pickle import dump
import matplotlib.pyplot as plt
import seaborn as sns

from nltk import download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

import warnings

In [2]:
url = r'https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv'

In [3]:
data = pd.read_csv(url)
df = pd.DataFrame(data)
df

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True
...,...,...
2994,https://www.smartcitiesworld.net/news/news/dee...,False
2995,https://www.youtube.com/watch,True
2996,https://techcrunch.com/2019/07/04/an-optimisti...,False
2997,https://www.technologyreview.com/2019/12/20/13...,False


In [4]:
df.duplicated().sum()

630

In [5]:
com_count = df['url'].str.contains('.com').sum()
com_count

2585

In [5]:
df['is_spam'] = df['is_spam'].astype(int)

In [6]:
df

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,1
1,https://www.hvper.com/,1
2,https://briefingday.com/m/v4n3i4f3,1
3,https://briefingday.com/n/20200618/m#commentform,0
4,https://briefingday.com/fan,1
...,...,...
2994,https://www.smartcitiesworld.net/news/news/dee...,0
2995,https://www.youtube.com/watch,1
2996,https://techcrunch.com/2019/07/04/an-optimisti...,0
2997,https://www.technologyreview.com/2019/12/20/13...,0


In [7]:
dup = df[df.duplicated()]
dup

Unnamed: 0,url,is_spam
60,https://briefingday.us8.list-manage.com/unsubs...,1
61,https://www.hvper.com/,1
62,https://briefingday.com/m/v4n3i4f3,1
64,https://briefingday.com/fan,1
113,https://briefingday.com/fan,1
...,...,...
2971,https://www.cnbc.com/2020/06/29/stock-market-f...,0
2972,https://thehustle.co/account/,1
2973,https://thehustle.co/,1
2979,https://www.bloomberg.com/tosv2.html,1


In [9]:
df.head()

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,1
1,https://www.hvper.com/,1
2,https://briefingday.com/m/v4n3i4f3,1
3,https://briefingday.com/n/20200618/m#commentform,0
4,https://briefingday.com/fan,1


In [8]:
def preprocess_text(text):

    # Eliminar prefijo
    text = re.sub(r'^(https?://)?(www\.)?', '', text)

    # Eliminar cualquier caracter que no sea una letra (a-z) o un espacio en blanco ( )
    text = re.sub(r'[^a-z ]', " ", text)

    # Eliminar espacios en blanco
    text = re.sub(r'\s+[a-zA-Z]\s+', " ", text)
    text = re.sub(r'\^[a-zA-Z]\s+', " ", text)

    # Reducir espacios en blanco múltiples a uno único
    text = re.sub(r'\s+', " ", text.lower())

    # Eliminar tags
    text = re.sub("&lt;/?.*?&gt;"," &lt;&gt; ", text)

    return text.split()

In [9]:
df['url'] = df['url'].apply(preprocess_text)

In [10]:
df

Unnamed: 0,url,is_spam
0,"[briefingday, us, list, manage, com, unsubscribe]",1
1,"[hvper, com]",1
2,"[briefingday, com, v, i]",1
3,"[briefingday, com, m, commentform]",0
4,"[briefingday, com, fan]",1
...,...,...
2994,"[smartcitiesworld, net, news, news, deepfake, ...",0
2995,"[youtube, com, watch]",1
2996,"[techcrunch, com, an, optimistic, view, of, de...",0
2997,"[technologyreview, com, this, startup, claims,...",0


In [11]:
# Llamamos al lematizador
download('wordnet')
lemmatizer = WordNetLemmatizer()

download('stopwords')
stop_words = stopwords.words('english')

def lemmatize_text(words, lemmatizer = lemmatizer):
    #lematización
    tokens = [lemmatizer.lemmatize(word) for word in words]
    # sacamos stop words
    tokens = [word for word in tokens if word not in stop_words]
    # que se quede con las de largo mayor a (vamos a probar con 2, ya que en urls ay muchos diminutivos)
    #tokens = [word for word in tokens if len(word) > 1]
    return tokens

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\samue\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samue\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
df['url'] = df['url'].apply(lemmatize_text)
df.head()

Unnamed: 0,url,is_spam
0,"[briefingday, u, list, manage, com, unsubscribe]",1
1,"[hvper, com]",1
2,"[briefingday, com, v]",1
3,"[briefingday, com, commentform]",0
4,"[briefingday, com, fan]",1


In [13]:
tokens_list = df['url']
tokens_list = [' '.join(tokens) for tokens in tokens_list]

vectorizer = TfidfVectorizer(max_features = 1500, max_df = 0.80, min_df = 2)
X = vectorizer.fit_transform(tokens_list).toarray()
y = df['is_spam']

X[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 42)

In [15]:
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
y_train

1569    1
2229    0
2296    0
1800    0
1273    0
       ..
1638    1
1095    0
1130    1
1294    0
860     0
Name: is_spam, Length: 2399, dtype: int32

In [17]:
model = SVC(kernel = "linear", random_state = 42)

model.fit(X_train, y_train)

In [18]:
y_pred_test = model.predict(X_test)
y_pred_test

array([1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,

In [19]:
y_pred_train = model.predict(X_train)
y_pred_train

array([1, 0, 0, ..., 1, 0, 0])

In [20]:
accuracy_score(y_test, y_pred_test)

0.9233333333333333

In [21]:
accuracy_score(y_train, y_pred_train)

0.9654022509378908

In [22]:
# metricas
accuracy_test = accuracy_score(y_test, y_pred_test)
accuracy_train = accuracy_score(y_train, y_pred_train)

f1_score_test = f1_score(y_test, y_pred_test, average='micro')
f1_score_train = f1_score(y_train, y_pred_train, average='micro')

precision_test = precision_score(y_test, y_pred_test, average='micro')
precision_train = precision_score(y_train, y_pred_train, average='micro')

recall_test = recall_score(y_test, y_pred_test, average='micro')
recall_train = recall_score(y_train, y_pred_train, average='micro')

print("Accuracy Test: ", accuracy_test)
print("F1 score Test: ", f1_score_test)
print("Precision Test: ", precision_test)
print("Recall Test: ", recall_test)

print("Accuracy Train: ", accuracy_train)
print("F1 score Train: ", f1_score_train)
print("Precision Train: ", precision_train)
print("Recall Train: ", recall_train)


print(classification_report(y_test, y_pred_test))

Accuracy Test:  0.9233333333333333
F1 score Test:  0.9233333333333333
Precision Test:  0.9233333333333333
Recall Test:  0.9233333333333333
Accuracy Train:  0.9654022509378908
F1 score Train:  0.9654022509378908
Precision Train:  0.9654022509378908
Recall Train:  0.9654022509378908
              precision    recall  f1-score   support

           0       0.93      0.97      0.95       455
           1       0.89      0.78      0.83       145

    accuracy                           0.92       600
   macro avg       0.91      0.87      0.89       600
weighted avg       0.92      0.92      0.92       600



In [23]:
grid_params = {'kernel': ["linear", "rbf", "poly"],
                'C': [5, 10, 20],
                'decision_function_shape': ['ovo', 'ovr'], # rate to accept the model on each boosting iteration (higher = complexity)], # = loss, error function to minimize, difference pred/reality
                'gamma': ["scale", "auto"], # fraction of samples to be used for each tree
}

In [24]:
param_grid = {'C':[1,10,100,1000],
                'gamma':[1, 0.1, 0.01, 0.001],
                'kernel':['linear','rbf', 'poly, sigmoid'],
                'cache_size': [100, 200, 300, 500],
                'tol': [0.001, 0.0001],
                'decision_function_shape': ['ovo', 'ovr']}


In [27]:
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs = -1)

In [28]:
#grid_search.fit(X_train, y_train)

In [29]:
#best_params = grid_search.best_params_

In [30]:
#best_params

In [25]:
model1 = SVC( kernel = 'rbf', C = 10, gamma = 'scale', decision_function_shape = 'ovo', random_state=21)

model1.fit(X_train, y_train)

In [26]:
y_pred_test = model1.predict(X_test)
y_pred_test

array([1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,

In [27]:
y_pred_test = model1.predict(X_test)
y_pred_test

array([1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,

In [28]:
# metricas
accuracy_test = accuracy_score(y_test, y_pred_test)
accuracy_train = accuracy_score(y_train, y_pred_train)

f1_score_test = f1_score(y_test, y_pred_test, average='micro')
f1_score_train = f1_score(y_train, y_pred_train, average='micro')

precision_test = precision_score(y_test, y_pred_test, average='micro')
precision_train = precision_score(y_train, y_pred_train, average='micro')

recall_test = recall_score(y_test, y_pred_test, average='micro')
recall_train = recall_score(y_train, y_pred_train, average='micro')

print("Accuracy Test: ", accuracy_test)
print("F1 score Test: ", f1_score_test)
print("Precision Test: ", precision_test)
print("Recall Test: ", recall_test)

print("Accuracy Train: ", accuracy_train)
print("F1 score Train: ", f1_score_train)
print("Precision Train: ", precision_train)
print("Recall Train: ", recall_train)


print(classification_report(y_test, y_pred_test))

Accuracy Test:  0.9516666666666667
F1 score Test:  0.9516666666666667
Precision Test:  0.9516666666666667
Recall Test:  0.9516666666666667
Accuracy Train:  0.9654022509378908
F1 score Train:  0.9654022509378908
Precision Train:  0.9654022509378908
Recall Train:  0.9654022509378908
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       455
           1       0.91      0.89      0.90       145

    accuracy                           0.95       600
   macro avg       0.94      0.93      0.93       600
weighted avg       0.95      0.95      0.95       600



### Resampling

In [29]:
from imblearn.over_sampling import SMOTE
from collections import Counter
oversample = SMOTE(random_state=21)
X_res, y_res = oversample.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=21)
counter_train = Counter(y_train)
print("counter train: ")
print(counter_train)
counter_test = Counter(y_test)
print("counter test:")
print(counter_test)

counter train: 
Counter({1: 1845, 0: 1839})
counter test:
Counter({0: 464, 1: 458})


In [51]:
model2 = SVC( kernel = 'rbf', C = 10, gamma = 'scale', decision_function_shape = 'ovo', random_state=21)

model2.fit(X_train, y_train)

In [37]:
grid_search1 = GridSearchCV(estimator=model2, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs = -1)

In [43]:
grid_search1.fit(X_train, y_train)

NameError: name 'grid_search1' is not defined

In [31]:
grid_search.best_params_

NameError: name 'grid_search' is not defined

In [52]:
y_pred_test = model2.predict(X_test)
y_pred_test

array([1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,

In [53]:
y_pred_train = model2.predict(X_train)
y_pred_train

array([1, 1, 0, ..., 0, 0, 0])

In [54]:
accuracy_score(y_test, y_pred_test)

0.9533622559652929

In [55]:
accuracy_score(y_train, y_pred_train)

0.9720412595005429

In [50]:
# metricas
accuracy_test = accuracy_score(y_test, y_pred_test)
accuracy_train = accuracy_score(y_train, y_pred_train)

f1_score_test = f1_score(y_test, y_pred_test, average='micro')
f1_score_train = f1_score(y_train, y_pred_train, average='micro')

precision_test = precision_score(y_test, y_pred_test, average='micro')
precision_train = precision_score(y_train, y_pred_train, average='micro')

recall_test = recall_score(y_test, y_pred_test, average='micro')
recall_train = recall_score(y_train, y_pred_train, average='micro')

print("Accuracy Test: ", accuracy_test)
print("F1 score Test: ", f1_score_test)
print("Precision Test: ", precision_test)
print("Recall Test: ", recall_test)

print("Accuracy Train: ", accuracy_train)
print("F1 score Train: ", f1_score_train)
print("Precision Train: ", precision_train)
print("Recall Train: ", recall_train)


print(classification_report(y_test, y_pred_test))

Accuracy Test:  0.9533622559652929
F1 score Test:  0.9533622559652929
Precision Test:  0.9533622559652929
Recall Test:  0.9533622559652929
Accuracy Train:  0.9720412595005429
F1 score Train:  0.9720412595005429
Precision Train:  0.9720412595005429
Recall Train:  0.9720412595005429
              precision    recall  f1-score   support

           0       0.99      0.92      0.95       464
           1       0.92      0.99      0.95       458

    accuracy                           0.95       922
   macro avg       0.96      0.95      0.95       922
weighted avg       0.96      0.95      0.95       922



In [56]:
dump(model, open("nlp_42.sav", "wb"))

In [None]:
# Nuevo texto a predecir
new_message = "https://www.wikipedia.org"

# Aplicar preprocesamiento y lematización
processed_message = preprocess_text(new_message)
lemmatized_message = lemmatize_text(processed_message, lemmatizer)

# Unir tokens en una sola cadena
lemmatized_message = " ".join(lemmatized_message)

# Vectorizar utilizando el vectorizador entrenado
vectorized_message = vectorizer.transform([lemmatized_message]).toarray()

# Realizar predicción
prediction = model.predict(vectorized_message)

# Mostrar 
print("Predicción:", "Spam" if prediction[0] == 0 else "No Spam")

Predicción: Spam
