In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [44]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development.

In [45]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development.
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


In [46]:
data

Unnamed: 0,text,label
0,"DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...",1
1,Will do.,0
2,Nora--Cheryl has emailed dozens of memos about...,0
3,Dear Sir=2FMadam=2C I know that this proposal ...,1
4,fyi,0
...,...,...
995,So what's the latest? It sounds contradictory ...,0
996,"TRANSFER OF 36,759,000.00 MILLION POUNDS TO YO...",1
997,Barb I will call to explain. Are you back in t...,0
998,Yang on travelNot free tonite.May work tomorrow,0


### Let's divide the training and test set into two partitions

In [47]:
from sklearn.model_selection import train_test_split

In [48]:
X = data['text']# features (mensagens)
y = data['label']# target (spam ou ham)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2, stratify=y)

## Data Preprocessing

In [49]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [50]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [51]:
import re
from bs4 import BeautifulSoup


In [52]:
def clean_html(raw_html):
    raw_html = re.sub(r'<(script|style).*?>.*?</\1>', '', raw_html, flags=re.DOTALL | re.IGNORECASE)
    raw_html = re.sub(r'<!--.*?-->', '', raw_html, flags=re.DOTALL)
    clean_text = re.sub(r'<.*?>', '', raw_html)

    return clean_text

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters

- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [53]:
def full_clean(text):
    text = clean_html(text)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'^[a-zA-Z]\s+', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'^b\s+', '', text)
    text = text.lower().strip()

    return text

## Now let's work on removing stopwords
Remove the stopwords.

In [54]:
data['clean_text'] = data['text'].apply(full_clean)


In [55]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered = [word for word in words if word not in stop_words]
    return ' '.join(filtered)

In [56]:
data['clean_text'] = data['clean_text'].apply(remove_stopwords)

In [57]:
data[['text', 'clean_text']].head(10)

Unnamed: 0,text,clean_text
0,"DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...",dear sir strictly private business proposal mi...
1,Will do.,
2,Nora--Cheryl has emailed dozens of memos about...,nora cheryl emailed dozens memos haiti weekend...
3,Dear Sir=2FMadam=2C I know that this proposal ...,dear sir fmadam know proposal might surprise e...
4,fyi,fyi
5,sure -- bottom line - you need a special secur...,sure bottom line need special security code ge...
6,"Dear Sir,I am Engr. Ugo Nzego with the Enginee...",dear sir engr ugo nzego engineering stores dep...
7,Abedin Huma <AbedinH@state.gov>Saturday Novemb...,abedin huma saturday november pmhfw quint fmsi...
8,There is an Oct 16th George Marshall event at ...,oct th george marshall event department tentat...
9,<P>1 25% for you as the account owner <BR>2 65...,account owner colleagues set aside defray inci...


## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [58]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [59]:
def lemmatize_text(text):
    words = text.split()
    lemmatized = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized)

In [60]:
data['clean_text'] = data['clean_text'].apply(lemmatize_text)

## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [61]:
from collections import Counter

def get_top_words(text_series, n=10):
    all_words = ' '.join(text_series).split()
    word_counts = Counter(all_words)
    return word_counts.most_common(n)

In [66]:
ham_texts = data[data['label'] == 0]['clean_text']
spam_texts = data[data['label'] == 1]['clean_text']

In [67]:
top_ham_words = get_top_words(ham_texts)
top_spam_words = get_top_words(spam_texts)

In [68]:
top_ham_words

[('â', 244),
 ('state', 136),
 ('pm', 127),
 ('would', 107),
 ('ã', 102),
 ('president', 99),
 ('time', 95),
 ('call', 94),
 ('mr', 91),
 ('obama', 84)]

In [69]:
top_spam_words

[('money', 981),
 ('account', 895),
 ('bank', 800),
 ('fund', 781),
 ('u', 730),
 ('transaction', 549),
 ('business', 514),
 ('country', 508),
 ('mr', 485),
 ('nbsp', 475)]

In [65]:
print(data['label'].unique())

[1 0]


## Extra features

In [70]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","€","$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

data_train['money_mark'] = data_train['preprocessed_text'].str.contains(money_simbol_list)*1
data_train['suspicious_words'] = data_train['preprocessed_text'].str.contains(suspicious_words)*1
data_train['text_len'] = data_train['preprocessed_text'].apply(lambda x: len(x))

data_val['money_mark'] = data_val['preprocessed_text'].str.contains(money_simbol_list)*1
data_val['suspicious_words'] = data_val['preprocessed_text'].str.contains(suspicious_words)*1
data_val['text_len'] = data_val['preprocessed_text'].apply(lambda x: len(x))

data_train.head()

NameError: name 'data_train' is not defined

## How would you create a Bag of Words with the CountVectorizer method?

## TD-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [71]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

In [74]:
tfidf_vectorizer = TfidfVectorizer()




In [75]:
X_tfidf = tfidf_vectorizer.fit_transform(data['clean_text'])

In [77]:
print("Shape do dataset vetorizado (TF-IDF):", X_tfidf.shape)

Shape do dataset vetorizado (TF-IDF): (1000, 22019)


### Extra Task (optional) - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

Use a MultinimialNB with default parameters.

Your task is to find the **best feature representation**.

You can work with teams of two persons (recommended).

In [None]:
# Your code