In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Example: SMS: SPAM or HAM (Kaggle)

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("data/kg_train.csv",encoding='latin-1')

# Reduce the training set to speead up development. 
# Modify for final system
#data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(5964, 2)


In [4]:
# Let's divide the training and test set into two partitions
from sklearn.model_selection import train_test_split

data_train, data_val, label_train, label_val = \
    train_test_split(data, data["label"], test_size=0.3, random_state=5)

In [5]:
data_train.head()
#print(label_train)
# print(label_val.shape)

Unnamed: 0,text,label
5650,Ok,0
213,The current revcon document includes a paragra...,0
2084,fyi,0
4785,Sorry to confirm they are not in.,0
4314,"From Israel RadioBy Shmuel Tal""Prime Minister ...",0


## Data Preprocessing

In [6]:
import string
from nltk.corpus import stopwords
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')
# tal vegada WordNet millor
from nltk.stem.wordnet import WordNetLemmatizer
wordnet_lemma  = WordNetLemmatizer()

print('WN:',wordnet_lemma.lemmatize('sleeping',pos="v"))

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each']
WN: sleep


In [7]:
data_train.head()

Unnamed: 0,text,label
5650,Ok,0
213,The current revcon document includes a paragra...,0
2084,fyi,0
4785,Sorry to confirm they are not in.,0
4314,"From Israel RadioBy Shmuel Tal""Prime Minister ...",0


In [18]:
import re
import nltk
from nltk.corpus import wordnet

punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation)) 

def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0]#.upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    
    return tag_dict.get(tag, wordnet.NOUN)

def clean_text(text):
    processed_feature = text
    
    # Remove inline JavaScript/CSS:
    #processed_feature = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", " ", processed_feature)    
    
    # Remove html comments
    #processed_feature = re.sub(r"(?s)<!--(.*?)-->[\n]?", " ", processed_feature)
    
    # Remove remaining html tags. (Added flag if "<div" is found later on process?)
    #processed_feature = re.sub('<.*?>', ' ', processed_feature)
    
    # Remove all the special characters
    processed_feature = re.sub(r'\W', ' ', str(processed_feature))

    # remove all single characters
    #processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

    # Remove single characters from the start
    #processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 

    # Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)
    
    # Remove punctuation
    punctuation_pattern.sub(' ', processed_feature)

    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)

    # Converting to Lowercase
    processed_feature = processed_feature.lower()
    
    # Remove encoded symbols like =2E, =3A....
    #processed_feature = re.sub(r'=?\d[a-z]', ' ', processed_feature)

    # Remove stop words and lemmatize
    list_text = processed_feature.split(' ')
    lemmatized_text = []
    for word in list_text:
        clean_word = wordnet_lemma.lemmatize(word, get_wordnet_pos(word))
        if clean_word not in stopwords.words("english"):
            lemmatized_text.append(clean_word)
        
    processed_feature = ' '.join(lemmatized_text)
    
    spam_words = ["urgent", "2c", "2e", "private", "html", "profitable", "land dispute"]
    for spam_word in spam_words:
        processed_feature = re.sub(fr'{spam_word}', f'{spam_word} {spam_word} {spam_word} {spam_word} {spam_word} {spam_word} {spam_word}', processed_feature)
    
    # Remove numbers
    processed_feature = re.sub(r'\d', ' ', processed_feature)
    
    return processed_feature


data_train.loc[:,'preprocessed_text'] = data_train['text'].apply(clean_text)
data_val.loc[:,'preprocessed_text'] = data_val['text'].apply(clean_text)

data_train.head()

Unnamed: 0,text,label,preprocessed_text
5650,Ok,0,ok
213,The current revcon document includes a paragra...,0,current revcon document include paragraph nort...
2084,fyi,0,fyi
4785,Sorry to confirm they are not in.,0,sorry confirm
4314,"From Israel RadioBy Shmuel Tal""Prime Minister ...",0,israel radioby shmuel tal prime minister explo...


## Bag Of Words
Let's get the 10 top words in ham and spam messages

In [19]:
from collections import Counter

data_ham  = data_train[data_train['label'] == 0].copy()
data_spam = data_train[data_train['label'] == 1].copy()

words_data_ham  = data_ham['preprocessed_text']
words_data_spam = data_spam['preprocessed_text']


list_ham_words = []
for sublist in words_data_ham:
    for item in sublist.split():
        list_ham_words.append(item)

list_spam_words = []
for sublist in words_data_spam:
    for item in sublist.split():
        list_spam_words.append(item)
        
c_ham  = Counter(list_ham_words)
c_spam = Counter(list_spam_words)
df_hamwords_top10  = pd.DataFrame(c_ham.most_common(10),  columns=['word', 'count'])
df_spamwords_top10 = pd.DataFrame(c_spam.most_common(10), columns=['word', 'count'])

df_spamwords_top10

Unnamed: 0,word,count
0,e,68222
1,c,38098
2,private,5835
3,urgent,4616
4,money,3827
5,br,3533
6,u,3479
7,account,3366
8,bank,3240
9,fund,3042


## Bag of Words with Count Vectorizer

In [20]:
from nltk import word_tokenize 

stop = stopwords.words('english')

def lemmatize(word, pos='v'):
    return wordnet_lemma.lemmatize(word, pos=pos)

def tokenize_lemmatize(text):
    out = word_tokenize(text)
    out = [lemmatize(word,
#                      pos="v"
                    ) for word in out]
    out = [word for word in out if word not in stop]
    return out

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#bow_transformer = CountVectorizer().fit(data_train['preprocessed_text'])

tfidf_vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1,4))#, tokenizer=tokenize_lemmatize) #ngram_range=(1,4): paraules en cadena de 4
bow_transformer = tfidf_vectorizer.fit(data_train['preprocessed_text'])

In [22]:
print(len(bow_transformer.vocabulary_))
sample_spam = data_train['preprocessed_text'][2]
bow_sample_spam = bow_transformer.transform([sample_spam])

# Lets look at some vectorization example for a spam email
print(sample_spam)
print(bow_sample_spam)

1039785
nora cheryl email dozen memo haiti weekend please print organize inchrono order trip tomorrow send lauren thx 
  (0, 984916)	0.13943395159016175
  (0, 984915)	0.13943395159016175
  (0, 984914)	0.13943395159016175
  (0, 984867)	0.09574974695093985
  (0, 920774)	0.13943395159016175
  (0, 920773)	0.13943395159016175
  (0, 920772)	0.13943395159016175
  (0, 920532)	0.0739339755090635
  (0, 906026)	0.13943395159016175
  (0, 906025)	0.13943395159016175
  (0, 906024)	0.13943395159016175
  (0, 905811)	0.07247253535014989
  (0, 897642)	0.08254938766062028
  (0, 810871)	0.13943395159016175
  (0, 810870)	0.13943395159016175
  (0, 810295)	0.04856664610860974
  (0, 719029)	0.13943395159016175
  (0, 719028)	0.13943395159016175
  (0, 719027)	0.13943395159016175
  (0, 718956)	0.07681951881467543
  (0, 699645)	0.13943395159016175
  (0, 699644)	0.13943395159016175
  (0, 699640)	0.12825262885531874
  (0, 698430)	0.04033794301342055
  (0, 664896)	0.13943395159016175
  :	:
  (0, 570999)	0.1394339515

In [23]:
## let's vectorize all dataset
X_train = bow_transformer.transform(data_train['preprocessed_text'])
X_val  = bow_transformer.transform(data_val['preprocessed_text'])

#Let's print the shape of the vectorized dataset
print(X_train.shape)
print(X_val.shape)

(4174, 1039785)
(1790, 1039785)


## Train a Classifier

In [24]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


#Learn Classifier
clf = MultinomialNB().fit(X_train, label_train)
#Predict Val data
pred_val = clf.predict(X_val)

accuracy = accuracy_score(label_val, pred_val)
print(accuracy)
confusion_matrix(label_val, pred_val)

0.9692737430167597


array([[949,  55],
       [  0, 786]])

### TASK - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to find the **best feature representation**.

You can work with teams of two persons.

**Deadline**: 09/01/2022


In [23]:
data_test = pd.read_csv("data/kg_test.csv",encoding='latin-1')
X_test = bow_transformer.transform(data_test['text'].apply(clean_text))
pred_text = clf.predict(X_test)

submission_file = pd.DataFrame({'Id': data_test.index,'Category':pred_text})
submission_file.to_csv('data/to_submit.csv',index=False)

In [10]:
## Alejandro

In [15]:
SAMPLE_SIZE = None

# %pip install nltk sklearn
import nltk

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize   
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import re
import nltk
import scipy
from scipy.sparse import hstack
from sklearn.preprocessing import Normalizer
import numpy as np

wordnet_lemma  = WordNetLemmatizer()

clf = MultinomialNB()

def lemmatize(word, pos='v'):
    return wordnet_lemma.lemmatize(word, pos=pos)

stop = stopwords.words('english')

def pre_process(text):
    text = re.sub('<.*?>', ' ', text) # Remove html tags. Added flag if "<div" is found later on process
    text = re.sub(r'=\d\w',' ',text) # Remove encoded symbols like =2E, =3A....    
    return text

def tokenize_lemmatize(text):
    out = word_tokenize(text)
    out = [lemmatize(word,
#                      pos="v"
                    ) for word in out]
    out = [word for word in out if word not in stop]
    return out

def process_data(df, vectorizer, normalizer, fit=False):
    x_text = df['text'].apply(pre_process)
    x_len = np.array(df['text'].apply(len)).reshape(-1, 1)
    if fit:
        vectorizer.fit(x_text)
        normalizer.fit(x_len)
    x_text = vectorizer.transform(x_text)
    x_len = normalizer.transform(x_len)
    
    x_html = np.array(df['text'].str.contains('<div', case=False)).reshape(-1, 1)
    X = hstack((x_text, x_len, x_html))
    if 'label' in df:
        Y = df['label']
    else:
        Y = None
    return X, Y
               
    
    
data = pd.read_csv("data/kg_train.csv",encoding='latin-1').sample(100)


# Transform Data
# Change vectorizer for testing if required
vectorizer = TfidfVectorizer(ngram_range=(1,4), tokenizer=tokenize_lemmatize)
# vectorizer = CountVectorizer(tokenizer=tokenize_lemmatize, ngram_range=(1,4))

# Normalizer for the email lenght
normalizer = Normalizer()

X, Y = process_data(data, vectorizer, normalizer, fit=True)


# Evaluate
print('so the score is:')
print(cross_validate(clf, X, Y, n_jobs=4, cv=5, scoring='f1')['test_score'])

# Train
clf.fit(X, Y)

test_data = pd.read_csv("data/kg_test.csv",encoding='latin-1')

X_test, Y_test = process_data(test_data, vectorizer, normalizer)

result = clf.predict(X_test)

df = pd.DataFrame(result, columns=['Category'])
df['Id'] = df.index
df = df[['Id','Category']]
df.to_csv('to_submit.csv', index=False)
print('Done!')



so the score is:
[0.36363636 0.5        0.8        0.71428571 0.57142857]
Done!


In [21]:
clf.score(X, Y)

1.0