In [None]:
import nltk
import pandas as pd
import re
 
from sklearn.feature_extraction.text import TfidfVectorizer
import string
 

data = pd.read_csv("../../Data/SMSSpamCollection.txt", sep='\t', header=None)
data.columns = ['label', 'Content']

en_stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data['Content_len'] = data['Content'].apply(lambda x: len(x) - x.count(" "))

def count_punctuation(text):
    binary_array = [1 for ch in text if ch in string.punctuation] 
    nb_ponctuation = sum(binary_array)
    total = len(text) - text.count(" ")
    return round(nb_ponctuation/(total), 4)*100

data['punctuation_rate'] = data['Content'].apply(lambda x: count_punctuation(x))


def clean_email(email):
    result = "".join([word for word in email if word not in string.punctuation])
    tokens = re.split('\W+', result)
    text = [ps.stem(word) for word in tokens if word not in en_stopwords]
    return text
 

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(data[['Content', 'Content_len', 'punctuation_rate']], data['label'], test_size=0.2)

In [None]:
vectorisation = TfidfVectorizer(analyzer=clean_email)
vectorisation_model = vectorisation.fit(X_train['Content'])

vect_train = vectorisation_model.transform(X_train['Content'])
vect_test = vectorisation_model.transform(X_test['Content'])

final_train_vect = pd.concat([pd.DataFrame(vect_train.toarray()), X_train[['Content_len', 'punctuation_rate']].reset_index(drop=True)], axis=1)
final_test_vect = pd.concat([pd.DataFrame(vect_test.toarray()), X_test[['Content_len', 'punctuation_rate']].reset_index(drop=True)], axis=1)

final_train_vect

In [16]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import precision_recall_fscore_support as score
 

In [17]:
alg_RandomForest = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
model = alg_RandomForest.fit(final_train_vect, Y_train)

In [18]:
predictions = model.predict(final_test_vect)
precision, recall, fscore, support = score(Y_test, predictions, pos_label='spam', average='binary')

In [19]:
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((predictions==Y_test).sum() / len(predictions),3)))

Precision: 1.0 / Recall: 0.608 / Accuracy: 0.946
