In [1]:
import nltk
import pandas as pd
import re
 
from sklearn.feature_extraction.text import TfidfVectorizer
import string
 

data = pd.read_csv("../../Data/SMSSpamCollection.txt", sep='\t', header=None)
data.columns = ['label', 'Content']

en_stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data['Content_len'] = data['Content'].apply(lambda x: len(x) - x.count(" "))

def count_punctuation(text):
    binary_array = [1 for ch in text if ch in string.punctuation] 
    nb_ponctuation = sum(binary_array)
    total = len(text) - text.count(" ")
    return round(nb_ponctuation/(total), 4)*100

data['punctuation_rate'] = data['Content'].apply(lambda x: count_punctuation(x))


def clean_email(email):
    result = "".join([word for word in email if word not in string.punctuation])
    tokens = re.split('\W+', result)
    text = [ps.stem(word) for word in tokens if word not in en_stopwords]
    return text

 

vectorisation_full = TfidfVectorizer(analyzer=clean_email)
vect_final = vectorisation_full.fit_transform(data['Content'])


all_data = pd.concat([pd.DataFrame(vect_final.toarray()), data['Content_len'], data['punctuation_rate']], axis=1)

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(all_data, data['label'], test_size=0.2)

In [6]:
def random_forest_hyper_params(n_decision_tree, depth):
    alg_RandomForest = RandomForestClassifier(n_estimators=n_decision_tree, max_depth=depth, n_jobs=-1)
    model = alg_RandomForest.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    precision, recall, fscore, support = score(Y_test, predictions, pos_label='spam', average='binary')
    print('nb decision tree: {} / Depth: {} | Precision: {} / Recall: {}'.format(
        n_decision_tree, depth, round(precision, 3), round(recall, 3)))

In [7]:
for ith_decision_tree in [100, 150, 200]:
    for depth in [30,40,50]:
        random_forest_hyper_params(ith_decision_tree, depth)

nb decision tree: 100 / Depth: 30 | Precision: 1.0 / Recall: 0.716
nb decision tree: 100 / Depth: 40 | Precision: 0.984 / Recall: 0.787
nb decision tree: 100 / Depth: 50 | Precision: 0.984 / Recall: 0.813
nb decision tree: 150 / Depth: 30 | Precision: 0.983 / Recall: 0.735
nb decision tree: 150 / Depth: 40 | Precision: 0.984 / Recall: 0.794
nb decision tree: 150 / Depth: 50 | Precision: 0.984 / Recall: 0.781
nb decision tree: 200 / Depth: 30 | Precision: 1.0 / Recall: 0.742
nb decision tree: 200 / Depth: 40 | Precision: 0.984 / Recall: 0.794
nb decision tree: 200 / Depth: 50 | Precision: 0.984 / Recall: 0.806


In [8]:
for ith_decision_tree in [100,200]:
    for depth in [50,100]:
        random_forest_hyper_params(ith_decision_tree, depth)

nb decision tree: 100 / Depth: 50 | Precision: 0.984 / Recall: 0.806
nb decision tree: 100 / Depth: 100 | Precision: 0.984 / Recall: 0.8
nb decision tree: 200 / Depth: 50 | Precision: 0.984 / Recall: 0.813
nb decision tree: 200 / Depth: 100 | Precision: 0.985 / Recall: 0.839


In [10]:
for ith_decision_tree in [200,300]:
    for depth in [100,200]:
        random_forest_hyper_params(ith_decision_tree, depth)

nb decision tree: 200 / Depth: 100 | Precision: 0.985 / Recall: 0.839
nb decision tree: 200 / Depth: 200 | Precision: 0.984 / Recall: 0.813
nb decision tree: 300 / Depth: 100 | Precision: 0.984 / Recall: 0.819
nb decision tree: 300 / Depth: 200 | Precision: 0.985 / Recall: 0.832
