# Imports


In [3]:
import numpy as np
import pandas as pd
import scipy

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, recall_score, precision_score

In [4]:
# The CSV carries its saved row index as an unnamed first column (it surfaces
# as "Unnamed: 0" when read naively); use it as the index rather than keeping
# it around as a spurious data column.
data = pd.read_csv("../data/cleaned-data.csv", index_col=0)

# Loading data


In [5]:
# Display the loaded frame; pandas truncates long frames to the first and last rows.
data

Unnamed: 0,text,is_reply,spam
0,naturally irresistible corporate identity lt r...,0,1
1,stock trading gunslinger fanny merrill muzo co...,0,1
2,nbelievable new homes made easy im wanting sho...,0,1
3,color printing special request additional info...,0,1
4,money get software cds software compatibility ...,0,1
...,...,...,...
5723,research development charges gpg forwarded shi...,1,0
5724,receipts visit jim thanks invitation visit lsu...,1,0
5725,enron case study update wow day super thank mu...,1,0
5726,interest david please call shirley crenshaw as...,1,0


In [6]:
# Turn the cleaned message text into a sparse TF-IDF matrix, one row per message.
# NOTE(review): the vocabulary and IDF weights are fitted on every row, before
# the train/test split further down, so held-out text influences the features.
# Consider fitting on the training rows only (e.g. inside a Pipeline) — TODO
# confirm whether this is intentional.
vectoriser = TfidfVectorizer()
vectorised_data = vectoriser.fit_transform(data["text"])

In [7]:
# Peek at the first 20 TF-IDF features of the first message as a dense matrix.
vectorised_data[0, :20].todense()

matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.]])

In [22]:
# Feature matrix: the TF-IDF columns plus the `is_reply` flag as one extra column.
# format="csr" is requested explicitly because hstack returns COO by default,
# and COO matrices cannot be row-indexed with fold indices.
X = scipy.sparse.hstack(
    [vectorised_data, data[["is_reply"]].to_numpy()], format="csr"
)
# Target labels as a plain NumPy array so positional indices apply directly.
y = data["spam"].to_numpy()

# Modelling


In [23]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)
# Seed the forest as well: its bootstrap sampling and feature sub-sampling are
# random, so without a seed the reported scores change on every run.
model_1 = RandomForestClassifier(random_state=42)

In [24]:
def get_results(model, splitter, X, y):
    """Cross-validate ``model`` and print accuracy, recall and precision summaries.

    For every ``(train_idx, val_idx)`` pair yielded by ``splitter.split(X, y)``
    the model is fitted on the training rows and scored on the validation
    rows. The mean, standard deviation and maximum of each metric across the
    folds are then printed as percentages. Nothing is returned.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``. ``fit`` is called
        once per fold on this same object.
    splitter
        Object exposing ``split(X, y)`` that yields pairs of positional row
        index arrays (e.g. ``StratifiedKFold``).
    X
        2-D feature container supporting ``X[rows, :]``. SciPy sparse input
        is converted to CSR first, so COO input is accepted too.
    y
        1-D labels. Any array-like is accepted, including a pandas Series
        with an arbitrary index.
    """
    # COO sparse matrices cannot be sliced by row; CSR can.
    if scipy.sparse.issparse(X):
        X = X.tocsr()
    # The splitter yields positions, not labels. Indexing a pandas Series with
    # them would do a label lookup and silently pick the wrong rows.
    y = np.asarray(y)

    scores = {"accuracy": [], "recall": [], "precision": []}

    for train_idx, val_idx in splitter.split(X, y):
        model.fit(X[train_idx, :], y[train_idx])

        y_val = y[val_idx]
        y_pred = model.predict(X[val_idx, :])

        scores["accuracy"].append(accuracy_score(y_true=y_val, y_pred=y_pred))
        scores["recall"].append(recall_score(y_true=y_val, y_pred=y_pred))
        scores["precision"].append(precision_score(y_true=y_val, y_pred=y_pred))

    # Dict insertion order fixes the report order: accuracy, recall, precision.
    for name, values in scores.items():
        print(
            f"Average {name} is {100*np.mean(values):.4f} +- {100*np.std(values):.4f} with max {100*np.max(values):.4f}"
        )

In [25]:
# 5-fold stratified cross-validation on the training portion only; shuffling
# with a fixed seed keeps the fold assignment repeatable.
# NOTE(review): the saved output for this cell reports identical figures for
# accuracy and recall, which looks like it came from a different version of
# get_results — re-run the cell to confirm the numbers match the current code.
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
get_results(model_1, splitter, X_train, y_train)

Average accuracy is 96.8275 +- 0.5715 with max 97.6744
Average recall is 96.8275 +- 0.5715 with max 97.6744
Average precision is 96.9328 +- 0.5491 with max 97.7164


In [14]:
# Inspect the training labels produced by train_test_split.
# NOTE(review): the saved output shows a pandas Series, whereas `y` is now built
# with `.values` (a NumPy array), and this cell's execution count is out of
# order — the output looks stale; re-run or remove this cell.
y_train

2859    0
4130    0
1137    1
3161    0
5286    0
       ..
3772    0
5191    0
5226    0
5390    0
860     1
Name: spam, Length: 3436, dtype: int64