In [3]:
from pprint import pprint
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer

from sklearn.model_selection import train_test_split

from sklearn.metrics \
import classification_report, recall_score, accuracy_score,precision_score, make_scorer,confusion_matrix
from sklearn.model_selection import GridSearchCV


In [4]:
data = pd.read_csv("processed_dataset.csv")

In [5]:
data.label.value_counts()

label
ham     747
spam    747
Name: count, dtype: int64

In [4]:
data.head()

Unnamed: 0,label,message,length,punct
0,ham,"Yup, no need. I'll jus wait 4 e rain 2 stop.",44,4
1,ham,How much would it cost to hire a hitman,39,0
2,ham,It wont b until 2.15 as trying 2 sort house ou...,60,3
3,ham,Great. Never been better. Each day gives even ...,71,2
4,ham,I had askd u a question some hours before. Its...,53,1


In [5]:
data.loc[:,["message","length","punct"]]

Unnamed: 0,message,length,punct
0,"Yup, no need. I'll jus wait 4 e rain 2 stop.",44,4
1,How much would it cost to hire a hitman,39,0
2,It wont b until 2.15 as trying 2 sort house ou...,60,3
3,Great. Never been better. Each day gives even ...,71,2
4,I had askd u a question some hours before. Its...,53,1
...,...,...,...
1489,Want explicit SEX in 30 secs? Ring 02073162414...,90,3
1490,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...,158,5
1491,Had your contract mobile 11 Mnths? Latest Moto...,160,8
1492,REMINDER FROM O2: To get 2.50 pounds free call...,147,3


### Split data

In [6]:
X_train, X_test, y_train, y_test =\
train_test_split(data.loc[:,["message","length","punct"]],data["label"], test_size= 0.3, random_state= 0)

In [7]:
X_train

Unnamed: 0,message,length,punct
1438,You have 1 new message. Please call 08715205273,47,1
431,"Sorry, left phone upstairs. OK, might be hecti...",107,6
194,Usually the person is unconscious that's in ch...,117,3
240,Then u ask darren go n pick u lor... But i oso...,74,6
1309,Had your mobile 11 months or more? U R entitle...,154,2
...,...,...,...
763,Your free ringtone is waiting to be collected....,158,8
835,XCLUSIVE@CLUBSAISAI 2MOROW 28/5 SOIREE SPECIAL...,135,8
1216,Dorothy@kiefer.com (Bank of Granite issues Str...,156,26
559,Hi good mornin.. Thanku wish u d same..,39,4


In [13]:
type(X_train.loc[:,["message"]])

pandas.core.frame.DataFrame

### Define pipelines

In [14]:
numeric_features = ["length", "punct"]
categorical_features = ["message"]

numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)

categorical_transformer = Pipeline(
    steps=[
        ("squeez", FunctionTransformer(lambda x: x.squeeze())), # make sure you pass a series
        ("tfidf",TfidfVectorizer())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

classifierRF = Pipeline(
    [
        ("preprocessor",preprocessor),
        ("classifier",RandomForestClassifier(n_estimators=100,random_state=0))
    ]
)

classifierSVC = Pipeline(
    [
        ("preprocessor",preprocessor),
        ("svc",svm.SVC(random_state=0))
    ]
)


### Applying 5 fld cross validation, with a hyperparameter search
### Score metric to be used is precision score

In [18]:
param_grid_svc = {
    "svc__C": [1, 10, 100],
    "svc__kernel": ['linear', 'rbf', 'sigmoid'],
    "svc__gamma": ["auto","scale"]
}
best_svc = GridSearchCV(classifierSVC, param_grid_svc,n_jobs=8,cv=5,scoring=make_scorer(precision_score,pos_label="spam"))

### Fit

In [19]:
best_svc.fit(X_train, y_train)
print("using precission as the Best parameter to evaluate on (CV score=%0.3f):" % best_svc.best_score_)
print(best_svc.best_params_)

using precission as the Best parameter to evaluate on (CV score=0.975):
{'svc__C': 10, 'svc__gamma': 'auto', 'svc__kernel': 'linear'}


In [20]:
y_pred_svc = best_svc.predict(X_test)
y_pred_svc_train = best_svc.predict(X_train)
report_svc = classification_report(y_test, y_pred_svc)
print(report_svc)

              precision    recall  f1-score   support

         ham       0.96      0.98      0.97       227
        spam       0.98      0.96      0.97       222

    accuracy                           0.97       449
   macro avg       0.97      0.97      0.97       449
weighted avg       0.97      0.97      0.97       449



In [21]:
pprint(confusion_matrix(y_train, y_pred_svc_train))
pprint(confusion_matrix(y_test, y_pred_svc))

array([[520,   0],
       [  0, 525]])
array([[222,   5],
       [  9, 213]])


### SVC Fit without CV or hyperparameter search

In [23]:
svc_mod = classifierSVC.fit(X_train,y_train)
rf_mod = classifierRF.fit(X_train, y_train)

In [24]:
y_pred_svc = svc_mod.predict(X_test)
report_svc = classification_report(y_test, y_pred_svc)
print(report_svc)

              precision    recall  f1-score   support

         ham       0.92      0.93      0.92       227
        spam       0.92      0.91      0.92       222

    accuracy                           0.92       449
   macro avg       0.92      0.92      0.92       449
weighted avg       0.92      0.92      0.92       449



In [33]:
confusion_matrix(y_test,y_pred_svc)

array([[206,  21],
       [ 18, 204]])

### Please note the benefits of cross validation

### Results with RF

In [25]:
y_pred_rf = rf_mod.predict(X_test)
report_rf = classification_report(y_test, y_pred_rf, output_dict=True)
report_rf["spam"]

{'precision': 0.9805825242718447,
 'recall': 0.9099099099099099,
 'f1-score': 0.9439252336448598,
 'support': 222.0}

SVC seems to be the path here, however, there is still needs to be hypterparameter tuning
as well as crossvalidation

In [26]:
confusion_matrix(y_test, y_pred_rf)

array([[223,   4],
       [ 20, 202]])

using precission as the Best parameter to evaluate on (CV score=0.944):
{'svc__C': 10, 'svc__kernel': 'linear'}


In [27]:
y_pred_svc = search.predict(X_test)
report_svc = classification_report(y_test, y_pred_svc, output_dict=True)
report_svc["spam"]

NameError: name 'search' is not defined

In [1]:
from sklearn.metrics import confusion_matrix

In [28]:
confusion_matrix(y_test,y_pred_svc)

array([[210,  17],
       [ 19, 203]])