In [11]:
from pprint import pprint
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer

from sklearn.model_selection import train_test_split

from sklearn.metrics \
import classification_report, recall_score, accuracy_score,precision_score, make_scorer,confusion_matrix
from sklearn.model_selection import GridSearchCV


In [5]:
# Load the already-processed message dataset from CSV into a DataFrame.
data = pd.read_csv("processed_dataset.csv")

In [5]:
# Number of rows per class label, to check class balance.
data.label.value_counts()

label
ham     747
spam    747
Name: count, dtype: int64

In [7]:
# Preview the first rows of the dataset.
data.head()

Unnamed: 0,label,message,length,punct
0,ham,Ugh just got outta class,24,0
1,ham,You have to pls make a note of all she.s expos...,166,5
2,ham,Hey whats up? U sleeping all morning?,37,2
3,ham,Ok lor...,9,3
4,ham,You are not bothering me but you have to trust...,63,2


In [62]:
# Show only the columns used as model inputs: the message text plus the
# two numeric columns `length` and `punct`.
data.loc[:,["message","length","punct"]]

Unnamed: 0,message,length,punct
0,Ugh just got outta class,24,0
1,You have to pls make a note of all she.s expos...,166,5
2,Hey whats up? U sleeping all morning?,37,2
3,Ok lor...,9,3
4,You are not bothering me but you have to trust...,63,2
...,...,...,...
1489,Want explicit SEX in 30 secs? Ring 02073162414...,90,3
1490,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...,158,5
1491,Had your contract mobile 11 Mnths? Latest Moto...,160,8
1492,REMINDER FROM O2: To get 2.50 pounds free call...,147,3


### Split data

In [6]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
feature_columns = ["message", "length", "punct"]
X_train, X_test, y_train, y_test = train_test_split(
    data.loc[:, feature_columns],
    data["label"],
    test_size=0.3,
    random_state=0,
)

In [136]:
# Selecting with a bare column name: the result (after squeeze()) is a Series.
type(X_train["message"].squeeze())

pandas.core.series.Series

In [140]:
# Selecting with a list of column names keeps a (one-column) DataFrame.
type(X_train.loc[:,["message"]])

pandas.core.frame.DataFrame

### Define pipelines

In [45]:
numeric_features = ["length", "punct"]
categorical_features = ["message"]

# TfidfVectorizer expects a 1-D iterable of documents. Giving ColumnTransformer
# a bare column name (rather than a list of names) makes it hand the column to
# the transformer as 1-D, so no explicit "squeeze" step is required.
# The earlier `FunctionTransformer(lambda x: x.squeeze())` step had two issues:
#   * squeeze() on a single-row frame returns a bare string, so predicting on
#     one message failed inside TfidfVectorizer;
#   * a lambda cannot be pickled, so the fitted model could not be saved with
#     the standard pickle module.
text_feature = categorical_features[0]

# Standardise the numeric columns.
numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)

# TF-IDF encode the raw message text.
categorical_transformer = Pipeline(
    steps=[("tfidf", TfidfVectorizer())]
)


def make_preprocessor():
    """Return a new, unfitted ColumnTransformer for the feature columns.

    Each model pipeline gets its own instance: Pipeline.fit fits its steps in
    place, so sharing one preprocessor object between two pipelines would let
    fitting one pipeline silently overwrite the other's preprocessing state.
    """
    return ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, text_feature),
        ]
    )


# Stand-alone instance kept for inspection / backward compatibility.
preprocessor = make_preprocessor()

# Random forest on scaled numeric features + TF-IDF text features.
classifierRF = Pipeline(
    [
        ("preprocessor", make_preprocessor()),
        ("classifier", RandomForestClassifier(n_estimators=100, random_state=0)),
    ]
)

# Support vector classifier on the same features; the step is named "svc" so
# grid-search parameters are addressed as "svc__<param>".
classifierSVC = Pipeline(
    [
        ("preprocessor", make_preprocessor()),
        ("svc", svm.SVC(random_state=0)),
    ]
)


### Applying 5-fold cross-validation with a hyperparameter search
### The scoring metric is precision on the spam class

In [46]:
# Model selection is driven by precision on the "spam" class.
spam_precision = make_scorer(precision_score, pos_label="spam")

# Candidate SVC settings; keys are prefixed with the pipeline step name "svc".
param_grid_svc = {
    "svc__C": [1, 10, 100],
    "svc__kernel": ["linear", "rbf", "sigmoid"],
    "svc__gamma": ["auto", "scale"],
}

# Exhaustive search over the grid with 5-fold cross-validation.
best_svc = GridSearchCV(
    estimator=classifierSVC,
    param_grid=param_grid_svc,
    scoring=spam_precision,
    n_jobs=8,
    cv=5,
)

### Fit

In [47]:
# Run the grid search on the training split, then report the best mean
# cross-validated score and the parameter combination that achieved it.
best_svc.fit(X_train, y_train)
summary_template = "using precission as the Best parameter to evaluate on (CV score=%0.3f):"
print(summary_template % best_svc.best_score_)
print(best_svc.best_params_)

using precission as the Best parameter to evaluate on (CV score=0.971):
{'svc__C': 1, 'svc__gamma': 'auto', 'svc__kernel': 'linear'}


In [48]:
# Predictions from the grid search on both splits; the training-split
# predictions are kept for the confusion-matrix comparison.
y_pred_svc_train = best_svc.predict(X_train)
y_pred_svc = best_svc.predict(X_test)

# Per-class precision / recall / F1 on the held-out test split.
report_svc = classification_report(y_true=y_test, y_pred=y_pred_svc)
print(report_svc)

              precision    recall  f1-score   support

         ham       0.95      0.98      0.97       227
        spam       0.98      0.95      0.96       222

    accuracy                           0.96       449
   macro avg       0.97      0.96      0.96       449
weighted avg       0.96      0.96      0.96       449



In [50]:
# Confusion matrices for the tuned SVC: training split first, then test split.
for true_labels, predicted_labels in (
    (y_train, y_pred_svc_train),
    (y_test, y_pred_svc),
):
    pprint(confusion_matrix(true_labels, predicted_labels))

array([[517,   3],
       [  3, 522]])
array([[223,   4],
       [ 12, 210]])


### SVC Fit without CV or hyperparameter search

In [30]:
# Baseline: fit both pipelines directly on the training split, using the
# hyperparameters given when the pipelines were defined (no grid search).
svc_mod = classifierSVC.fit(X_train,y_train)
rf_mod = classifierRF.fit(X_train, y_train)

In [37]:
# Test-split report for the untuned SVC pipeline.
# NOTE: this reassigns y_pred_svc / report_svc from the grid-search cells.
y_pred_svc = svc_mod.predict(X_test)
report_svc = classification_report(y_true=y_test, y_pred=y_pred_svc)
print(report_svc)

              precision    recall  f1-score   support

         ham       0.92      0.91      0.91       227
        spam       0.91      0.92      0.91       222

    accuracy                           0.91       449
   macro avg       0.91      0.91      0.91       449
weighted avg       0.91      0.91      0.91       449



In [33]:
# Confusion matrix on the test split for the current y_pred_svc.
confusion_matrix(y_test,y_pred_svc)

array([[206,  21],
       [ 18, 204]])

### Please note the benefits of cross validation

### Results with RF

In [34]:
# Test-split metrics for the random-forest pipeline; the report is requested
# as a dict so the "spam" class entry can be displayed on its own.
y_pred_rf = rf_mod.predict(X_test)
report_rf = classification_report(
    y_true=y_test,
    y_pred=y_pred_rf,
    output_dict=True,
)
report_rf["spam"]

{'precision': 0.9901477832512315,
 'recall': 0.9054054054054054,
 'f1-score': 0.9458823529411764,
 'support': 222.0}

SVC seems to be the way to go here; however, it still needs hyperparameter tuning
as well as cross-validation.

In [35]:
# Confusion matrix on the test split for the random-forest predictions.
confusion_matrix(y_test, y_pred_rf)

array([[225,   2],
       [ 21, 201]])

using precission as the Best parameter to evaluate on (CV score=0.944):
{'svc__C': 10, 'svc__kernel': 'linear'}


In [158]:
# Spam-class metrics for the tuned SVC on the test split.
# This cell previously called `search.predict`, but no `search` object is
# defined anywhere in the notebook (left over from a deleted cell), so it
# failed on a fresh Restart & Run All. `best_svc` is the fitted GridSearchCV
# and its predictions come from the best estimator found by the search.
y_pred_svc = best_svc.predict(X_test)
report_svc = classification_report(y_test, y_pred_svc, output_dict=True)
report_svc["spam"]

{'precision': 0.9813084112149533,
 'recall': 0.9459459459459459,
 'f1-score': 0.963302752293578,
 'support': 222.0}

In [1]:
from sklearn.metrics import confusion_matrix

In [2]:
# Requires y_test and y_pred_svc from the earlier cells to exist in the
# kernel; run the notebook top to bottom before executing this cell.
confusion_matrix(y_test,y_pred_svc)

NameError: name 'y_test' is not defined