In [37]:
from pprint import pprint
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer

from sklearn.model_selection import train_test_split

from sklearn.metrics \
import classification_report, recall_score, accuracy_score,precision_score,f1_score, make_scorer,confusion_matrix, average_precision_score
from sklearn.model_selection import GridSearchCV


In [89]:
data = pd.read_csv("processed_dataset.csv")
#data = pd.read_csv("spam.tsv",sep="\t")
data.label.value_counts()

label
ham     747
spam    747
Name: count, dtype: int64

In [4]:
data.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [25]:
data.loc[:,["message","length","punct"]]

Unnamed: 0,message,length,punct
0,"Go until jurong point, crazy.. Available only ...",111,9
1,Ok lar... Joking wif u oni...,29,6
2,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,U dun say so early hor... U c already then say...,49,6
4,"Nah I don't think he goes to usf, he lives aro...",61,2
...,...,...,...
5567,This is the 2nd time we have tried 2 contact u...,160,8
5568,Will ü b going to esplanade fr home?,36,1
5569,"Pity, * was in mood for that. So...any other s...",57,7
5570,The guy did some bitching but I acted like i'd...,125,1


### Split data

In [90]:
X_train, X_test, y_train, y_test =\
train_test_split(data.loc[:,["message","length","punct"]],data["label"], test_size= 0.3, random_state= 0)

In [28]:
type(X_train.loc[:,["message"]])

pandas.core.frame.DataFrame

### Define pipelines

In [91]:
numeric_features = ["length", "punct"]
categorical_features = ["message"]

numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)

categorical_transformer = Pipeline(
    steps=[
        ("squeez", FunctionTransformer(lambda x: x.squeeze())), # make sure you pass a series
        ("tfidf",TfidfVectorizer())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

classifierRF = Pipeline(
    [
        ("preprocessor",preprocessor),
        ("rf",RandomForestClassifier(n_estimators=100,random_state=0))
    ]
)

classifierSVC = Pipeline(
    [
        ("preprocessor",preprocessor),
        ("svc",svm.SVC(random_state=0))
    ]
)

classifierLR = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("logreg", LogisticRegression(solver="liblinear", random_state=0,max_iter=1000))
    ]
)


### Applying 5 fld cross validation, with a hyperparameter search
### Score metric to be used is precision score

In [92]:
from sklearn.model_selection import StratifiedKFold

## on classification, stratified is used by default
cv = StratifiedKFold(n_splits=5, shuffle=False)
#cv = 5

param_grid_svc = {
    "svc__C": [1, 10, 100],
    "svc__kernel": ['linear', 'rbf', 'sigmoid'],
    "svc__gamma": ["auto","scale"]
}



param_grid_rf = {
    'rf__n_estimators': [100, 300],
    'rf__max_depth': [None, 20, 50],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2],
    'rf__class_weight': [None, 'balanced']
}

param_grid_logreg = {
    'logreg__C': [0.01, 0.1, 1, 10],
    'logreg__penalty': ['l1', 'l2'],
    'logreg__solver': ['liblinear','saga'],
    'logreg__class_weight': [None, 'balanced']
}


score_criteria = make_scorer(f1_score,pos_label="spam")

best_svc = GridSearchCV(classifierSVC, param_grid_svc,n_jobs=8,cv=cv,scoring=score_criteria)
best_rf = GridSearchCV(classifierRF, param_grid_rf,n_jobs=8,cv=cv,scoring=score_criteria)
best_logreg = GridSearchCV(classifierLR, param_grid_logreg,n_jobs=8,cv=cv,scoring=score_criteria)

### Fit

In [8]:
import sklearn
print(sklearn.__version__)

1.6.1


In [93]:
models = {
    'svc':best_svc,
    'rf': best_rf,
    'logreg':best_logreg
}

for model_name,model in models.items():
    model.fit(X_train, y_train)
    print("Best parameter to evaluate on (CV score=%0.3f):" % model.best_score_)
    print("Best params were :")
    print(model.best_params_)



Best parameter to evaluate on (CV score=0.960):
Best params were :
{'svc__C': 10, 'svc__gamma': 'auto', 'svc__kernel': 'linear'}
Best parameter to evaluate on (CV score=0.956):
Best params were :
{'rf__class_weight': 'balanced', 'rf__max_depth': None, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 5, 'rf__n_estimators': 300}




Best parameter to evaluate on (CV score=0.961):
Best params were :
{'logreg__C': 10, 'logreg__class_weight': None, 'logreg__penalty': 'l2', 'logreg__solver': 'liblinear'}




In [94]:
for model_name, model in models.items():
    print(f"{model_name}: {model.best_score_}")

svc: 0.9604764785475514
rf: 0.9559905561950938
logreg: 0.9614524883014808


In [95]:
for model_name, model in models.items():
    print(f"{model_name}: {model.best_score_}")
    y_pred_svc = model.predict(X_test)
    print(classification_report(y_test, y_pred_svc))
    # print(report_svc)

svc: 0.9604764785475514
              precision    recall  f1-score   support

         ham       0.97      0.99      0.98       227
        spam       0.99      0.96      0.97       222

    accuracy                           0.98       449
   macro avg       0.98      0.98      0.98       449
weighted avg       0.98      0.98      0.98       449

rf: 0.9559905561950938
              precision    recall  f1-score   support

         ham       0.93      1.00      0.96       227
        spam       1.00      0.92      0.96       222

    accuracy                           0.96       449
   macro avg       0.96      0.96      0.96       449
weighted avg       0.96      0.96      0.96       449

logreg: 0.9614524883014808
              precision    recall  f1-score   support

         ham       0.97      0.99      0.98       227
        spam       0.99      0.96      0.98       222

    accuracy                           0.98       449
   macro avg       0.98      0.98      0.98       449


In [58]:


y_pred_svc = best_svc.predict(X_test)
y_pred_svc_train = best_svc.predict(X_train)
report_svc = classification_report(y_test, y_pred_svc)
print(report_svc)

              precision    recall  f1-score   support

         ham       0.97      0.99      0.98       227
        spam       0.99      0.96      0.97       222

    accuracy                           0.98       449
   macro avg       0.98      0.98      0.98       449
weighted avg       0.98      0.98      0.98       449



In [80]:
pprint(confusion_matrix(y_train, y_pred_svc_train))
confusion_matrix(y_test, y_pred_svc)

array([[3374,    0],
       [   0,  526]])


array([[1445,    6],
       [  13,  208]])

In [21]:
224/237

0.9451476793248945

### SVC Fit without CV or hyperparameter search

In [23]:
svc_mod = classifierSVC.fit(X_train,y_train)
rf_mod = classifierRF.fit(X_train, y_train)

In [24]:
y_pred_svc = svc_mod.predict(X_test)
report_svc = classification_report(y_test, y_pred_svc)
print(report_svc)

              precision    recall  f1-score   support

         ham       0.92      0.93      0.92       227
        spam       0.92      0.91      0.92       222

    accuracy                           0.92       449
   macro avg       0.92      0.92      0.92       449
weighted avg       0.92      0.92      0.92       449



In [33]:
confusion_matrix(y_test,y_pred_svc)

array([[206,  21],
       [ 18, 204]])

### Please note the benefits of cross validation

### Results with RF

In [25]:
y_pred_rf = rf_mod.predict(X_test)
report_rf = classification_report(y_test, y_pred_rf, output_dict=True)
report_rf["spam"]

{'precision': 0.9805825242718447,
 'recall': 0.9099099099099099,
 'f1-score': 0.9439252336448598,
 'support': 222.0}

SVC seems to be the path here, however, there is still needs to be hypterparameter tuning
as well as crossvalidation

In [26]:
confusion_matrix(y_test, y_pred_rf)

array([[223,   4],
       [ 20, 202]])

using precission as the Best parameter to evaluate on (CV score=0.944):
{'svc__C': 10, 'svc__kernel': 'linear'}


In [27]:
y_pred_svc = search.predict(X_test)
report_svc = classification_report(y_test, y_pred_svc, output_dict=True)
report_svc["spam"]

NameError: name 'search' is not defined

In [1]:
from sklearn.metrics import confusion_matrix

In [28]:
confusion_matrix(y_test,y_pred_svc)

array([[210,  17],
       [ 19, 203]])