In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split, cross_val_score 
from sklearn import svm
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("spam.csv")

In [3]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
X = df["Message"].values
y = df["Category"].values

In [5]:
le = LabelEncoder()

In [6]:
df['Category'] = le.fit_transform(df['Category'])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [8]:
c_vector = CountVectorizer()
X_train = c_vector.fit_transform(X_train)
X_test = c_vector.transform(X_test)

In [134]:
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [135]:
X.shape

(5572,)

In [136]:
y.shape

(5572,)

In [137]:
svc = SVC()

In [138]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=30)

In [139]:
cv_score = cross_val_score(svc, X_train_smote, y_train_smote, cv=cv, n_jobs=5)

In [140]:
params = {
    'C': [0.1, 1, 10, 100], 
    'gamma': [1e-3, 1e-2, 0.1, 1], 
    'kernel': ['rbf']
}

    

In [141]:
model = GridSearchCV(
    svc,
    param_grid= params,
    cv=cv,
    n_jobs=5,
    verbose=1
)

In [142]:
model.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [143]:
model.score(X_train, y_train)

1.0

In [144]:
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       979
        spam       1.00      0.90      0.95       136

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [145]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [146]:
rfc = RandomForestClassifier()

In [147]:
params1 = {
    "n_estimators": range(25, 100, 10),
    "max_depth": (10, 50, 10)
}

In [148]:
model1 = GridSearchCV(
    rfc,
    param_grid = params1,
    cv = cv,
    n_jobs = 5,
    verbose = 1
)

In [149]:
model1.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [150]:
model1.score(X_train, y_train)

0.9901278887143818

In [151]:
print(classification_report(y_test, model1.predict(X_test)))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       979
        spam       1.00      0.76      0.86       136

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



In [152]:
dtc = DecisionTreeClassifier()

In [153]:
params2 = {
    "max_depth": range(25, 100, 10),
    "min_samples_split": range(10, 50, 10),
    "min_samples_leaf": range(10, 50, 10)
}

In [154]:
model2 = GridSearchCV(
    dtc,
    param_grid = params2,
    cv = cv,
    n_jobs = 5,
    verbose = 1
)


In [155]:
model2.fit(X_train, y_train)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


In [156]:
model2.score(X_train, y_train)

0.9647745120035899

In [157]:
print(classification_report(y_test, model2.predict(X_test)))

              precision    recall  f1-score   support

         ham       0.97      0.99      0.98       979
        spam       0.88      0.75      0.81       136

    accuracy                           0.96      1115
   macro avg       0.92      0.87      0.89      1115
weighted avg       0.96      0.96      0.96      1115



In [10]:
import joblib

In [159]:
best_model = model.best_estimator_

In [161]:
joblib.dump(best_model, "SVM_model.pkl")

['SVM_model.pkl']

In [11]:
joblib.dump(c_vector, "vectorizer.pkl")

['vectorizer.pkl']