# **SPAM SMS Classifier**

In [1]:
import pandas as pd

## **Data Preprocessing**

In [2]:
# Read the raw SMS dataset from disk, decoding the file as ISO-8859-1
# (presumably the CSV is not valid UTF-8 - TODO confirm).
csv_path = "/content/spam.csv"
df = pd.read_csv(csv_path, encoding="ISO-8859-1")

In [3]:
# Keep only the label column ("v1") and the message column ("v2"), renamed to
# "target" and "SMS"; every other column of the raw file is discarded.
df = df[["v1", "v2"]].rename(columns={"v1": "target", "v2": "SMS"})
df.head()

Unnamed: 0,target,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Encode the label as a boolean flag: True where the label is "spam", False
# for any other value. The text label is dropped once the flag exists.
df = df.assign(spam=df["target"].eq("spam")).drop(columns="target")

In [5]:
# Preview the first three rows to confirm the new boolean `spam` column.
df.head(n=3)

Unnamed: 0,SMS,spam
0,"Go until jurong point, crazy.. Available only ...",False
1,Ok lar... Joking wif u oni...,False
2,Free entry in 2 a wkly comp to win FA Cup fina...,True


In [17]:
# Class balance: how many messages are flagged spam (True) vs. not (False).
df["spam"].value_counts()

False    4825
True      747
Name: spam, dtype: int64

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing. `stratify` keeps the spam / non-spam
# ratio the same in both partitions, and the fixed `random_state` makes the
# split (and every score derived from it) reproducible after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(
    df.SMS,
    df.spam,
    test_size = 0.2,
    stratify = df.spam,
    random_state = 42
)

## **Vectorization Of Text Data**

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training messages only. Fitting on the whole
# corpus (df.SMS) would let tokens from the held-out messages shape the feature
# space, leaking test information into training and inflating the test score.
vectorizer = CountVectorizer()

# The count matrices are converted to dense arrays; presumably because some of
# the estimators tried later (e.g. GaussianNB) need dense input - TODO confirm.
word_count_train = vectorizer.fit_transform(x_train).toarray()
word_count_test = vectorizer.transform(x_test).toarray()

# Sanity check: show the first five rows of each matrix.
print(word_count_train[:5], word_count_test[:5], sep = "\n\n")

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## **Grid Search with K-Fold Cross-Validation**

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [9]:
# Candidate estimators and the hyper-parameter values to search for each one.
# Every entry maps a display name to:
#   "model"      - an unfitted estimator instance
#   "parameters" - the parameter grid passed to GridSearchCV; an empty dict
#                  means the estimator is only evaluated with its defaults
grid = {
    "Random Forest" : {
        # Seeded so that repeated runs of the search build the same forests
        # and therefore report the same scores.
        "model" : RandomForestClassifier(random_state = 42),
        "parameters" : {
            "n_estimators" : [25, 50, 100]
        }
    },

    "Support Vector Machine" : {
        "model" : SVC(),
        "parameters" : {
            "C" : [1, 10],
            "kernel" : ["linear", "sigmoid", "rbf"]
        }
    },

    "Gaussian Naive Bayes" : {
        "model" : GaussianNB(),
        "parameters" : {}
    },

    "K Nearest Neighbors" : {
        "model" : KNeighborsClassifier(),
        "parameters" : {
            "n_neighbors" : [5, 10, 20],
            "algorithm" : ["auto", "brute"]
        }
    }
}

In [10]:
from sklearn.model_selection import GridSearchCV

# Run a grid search for every candidate in `grid` on the training features and
# print, one per line: the display name, the best estimator found, its best
# cross-validated score, and the winning parameter combination.
for model_name, spec in grid.items():
    clf = GridSearchCV(spec["model"], spec["parameters"])
    clf.fit(word_count_train, y_train)

    report = (
        model_name,
        clf.best_estimator_,
        clf.best_score_,
        clf.best_params_,
    )
    print(*report, sep="\n")

    # Blank lines separating one model's report from the next.
    print("\n\n")

Random Forest
RandomForestClassifier()
0.9735269989380602
{'n_estimators': 100}



Support Vector Machine
SVC(C=10, kernel='linear')
0.984070400064421
{'C': 10, 'kernel': 'linear'}



Gaussian Naive Bayes
GaussianNB()
0.9111516258750937
{}



K Nearest Neighbors
KNeighborsClassifier()
0.9075621687729312
{'algorithm': 'auto', 'n_neighbors': 5}





In [13]:
# Train the configuration the grid search reported as best for the SVM
# (C=10, linear kernel) on the full training split, then score it on the
# held-out split. `fit` returns the estimator itself, so the call is chained.
model = SVC(kernel="linear", C=10).fit(word_count_train, y_train)
model.score(word_count_test, y_test)

0.9856502242152466

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Cross-validate on the raw messages with the vectorizer inside the pipeline,
# so each fold learns its vocabulary from that fold's training messages only.
# Passing pre-vectorized features would reuse a vocabulary fitted outside the
# folds and leak information from the validation messages into training.
# cross_val_score fits clones of the estimator, so the already-trained `model`
# is left untouched for the prediction cell further down.
score = cross_val_score(
    make_pipeline(CountVectorizer(), model),
    df.SMS,
    df.spam,
    cv = 5
)

In [15]:
# Display the array of per-fold scores returned by cross_val_score (cv = 5).
score

array([0.98475336, 0.98475336, 0.98025135, 0.98294434, 0.98563734])

In [26]:
# Two hand-written messages used as a quick smoke test of the trained model.
text = (
    "WINNER!!!!......FREE SERVICES : FREE recharge if you call quickly!!!!",
    "Send me the document",
)

# Encode the messages with the already-fitted vectorizer, then classify them.
# The features are densified before predicting, matching the dense arrays the
# model was trained on.
text_vec = vectorizer.transform(text)
model.predict(text_vec.toarray())

array([ True, False])