# Spam SMS Classifier - NLP Problem

In [1]:
## Importing all required libraries...
import pandas as pd
from sklearn import metrics
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
## Loading the tab-separated SMS file; column names are supplied explicitly via `names`...
df_msg = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=['Category', 'Message'],
)
df_msg.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
## Applying Lemmatization and removing stopwords...
# NOTE(review): presumably the NLTK 'stopwords' and 'wordnet' corpora must already be
# downloaded -- no nltk.download(...) call is visible in this notebook; confirm.
lem = WordNetLemmatizer()

# Build the stopword set ONCE. The original rebuilt set(stopwords.words('english'))
# inside the comprehension, i.e. once per word of every message.
stop_words = set(stopwords.words('english'))

temp_cont = []
for message in df_msg['Message'].values:
    # Replace every non-letter character with a space, lower-case, split on whitespace.
    tokens = re.sub('[^a-zA-Z]' , ' ' , message).lower().split()
    # Drop stopwords and lemmatize the remaining tokens.
    lemmas = [lem.lemmatize(word) for word in tokens if word not in stop_words]
    # Store the cleaned message as a single space-joined string.
    temp_cont.append(" ".join(lemmas))

In [4]:
## Overwriting the "Message" column with the preprocessed sentences
## (the raw text in df_msg is replaced in place; temp_cont has one entry per row)....
df_msg['Message'] = temp_cont

In [5]:
## Previewing the first rows to check the cleaned "Message" text...
df_msg.head()

Unnamed: 0,Category,Message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah think go usf life around though


In [6]:
## Applying TF-IDF Vectorization.....
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap the vocabulary at 5500 terms (max_features).
Tfidf = TfidfVectorizer(max_features = 5500)

# Learn the vocabulary / IDF weights and vectorize in a single pass; the original
# called fit() and then transform() on the same data, tokenizing the corpus twice.
# .toarray() converts the result into a dense array.
# NOTE(review): the vectorizer is fitted on ALL messages, including the rows that
# later become the test split, so vocabulary and IDF statistics leak from the test
# set. Consider splitting first and fitting on the training messages only.
X = Tfidf.fit_transform(df_msg['Message']).toarray()

In [7]:
## Converting Target Variable to Dummy variables....
# drop_first=True drops the first category's column, leaving the single
# 'Target_spam' indicator column.
# dtype=int pins the indicator to 0/1 integers: newer pandas releases default to
# booleans, which would change the labels shown in the outputs and reports below.
Cate = pd.get_dummies(df_msg['Category'] , drop_first = True , prefix = 'Target' , dtype = int)

In [8]:
## Previewing the encoded target column...
Cate.head()

Unnamed: 0,Target_spam
0,0
1,0
2,1
3,0
4,0


In [9]:
## Applying Train-Test Split....
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train , x_test , y_train , y_test = train_test_split(
    X ,
    Cate ,
    test_size = 0.20 ,
    random_state = 50 ,
)

### Logistic Regression:

In [10]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings.
log_reg = LogisticRegression()

# y_train is a single-column frame; flatten it to a 1-D array before fitting.
# NOTE(review): the stray "return f(**kwargs)" in this cell's saved output looks like
# the tail of scikit-learn's column-vector (DataConversionWarning) message, which a
# 1-D target avoids -- confirm on re-run.
# The fit call stays the last expression so the fitted estimator is still displayed.
log_reg.fit(x_train , y_train.values.ravel())

  return f(**kwargs)


LogisticRegression()

In [11]:
# Mean accuracy on the training split, expressed as a percentage with 3 decimals.
lr_train_acc = round(log_reg.score(x_train , y_train) * 100 , 3)
print("The Train accuracy is :" , lr_train_acc , "%")

The Train accuracy is : 96.949 %


In [12]:
## Predicting test data....
y_test_pred = log_reg.predict(x_test)

# Accuracy on the held-out split, as a percentage rounded to 3 decimals.
lr_test_acc = round(metrics.accuracy_score(y_test , y_test_pred) * 100 , 3)
print("The test accuracy is :" , lr_test_acc , "%")

The test accuracy is : 96.323 %


In [13]:
## Confusion Matrix of test results....
# Wrapped in a DataFrame so the notebook renders the matrix as a table.
lr_conf_matrix = metrics.confusion_matrix(y_test , y_test_pred)
pd.DataFrame(lr_conf_matrix)

Unnamed: 0,0,1
0,969,1
1,40,105


In [14]:
## Printing the classification report.....
# classification_report returns plain text, so print() is used to keep its layout.
lr_report = metrics.classification_report(y_test , y_test_pred)
print(lr_report)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       970
           1       0.99      0.72      0.84       145

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



### Naive Bayes:


In [15]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with scikit-learn's default settings.
NV = MultinomialNB()

# y_train is a single-column frame; flatten it to a 1-D array before fitting.
# NOTE(review): the stray "return f(**kwargs)" in this cell's saved output looks like
# the tail of scikit-learn's column-vector (DataConversionWarning) message, which a
# 1-D target avoids -- confirm on re-run.
# The fit call stays the last expression so the fitted estimator is still displayed.
NV.fit(x_train , y_train.values.ravel())

  return f(**kwargs)


MultinomialNB()

In [16]:
# Mean accuracy on the training split, expressed as a percentage with 3 decimals.
nv_train_acc = round(NV.score(x_train , y_train) * 100 , 3)
print("The Train accuracy is :" , nv_train_acc , "%")

The Train accuracy is : 97.756 %


In [17]:
# Confusion matrix of the Naive Bayes test predictions, rendered as a table.
nv_conf_matrix = metrics.confusion_matrix(y_test , NV.predict(x_test))
pd.DataFrame(nv_conf_matrix)

Unnamed: 0,0,1
0,970,0
1,29,116


In [18]:
# Predict once and reuse the result for both metrics; the original re-ran
# NV.predict(x_test) separately for the accuracy and for the report.
nv_test_pred = NV.predict(x_test)

# Accuracy on the held-out split as a percentage, then the per-class text report.
print("The test accuracy is :" , round(metrics.accuracy_score(y_test , nv_test_pred)*100 , 3) , "%")
print(metrics.classification_report(y_test , nv_test_pred))

The test accuracy is : 97.399 %
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       970
           1       1.00      0.80      0.89       145

    accuracy                           0.97      1115
   macro avg       0.99      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115



#### Using the Naive Bayes classifier, we attain a test accuracy of about 97%, with F1 scores of 0.89 for the spam class and 0.99 for the ham class.