# SMS Spam Collection by TF-IDF (Lemmatization)

- Dataset url : https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection

In [1]:
# importing pandas for reading the dataset
import pandas as pd

In [2]:
# Load the tab-separated SMS file; it has no header row, so the two column
# names are supplied explicitly.
messages = pd.read_csv(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
)

In [3]:
# Preview the first five rows to check the label/message columns parsed correctly.
messages.head(n=5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Data cleaning and preprocessing :
import re
import nltk
from nltk.corpus import stopwords

In [5]:
# Import the WordNet lemmatizer and create the single instance that the
# corpus-cleaning loop below uses to reduce each word to its lemma.
# NOTE(review): no nltk.download(...) call is visible in this notebook, so the
# 'stopwords' and 'wordnet' corpora are presumably already installed locally —
# confirm before running on a fresh environment.
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

In [6]:
# Build the cleaned corpus: one normalised, space-joined string per SMS.
#
# The English stop-word list is loop-invariant, so it is loaded once and kept
# in a set; the original re-read the list from NLTK for every single token and
# did a linear scan through it each time.
stop_words = set(stopwords.words("english"))

corpus = []
# Iterate over the column values directly rather than label-indexing with
# messages['message'][i], which only works while the index is exactly 0..n-1.
for message in messages['message']:
    # Replace everything that is not an ASCII letter with a space.
    # The character class must be A-Z: the former range "A-z" also covers the
    # ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), so those symbols
    # leaked through the cleaning step.
    review = re.sub("[^a-zA-Z]", " ", message)
    # Lower-case and tokenise on whitespace.
    review = review.lower().split()
    # Drop stop words and lemmatise what remains.
    review = [lemma.lemmatize(word) for word in review if word not in stop_words]
    corpus.append(" ".join(review))

In [7]:
# Vectorise the cleaned corpus with TF-IDF, keeping at most 5000 terms.
# NOTE(review): the vectorizer is fitted on the whole corpus here, before the
# train/test split performed further down, so the vocabulary and IDF weights
# are learned from the test messages as well — consider fitting on the
# training portion only.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)
tfidf_sparse = tfidf.fit_transform(corpus)
# Independent features as a dense array.
X = tfidf_sparse.toarray()

In [8]:
# Dependent feature: 1 for spam, 0 for ham.
# Built as a 1-D integer Series instead of the one-column DataFrame returned by
# pd.get_dummies(..., drop_first=True): scikit-learn estimators expect a 1-D
# target and emit a DataConversionWarning for a column vector, and recent
# pandas versions return bool dummies, which would change the 0/1 class labels
# shown in the evaluation reports.
y = (messages['label'] == 'spam').astype(int)

In [9]:
# Hold out 20% of the samples for evaluation; random_state pins the shuffle so
# the split is reproducible.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [10]:
# Fit a multinomial Naive Bayes classifier on the training features/labels.
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
model = mnb.fit(X=x_train, y=y_train)

  return f(**kwargs)


In [11]:
# Predict labels for the held-out test features.
y_pred = model.predict(X=x_test)

In [12]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
from sklearn.metrics import confusion_matrix
confusion_m = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confusion_m)

[[955   0]
 [ 25 135]]


In [13]:
# Overall accuracy: fraction of test messages classified correctly.
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

0.9775784753363229


In [14]:
# Per-class precision / recall / F1 summary (supplementary to the accuracy above).
from sklearn.metrics import classification_report
classi = classification_report(y_true=y_test, y_pred=y_pred)
print(classi)

              precision    recall  f1-score   support

           0       0.97      1.00      0.99       955
           1       1.00      0.84      0.92       160

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

