In [48]:
import nltk
import pandas as pd

In [49]:
sms_data = pd.read_csv("SMSSpamCollection-Copy1",sep="\t",names=["label", "message"])

In [50]:
sms_data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [51]:
#Text cleaning and preprocessing

import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [52]:
sms_data['message'][1]

'Ok lar... Joking wif u oni...'

In [53]:
lemmatizer = WordNetLemmatizer()

corpus = []

for i in range(0,len(sms_data)):
    clean = re.sub('[^a-zA-Z]'," ",sms_data['message'][i])
    clean = clean.lower()
    clean = clean.split()
    
    clean = [lemmatizer.lemmatize(word) for word in clean if word not in set(stopwords.words('english'))]
    clean = " ".join(clean)
    corpus.append(clean)

In [54]:
#Bagging of words using TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features = 2500)
X = tfidf.fit_transform(corpus).toarray()

In [55]:
# Label encoding for dependent variable

y = pd.get_dummies(sms_data['label'])
y = y.iloc[:,1].values

In [56]:
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [57]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

## Naive Bayes

In [40]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train,y_train)

In [41]:
y_prediction = spam_detect_model.predict(X_test)

In [42]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test,y_prediction)
acc

0.9814593301435407

In [43]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_prediction)
cm

array([[1451,    0],
       [  31,  190]], dtype=int64)

## Random Forest

In [60]:
from sklearn.ensemble import RandomForestClassifier 
rf_spam_detect_model = RandomForestClassifier().fit(X_train,y_train)
y_prediction = rf_spam_detect_model.predict(X_test)
acc = accuracy_score(y_test,y_prediction)
cm = confusion_matrix(y_test,y_prediction)
print(cm)
print(acc)

[[1449    2]
 [  31  190]]
0.9802631578947368


## Logistic Regression

In [61]:
from sklearn.linear_model import LogisticRegression 
lr_spam_detect_model = LogisticRegression().fit(X_train,y_train)
y_prediction = lr_spam_detect_model.predict(X_test)
acc = accuracy_score(y_test,y_prediction)
cm = confusion_matrix(y_test,y_prediction)
print(cm)
print(acc)

[[1449    2]
 [  52  169]]
0.9677033492822966


## SVM

In [63]:
from sklearn import svm
svm_spam_detect_model = svm.SVC().fit(X_train,y_train)
y_prediction = lr_spam_detect_model.predict(X_test)
acc = accuracy_score(y_test,y_prediction)
cm = confusion_matrix(y_test,y_prediction)
print(cm)
print(acc)

[[1449    2]
 [  52  169]]
0.9677033492822966
