In [3]:
import pandas as pd

In [6]:
df = pd.read_csv(r'.\\Datasets\\SMSSpamCollection',sep='\t',names=['label','message'])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Data Cleaning and Preprocessing

In [16]:
import re 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation

lemma = WordNetLemmatizer()


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [26]:
def get_clean_text(message):
    pattern = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    clean_text = re.sub('[^a-zA-Z]',' ',message)
    clean_text = clean_text.lower()
    clean_text = clean_text.split()
    clean_text = [lemma.lemmatize(text) for text in clean_text if text not in set(stopwords.words('english'))]
    clean_text = ' '.join(clean_text)
    return clean_text

In [27]:
df['clean_text'] = df['message'].apply(get_clean_text)

##### Bag of Words (CountVectorizer)

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000)
X = cv.fit_transform(df['clean_text']).toarray()


In [39]:
X.shape

(5572, 5000)

In [40]:
y=pd.get_dummies(df['label'])
y=y.iloc[:,1].values

##### Train Test Split

In [45]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

#### Training a model

In [48]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB()
spam_detect_model.fit(x_train,y_train)


MultinomialNB()

In [49]:
y_pred = spam_detect_model.predict(x_test)

In [54]:
for i in y_pred:
    print(f'{y_pred[i]}|{y_test[i]}')


0|0
1|1
0|0
0|0
0|0
0|0
1|1
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
1|1
0|0
0|0
0|0
0|0
0|0
0|0
0|0
1|1
0|0
0|0
0|0
0|0
0|0
0|0
0|0
1|1
0|0
0|0
0|0
1|1
0|0
0|0
1|1
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
1|1
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
1|1
0|0
1|1
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
1|1
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
1|1
0|0
0|0
1|1
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
1|1
0|0
0|0
0|0
0|0
0|0
0|0
0|0
1|1
1|1
0|0
0|0
0|0
0|0
0|0
0|0
1|1
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
1|1
0|0
0|0
0|0
1|1
1|1
0|0
1|1
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
1|1
0|0
0|0
0|0
0|0
1|1
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
1|1
0|0
0|0
0|0
1|1
0|0
0|0
0|0
1|1
0|0
0|0
1|1
0|0
0|0
0|0
0|0
0|0
0|0
1|1
0|0
1|1
0|0
0|0
0|0
1|1
0|0
0|0
0|0
0|0
0|0
0|0
1|1
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
0|0
1|1


In [55]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred,y_test)

0.986244019138756