In [None]:
# Standard library
import pickle
import re
import warnings

# Third-party
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Fetch the NLTK corpora needed for stop-word removal and lemmatization
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Silence all warnings for the rest of the notebook
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# Load the tab-separated SMS corpus; column names are supplied explicitly.
df = pd.read_csv(
    r"/content/SMSSpamCollection.csv",
    sep='\t',
    names=['label', 'message'],
    encoding='utf-8',
)

In [None]:
# Preview the first rows to confirm the label/message columns parsed as expected
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# (rows, columns) of the loaded frame
df.shape

(5572, 2)

In [None]:
# WordNet lemmatizer instance, reused for every token in the preprocessing loop
lamitizer = WordNetLemmatizer()

# 1. Data Preprocessing
Clean the data by removing punctuation and other non-letter characters, removing stopwords, tokenizing with ".split()", and lemmatizing each token.

In [None]:
# Build the stop-word set once: this avoids calling stopwords.words('english')
# for every token and makes each membership test a constant-time lookup.
english_stopwords = set(stopwords.words('english'))

train = []  # cleaned messages, one string per row of df, in the original row order
for message in df['message']:
    # Replace every character that is NOT an ASCII letter with a space.
    # The caret must be inside the brackets ('[^a-zA-Z]'); written as '^[a-zA-Z]'
    # it anchors to the start of the string and only blanks a leading letter,
    # leaving punctuation and digits in the text.
    review = re.sub('[^a-zA-Z]', ' ', message)
    review = review.lower()
    review = review.split()  # whitespace tokenization
    # Drop stop words, then lemmatize what remains
    review = [lamitizer.lemmatize(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    train.append(review)

In [None]:
# Spot-check the first cleaned message
print(train[0])

jurong point, crazy.. available bugis n great world la e buffet... cine got amore wat...


# 2. Feature Extraction

Feature extraction by converting text to vectors using the bag-of-words model

In [None]:
# Bag-of-words vectorizer with a vocabulary capped at 5000 features
cv = CountVectorizer(max_features=5000)

# Learn the vocabulary from the cleaned messages, then densify the count matrix
bow_counts = cv.fit_transform(train)
x = bow_counts.toarray()

Now that the features (`x`) are ready, we need the target `y`, which comes from the "label" column.
The labels are the strings 'ham' and 'spam', which the model cannot use directly.
So we use "pandas.get_dummies(df['label'])", which creates a DataFrame with one column per label ('ham' and 'spam'), each holding 1 where the row has that label and 0 otherwise.

In [None]:
# One-hot encode the label column: one indicator column per distinct label.
# dtype=int pins the indicators to 0/1; without it, newer pandas versions
# return boolean (True/False) columns instead.
y = pd.get_dummies(df['label'], dtype=int)  # y = f(x): the target of the supervised model
y.head()

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


We only need the 'spam' column, because the task is to decide whether a message is spam or not, so we use 'y = y.iloc[:,1].values'

In [None]:
# Keep only the second indicator column and convert it to a NumPy array
spam_indicator = y.iloc[:, 1]
y = spam_indicator.values
print(y)

[0 0 1 ... 0 0 0]


**Train/test split of x**
test = 20% of x
train = 80% of x
The same split is applied to y

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,
)

Train a Multinomial Naive Bayes model, a common choice for text classification on word-count features

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Multinomial Naive Bayes classifier with default hyperparameters
mb = MultinomialNB()
# Fit on the training split; as the last expression, the fitted estimator is displayed
mb.fit(X=train_x, y=train_y)


MultinomialNB()

PREDICTION

In [None]:
# Predicted labels for the held-out test features
pred = mb.predict(X=test_x)
print(pred)

[0 1 0 ... 0 1 0]


CONFUSION MATRIX

In [None]:
# confusion_matrix expects (y_true, y_pred). With the arguments in that order,
# rows are the true labels and columns the predicted labels; passing them the
# other way round transposes the matrix and swaps false positives with false negatives.
cm = confusion_matrix(test_y, pred)
print(cm)

[[944   7]
 [ 11 153]]


ACCURACY CHECK

In [None]:
# accuracy_score expects (y_true, y_pred); pass the arguments in the documented order.
# The value is the same either way because accuracy only counts matching positions.
acc = accuracy_score(test_y, pred)
acc

0.9838565022421525