In [38]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import chardet

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
messages = pd.read_csv("/content/Spam_Book1.csv")
messages.columns = ['label', 'message']
messages.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [40]:
#Data cleaning and preprocessing

# Use While Stemming else comment out
ps = PorterStemmer()

# # Use While Lemmatization else comment out
# wordnet=WordNetLemmatizer()

corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()

    # Use While Stemming else comment out
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]

    # # Use While Lemmatization else comment out
    # review = [wordnet.lemmatize(word) for word in review if not word in stopwords.words('english')]

    review = ' '.join(review)
    corpus.append(review)
print(corpus[:5])

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat', 'ok lar joke wif u oni', 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli', 'u dun say earli hor u c alreadi say', 'nah think goe usf live around though']


In [41]:
# Use while Creating the Bag of Words model else comment out
cv = CountVectorizer(max_features=5000)

# # Use while Creating the TF-IDF model else comment out
# cv = TfidfVectorizer()

X = cv.fit_transform(corpus).toarray()

y0=pd.get_dummies(messages['label'])
print(y0)
y=y0.iloc[:,1].values
print(y)

      ham  spam
0       1     0
1       1     0
2       0     1
3       1     0
4       1     0
...   ...   ...
5567    0     1
5568    1     0
5569    1     0
5570    1     0
5571    1     0

[5572 rows x 2 columns]
[0 0 1 ... 0 0 0]


In [42]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [43]:
# Training model using Naive bayes classifier

spam_detect_model = MultinomialNB().fit(X_train, y_train)
y_pred=spam_detect_model.predict(X_test)

In [45]:
# Confusion Matrix
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)
conf_mat_df = pd.DataFrame(conf_mat, columns=['Predicted Positive', 'Predicted Negative'], index=['Actual Positive', 'Actual Negative'])
print(conf_mat_df)

[[938  11]
 [  6 160]]
                 Predicted Positive  Predicted Negative
Actual Positive                 938                  11
Actual Negative                   6                 160


In [46]:
# Accuracy
acc = accuracy_score(y_test, y_pred)
print(acc)

0.9847533632286996
