In [2]:
import pandas as pd

# Column names to assign, because the file is read without a header row
features = ['label', 'message']

# Load the tab-separated SMS file into a two-column DataFrame
sms = pd.read_csv(
    'sms.txt',
    sep='\t',
    header=None,
    names=features,
)
# Preview the first rows
sms.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Class balance: how many messages are ham (good) versus spam (bad)
sms['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [4]:
# Encode the text label as an integer target: ham -> 0, spam -> 1
sms['label_num'] = sms['label'].map({
    'ham': 0,
    'spam': 1,
})

In [5]:
# Preview the frame again to confirm the label_num column now sits next to label and message
sms.head()

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [6]:
# Prepare the inputs for machine learning:
# X is the raw message text, y is the numeric label (0 = ham, 1 = spam)
X = sms['message']
y = sms['label_num']
# Number of messages available
X.shape

(5572,)

In [8]:
# import CountVectorizer (it is instantiated, with the default parameters, in the next cell)
from sklearn.feature_extraction.text import CountVectorizer


In [9]:
# The vectorizer is created with its default settings.
# Options that were considered but are NOT applied here:
#   stop_words='english'  - remove English stop words
#   ngram_range=(1, 2)    - include 1-grams and 2-grams, e.g. to tell apart
#                           "Happy", "Not Happy" and "Very Happy"
#   max_df=0.5            - ignore terms that appear in more than 50% of the documents
#   min_df=2              - only keep terms that appear in at least 2 documents
vect_combined = CountVectorizer()


In [10]:
# Learn the vocabulary and build the document-term matrix for every message
# (the train/test split happens in a later cell).
# The raw text is read from sms.message rather than from X so that this cell
# can be re-run safely: X is rebound to the sparse matrix below, and feeding
# that matrix back into fit_transform would fail.
X = vect_combined.fit_transform(sms.message)


In [11]:
# Hold out part of the data for evaluation.
# test_size=0.25 is the default 75% training / 25% test split, spelled out for clarity;
# random_state=1 keeps the shuffle reproducible.
# NOTE(review): the vectorizer was fitted on all messages before this split, so its
# vocabulary includes words seen only in the test set - consider fitting on the
# training messages only.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)


In [12]:
# Shape of the training part of the document-term matrix
X_train.shape

(4179, 8713)

In [13]:
# Shape of the test part of the document-term matrix
X_test.shape

(1393, 8713)

In [14]:
# 1. import the classifier
from sklearn.naive_bayes import MultinomialNB

# 2. instantiate a Multinomial Naive Bayes model with default hyperparameters
nb = MultinomialNB()

# 3. train it on the training split
nb.fit(X_train, y_train)

# 4. calculate the accuracy of class predictions on the test split
test_accuracy = nb.score(X_test, y_test)
print(test_accuracy)





0.9834888729361091


In [15]:
from sklearn import metrics

# Predicted class (0 = ham, 1 = spam) for every test message
y_pred_class = nb.predict(X_test)
# Confusion matrix: rows are the true labels, columns are the predicted labels
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class)

array([[1195,   13],
       [  10,  175]], dtype=int64)

In [None]:
# Try the trained model on a message typed in by the user
message = input("Input sample message:")
# Wrap the text in a list so it can be turned into a one-row DataFrame
data = [message]

In [None]:
# One-row DataFrame holding the user's text in a 'message' column
message_df = pd.DataFrame({'message': data})
message_df

In [None]:
# Vectorize the user's message with the vocabulary learned earlier.
# Pass the 'message' column, not the whole DataFrame: iterating a DataFrame
# yields its column names, so transform(message_df) would vectorize the
# literal string "message" instead of the text the user typed.
message_df_t = vect_combined.transform(message_df['message'])

In [None]:
# Classify the user's message. The result gets its own name so the training
# target `y` is not overwritten; otherwise re-running the train/test split
# after this cell would pick up this one-element prediction array.
message_pred = nb.predict(message_df_t)
print(message_pred[0])

# Report the predicted class: 0 means ham, anything else means spam
if message_pred[0] == 0:
    print('Message is Ham')
else:
    print('Message is spam')