In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Read the headerless two-column CSV; the first field is the message text and the
# second its sentiment label. Passing `names` already implies header=None, it is
# spelled out here so the reader does not have to know that rule.
msg = pd.read_csv(
    'naivetext.csv',
    header=None,
    names=['message', 'label'],
)

In [4]:
# Preview the first rows to confirm the message/label columns parsed as expected.
msg.head()

Unnamed: 0,message,label
0,I love this sandwich,pos
1,This is an amazing place,pos
2,I feel very good about these beers,pos
3,This is my best work,pos
4,What an awesome view,pos


In [5]:
# Inspect the last rows as well, so both ends of the file are checked.
msg.tail()

Unnamed: 0,message,label
13,I am sick and tired of this place,neg
14,What a great holiday,pos
15,That is a bad locality to stay,neg
16,We will have good fun tomorrow,pos
17,I went to my enemy's house today,neg


In [6]:
# Encode the text labels as integers for the classifier: positive -> 1, negative -> 0.
label_to_num = {'pos': 1, 'neg': 0}
msg['labelnum'] = msg['label'].map(label_to_num)

In [7]:
# Confirm that the new labelnum column is present alongside label.
msg.head()

Unnamed: 0,message,label,labelnum
0,I love this sandwich,pos,1
1,This is an amazing place,pos,1
2,I feel very good about these beers,pos,1
3,This is my best work,pos,1
4,What an awesome view,pos,1


In [8]:
# Same check on the last rows of the frame.
msg.tail()

Unnamed: 0,message,label,labelnum
13,I am sick and tired of this place,neg,0
14,What a great holiday,pos,1
15,That is a bad locality to stay,neg,0
16,We will have good fun tomorrow,pos,1
17,I went to my enemy's house today,neg,0


In [9]:
# Features: the raw message text (vectorized into word counts later).
X = msg['message']

In [10]:
# Display the feature Series (raw message text).
X

0                      I love this sandwich
1                  This is an amazing place
2        I feel very good about these beers
3                      This is my best work
4                      What an awesome view
5             I do not like this restaurant
6                  I am tired of this stuff
7                    I can't deal with this
8                      He is my sworn enemy
9                       My boss is horrible
10                 This is an awesome place
11    I do not like the taste of this juice
12                          I love to dance
13        I am sick and tired of this place
14                     What a great holiday
15           That is a bad locality to stay
16           We will have good fun tomorrow
17         I went to my enemy's house today
Name: message, dtype: object

In [11]:
# Target: the numeric class label derived from the 'pos'/'neg' column.
y = msg['labelnum']

In [12]:
# Display the target Series (1 = 'pos', 0 = 'neg').
y

0     1
1     1
2     1
3     1
4     1
5     0
6     0
7     0
8     0
9     0
10    1
11    0
12    1
13    0
14    1
15    0
16    1
17    0
Name: labelnum, dtype: int64

In [18]:
# Hold out 25% of the messages for evaluation. A fixed random_state makes the
# shuffle, and therefore the vocabulary and every metric computed afterwards,
# reproducible across kernel restarts; without it each run draws a different
# test set and the reported numbers cannot be reproduced.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [14]:
# Bag-of-words vectorizer with default settings; its vocabulary is learned when
# it is fitted on the training messages.
count_vect = CountVectorizer()

In [19]:
# Learn the vocabulary from the training messages only and build their
# document-term (word count) matrix in one step.
xtrain_dtm = count_vect.fit_transform(xtrain)

In [20]:
# Encode the test messages with the vocabulary learned from the training split
# (transform only, the vectorizer is not refitted).
xtest_dtm = count_vect.transform(xtest)

In [21]:
print("Words present in text document")
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() and fall back to the old method
# only on releases that predate it. tolist() turns the returned array into a
# plain list of str so the printed form matches the original list output.
if hasattr(count_vect, "get_feature_names_out"):
    vocabulary_words = count_vect.get_feature_names_out().tolist()
else:
    vocabulary_words = count_vect.get_feature_names()
print(vocabulary_words)

Words present in text document
['about', 'amazing', 'an', 'awesome', 'beers', 'best', 'boss', 'can', 'dance', 'deal', 'do', 'enemy', 'feel', 'fun', 'good', 'have', 'he', 'horrible', 'house', 'is', 'juice', 'like', 'love', 'my', 'not', 'of', 'place', 'restaurant', 'sandwich', 'sworn', 'taste', 'the', 'these', 'this', 'to', 'today', 'tomorrow', 'very', 'view', 'we', 'went', 'what', 'will', 'with', 'work']


In [22]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() and fall back to the old method
# only on releases that predate it.
if hasattr(count_vect, "get_feature_names_out"):
    dtm_columns = count_vect.get_feature_names_out()
else:
    dtm_columns = count_vect.get_feature_names()

# Dense view of the training document-term matrix, one column per vocabulary word.
df = pd.DataFrame(xtrain_dtm.toarray(), columns=dtm_columns)

In [23]:
# Preview the dense document-term matrix: one row per training message,
# one column per vocabulary word.
df.head()

Unnamed: 0,about,amazing,an,awesome,beers,best,boss,can,dance,deal,...,today,tomorrow,very,view,we,went,what,will,with,work
0,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0


In [24]:
# Train a multinomial Naive Bayes classifier (default hyperparameters) on the
# training word counts and their numeric labels.
mclf = MultinomialNB().fit(xtrain_dtm, ytrain)

In [25]:
# Predict class labels (1 = 'pos', 0 = 'neg') for the held-out test messages.
predicted = mclf.predict(xtest_dtm)

In [26]:
# Show the predicted labels for the test messages.
predicted

array([1, 1, 0, 1, 1], dtype=int64)

In [27]:
# Show the true labels of the test split for comparison with the predictions.
ytest

10    1
15    0
6     0
14    1
13    0
Name: labelnum, dtype: int64

In [29]:
# Rows are the true classes and columns the predicted classes, in label order 0, 1.
conf_matrix = metrics.confusion_matrix(ytest, predicted)
print("Confusion Matrix", conf_matrix, sep="\n")

Confusion Matrix
[[1 2]
 [0 2]]


In [30]:
# Fraction of test messages whose predicted label matches the true label.
accuracy = metrics.accuracy_score(ytest, predicted)
print("Accuracy ", accuracy)

Accuracy  0.6


In [31]:
# Of the messages predicted positive (label 1), the share that are truly positive.
precision = metrics.precision_score(ytest, predicted)
print("Precision ", precision)

Precision  0.5


In [32]:
# Of the truly positive messages (label 1), the share the model predicted positive.
recall = metrics.recall_score(ytest, predicted)
print("Recall ", recall)

Recall  1.0


In [33]:
# A single unseen sentence to classify, wrapped in a list because the
# vectorizer's transform() takes an iterable of documents, not a bare string.
newText = ["my boss is best"]

In [34]:
# Echo the input sentence for reference.
newText

['my boss is best']

In [37]:
# Encode the new sentence with the already-fitted vocabulary (no refit).
newText_dtm = count_vect.transform(newText)

In [38]:
# Classify the encoded sentence with the trained Naive Bayes model.
newText_predicted = mclf.predict(newText_dtm)

In [40]:
# Print the predicted class array; per the labelnum mapping, 1 = 'pos' and 0 = 'neg'.
print("Predicted result for newText is ", newText_predicted)

Predicted result for newText is  [0]
