In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
data = pd.read_excel("textdata.xlsx")

In [4]:
data

Unnamed: 0,Sentence,Label
0,I love spending time with my family.,Positive
1,The weather today is absolutely beautiful.,Positive
2,"This movie is amazing, I highly recommend it.",Positive
3,He is always so kind and helpful.,Positive
4,My vacation was ruined by the terrible hotel s...,Negative
5,The food at that restaurant was awful.,Negative
6,She was very disappointed with the product qua...,Negative
7,I had a horrible experience with their custome...,Negative
8,I'm thrilled about the new job opportunity.,Positive
9,The traffic was unbearable this morning.,Negative


In [5]:
vectorizer = CountVectorizer()

In [6]:
X = vectorizer.fit_transform(data["Sentence"])

In [7]:
X

<20x90 sparse matrix of type '<class 'numpy.int64'>'
	with 132 stored elements in Compressed Sparse Row format>

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_tr, X_te, y_tr, y_te = train_test_split(X, data["Label"], test_size=0.2, random_state=33)

In [10]:
X_tr

<16x90 sparse matrix of type '<class 'numpy.int64'>'
	with 105 stored elements in Compressed Sparse Row format>

In [11]:
X_te

<4x90 sparse matrix of type '<class 'numpy.int64'>'
	with 27 stored elements in Compressed Sparse Row format>

In [12]:
y_tr

5     Negative
0     Positive
11    Negative
4     Negative
19    Negative
1     Positive
14    Positive
10    Positive
12    Positive
16    Negative
13    Negative
6     Negative
3     Positive
9     Negative
2     Positive
7     Negative
Name: Label, dtype: object

In [13]:
from sklearn.naive_bayes import MultinomialNB

In [14]:
mnb = MultinomialNB()

In [15]:
mnb.fit(X_tr, y_tr)

In [16]:
y_pred = mnb.predict(X_te)

In [17]:
from sklearn.metrics import accuracy_score, classification_report

In [18]:
accuracy_score(y_te, y_pred)

0.25

In [19]:
mnb.score(X_tr, y_tr)

1.0

In [20]:
text = ["It was wonderful experience looking at sun in the morning"]

In [21]:
t = vectorizer.transform(text)
t

<1x90 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [22]:
mnb.predict(t)

array(['Negative'], dtype='<U8')

In [23]:
text2 = ["I love that movie, it was beautiful experience"]
t2 = vectorizer.transform(text2)
mnb.predict(t2)

array(['Positive'], dtype='<U8')

In [27]:
df = pd.read_csv("news.csv")

In [28]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [29]:
df.drop(["Unnamed: 0", "title"], axis=1, inplace=True)

In [30]:
df.head()

Unnamed: 0,text,label
0,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,It's primary day in New York and front-runners...,REAL


In [35]:
X = df.drop("label", axis=1)
y = df["label"]

In [36]:
X.head()

Unnamed: 0,text
0,"Daniel Greenfield, a Shillman Journalism Fello..."
1,Google Pinterest Digg Linkedin Reddit Stumbleu...
2,U.S. Secretary of State John F. Kerry said Mon...
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T..."
4,It's primary day in New York and front-runners...


In [37]:
text_vect = CountVectorizer()

In [40]:
X1 = text_vect.fit_transform(df["text"])

In [41]:
X1

<6335x67659 sparse matrix of type '<class 'numpy.int64'>'
	with 2158282 stored elements in Compressed Sparse Row format>

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X1, df["label"], train_size=0.75, random_state=24)

In [45]:
X_train

<4751x67659 sparse matrix of type '<class 'numpy.int64'>'
	with 1623450 stored elements in Compressed Sparse Row format>

In [46]:
y_train

5529    FAKE
5485    FAKE
3530    REAL
4497    REAL
3777    FAKE
        ... 
2193    REAL
3473    REAL
343     REAL
899     FAKE
4514    REAL
Name: label, Length: 4751, dtype: object

In [47]:
multi_NB = MultinomialNB()

In [48]:
multi_NB.fit(X_train, y_train)

In [49]:
y_pred2 = multi_NB.predict(X_test)

In [51]:
accuracy_score(y_test, y_pred2)

0.8888888888888888

In [52]:
txt = ["Killing Obama administration rules, dismantling Obamacare and pushing through tax reform are on the early to-do list."]

In [54]:
new_txt = text_vect.transform(txt)

In [55]:
multi_NB.predict(new_txt)

array(['REAL'], dtype='<U4')

In [56]:
txt2 = ["Lawmakers appeared hopeful that the standoff was nearing an end after the senator behind it told colleagues that he would work to find a solution"]

In [57]:
new_txt2 = text_vect.transform(txt2)

In [58]:
multi_NB.predict(new_txt2)

array(['REAL'], dtype='<U4')