# 4.3.2 Training and testing the "fake news" model with CountVectorizer 

In [6]:
# Import the necessary modules
import pandas as pd
from sklearn.model_selection import train_test_split
# # Import TfidfVectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB 

# Load the labelled news articles.
# `pd.DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0, so
# use `pd.read_csv` instead. `index_col=0` reproduces from_csv's default of
# using the first CSV column as the row index. from_csv also defaulted to
# parse_dates=True; it is deliberately omitted here because the index shown in
# this notebook's output is a plain integer article id, not a date.
df = pd.read_csv('fake_or_real_news.csv', index_col=0)

In [7]:
# The labels (the 'label' column of df) are the prediction target: y
y = df['label']

# Quick look at the first rows of the frame
print(df.head())

# Hold out 33% of the articles for testing; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.33, random_state=53
)

                                                   title  \
8476                        You Can Smell Hillary’s Fear   
10294  Watch The Exact Moment Paul Ryan Committed Pol...   
3608         Kerry to go to Paris in gesture of sympathy   
10142  Bernie supporters on Twitter erupt in anger ag...   
875     The Battle of New York: Why This Primary Matters   

                                                    text label  
8476   Daniel Greenfield, a Shillman Journalism Fello...  FAKE  
10294  Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE  
3608   U.S. Secretary of State John F. Kerry said Mon...  REAL  
10142  — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE  
875    It's primary day in New York and front-runners...  REAL  


Count Vectors

In [8]:
# Initialize a CountVectorizer object that drops English stop words: count_vectorizer
count_vectorizer = CountVectorizer(stop_words="english")

# Learn the vocabulary from the training texts and encode them: count_train
count_train = count_vectorizer.fit_transform(X_train.values)

# Encode the test texts with the vocabulary learned above (transform only,
# no fitting on the test set): count_test
count_test = count_vectorizer.transform(X_test.values)

# Print the first 10 features of the count_vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(count_vectorizer, "get_feature_names_out"):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

# feature_names maps each feature's integer index to its name. Convert to
# plain str so the printed list looks the same whether the names came back
# as a Python list or as a NumPy array.
print([str(name) for name in feature_names[:10]])

['00', '000', '0000', '00000031', '000035', '00006', '0001', '0001pt', '000ft', '000km']


In [9]:
# # Initialize a TfidfVectorizer object: tfidf_vectorizer
# tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)

# # Transform the training data: tfidf_train 
# tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)

# # Transform the test data: tfidf_test 
# # Do NOT fit on the test data: it must be encoded with the vocabulary learned from the training set (words unseen in training are simply ignored)
# tfidf_test = tfidf_vectorizer.transform(X_test.values)

# # Print the first 10 features
# print(tfidf_vectorizer.get_feature_names()[:10])

# # Print the first 5 vectors of the tfidf training data
# print(tfidf_train[:5])

In [10]:
# Build a Multinomial Naive Bayes classifier and train it on the count
# vectors of the training set (fit() returns the fitted estimator): nb_classifier
nb_classifier = MultinomialNB().fit(count_train, y_train)

# Predict a label for every test document: pred
pred = nb_classifier.predict(count_test)

# Evaluate the predictions against the true test labels.
# score: fraction of test documents labelled correctly
# cm: confusion matrix with rows/columns in the order FAKE, REAL
score = metrics.accuracy_score(y_test, pred)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])

print(score)
print(cm)

0.893352462936394
[[ 865  143]
 [  80 1003]]
