In [181]:
# Packages for Data Wrangling
import numpy as np
import pandas as pd

# Packages for Graphics
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Packages for NLP
import re
from collections import Counter
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# Packages for ML
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline Multinomial Naive Bayes model with no preprocessing, using only the "statement" feature

In [26]:
# Features are the raw statement text; the target is the veracity label
X = liar_df["statement"]
y = liar_df["veracity"]

# Hold out 10% of the rows as a test set (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0)

# Initialize a bag-of-words CountVectorizer with default settings
count_vectorizer = CountVectorizer()

# Learn the vocabulary from the training statements only and build their
# document-term count matrix
count_train = count_vectorizer.fit_transform(X_train)

# Encode the test statements with the vocabulary learned from the training set
count_test = count_vectorizer.transform(X_test)

# Print the first 10 features of the count_vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method so
# the cell still runs on older installations.
if hasattr(count_vectorizer, "get_feature_names_out"):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

# Convert to plain str so the printed list looks the same whichever method
# (ndarray vs. list) supplied the names
print([str(name) for name in feature_names[:10]])

0    Wisconsin is on pace to double the number of layoffs this year.
Name: statement, dtype: object
0    false
Name: veracity, dtype: object
['000', '05', '06', '07', '08', '09', '10', '100', '106', '107']


In [27]:
# Build the Multinomial Naive Bayes baseline and train it on the
# bag-of-words counts of the training statements (fit returns the estimator)
model = MultinomialNB().fit(count_train, y_train)

# Predicted veracity label for every held-out statement
pred = model.predict(count_test)

# Fraction of held-out statements whose predicted label matches the true one
score = metrics.accuracy_score(y_true=y_test, y_pred=pred)
print(score)

# Confusion matrix: rows are true labels, columns are predicted labels,
# both ordered as listed in veracity_labels
veracity_labels = ["true", "mostly-true", "false", "pants-fire"]
cm = metrics.confusion_matrix(y_true=y_test, y_pred=pred, labels=veracity_labels)
print(cm)

0.3670886075949367
[[ 5 12  8  0]
 [ 4 14  6  1]
 [ 3  8 10  0]
 [ 0  3  5  0]]
