In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn.naive_bayes import MultinomialNB

### Load data

In [3]:
df = pd.read_csv("data/fake_or_real_news.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


### Training with sklearn

In [5]:
# Target variable
y = df['label']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], y, test_size = 0.33, random_state = 53)

In [6]:
# Vectorize: CountVectorizer
count_vectorizer = CountVectorizer(stop_words = 'english')
X_train_count_v = count_vectorizer.fit_transform(X_train)
X_test_count_v = count_vectorizer.transform(X_test)

In [7]:
print(count_vectorizer.get_feature_names_out()[:20])

['00' '000' '0000' '00000031' '000035' '00006' '0001' '0001pt' '000ft'
 '000km' '001' '0011' '002' '003' '004' '006' '006s' '007' '007s' '008']


In [8]:
print(X_train_count_v.A[:5])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [9]:
# Vectorize: TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english')
X_train_tfidf_v = tfidf_vectorizer.fit_transform(X_train) 
X_test_tfidf_v = tfidf_vectorizer.transform(X_test) 

In [10]:
print(tfidf_vectorizer.get_feature_names_out()[:20])

['00' '000' '0000' '00000031' '000035' '00006' '0001' '0001pt' '000ft'
 '000km' '001' '0011' '002' '003' '004' '006' '006s' '007' '007s' '008']


In [11]:
print(X_train_tfidf_v.A[:5])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [12]:
#Compare vecotizers
count_df = pd.DataFrame(X_train_count_v.A, columns = count_vectorizer.get_feature_names_out())
tfidf_df = pd.DataFrame(X_train_tfidf_v.A, columns = tfidf_vectorizer.get_feature_names_out())

#print(count_df.head)
#print(tfidf_df.head)

In [13]:
#Inspect differences in columns
difference = set(count_df.columns) - set(tfidf_df.columns)
print(difference)
print(count_df.equals(tfidf_df))

set()
False


### Naive Bayes

In [14]:
# CountVectorizer

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_count_v, y_train)

y_pred = nb_classifier.predict(X_test_count_v)

print(metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.893352462936394
              precision    recall  f1-score   support

        FAKE       0.92      0.86      0.89      1008
        REAL       0.88      0.93      0.90      1083

    accuracy                           0.89      2091
   macro avg       0.90      0.89      0.89      2091
weighted avg       0.89      0.89      0.89      2091



In [15]:
# TF-IDF

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf_v, y_train)

y_pred = nb_classifier.predict(X_test_tfidf_v)


print(metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.8565279770444764
              precision    recall  f1-score   support

        FAKE       0.96      0.73      0.83      1008
        REAL       0.80      0.97      0.88      1083

    accuracy                           0.86      2091
   macro avg       0.88      0.85      0.85      2091
weighted avg       0.88      0.86      0.85      2091



In [16]:
# Test alpha parameter
alphas = np.arange(0,1,0.1)

def test_alphas(alpha):
    nb_classifier = MultinomialNB(alpha = alpha)
    nb_classifier.fit(X_train_tfidf_v, y_train)
    y_pred = nb_classifier.predict(X_test_tfidf_v)
    score = metrics.accuracy_score(y_test, y_pred)
    return score
    
for alpha in alphas:
    test_alphas(alpha)
    print('Alpha: ', alpha)
    print('Score: ', test_alphas(alpha))
    print()

Alpha:  0.0
Score:  0.8813964610234337

Alpha:  0.1
Score:  0.8976566236250598

Alpha:  0.2




Score:  0.8938307030129125

Alpha:  0.30000000000000004
Score:  0.8900047824007652

Alpha:  0.4
Score:  0.8857006217120995

Alpha:  0.5
Score:  0.8842659014825442

Alpha:  0.6000000000000001
Score:  0.874701099952176

Alpha:  0.7000000000000001
Score:  0.8703969392635102

Alpha:  0.8
Score:  0.8660927785748446

Alpha:  0.9
Score:  0.8589191774270684



In [17]:
# TF-IDF, alpha = 0.1

nb_classifier = MultinomialNB(alpha = 0.1)
nb_classifier.fit(X_train_tfidf_v, y_train)

y_pred = nb_classifier.predict(X_test_tfidf_v)


print(metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.8976566236250598
              precision    recall  f1-score   support

        FAKE       0.92      0.86      0.89      1008
        REAL       0.88      0.93      0.90      1083

    accuracy                           0.90      2091
   macro avg       0.90      0.90      0.90      2091
weighted avg       0.90      0.90      0.90      2091



In [18]:
# Inspect

# Get the class labels: class_labels
class_labels = nb_classifier.classes_

# Extract the features: feature_names
feature_names = tfidf_vectorizer.get_feature_names_out()

# Zip the feature names together with the coefficient array and sort by weights: feat_with_weights
feat_with_weights = sorted(zip(nb_classifier.coef_[0], feature_names))

# Print the first class label and the top 20 feat_with_weights entries
print(class_labels[0], feat_with_weights[:20])

# Print the second class label and the bottom 20 feat_with_weights entries
print(class_labels[1], feat_with_weights[-20:])


FAKE [(-12.641778440826338, '0000'), (-12.641778440826338, '000035'), (-12.641778440826338, '0001'), (-12.641778440826338, '0001pt'), (-12.641778440826338, '000km'), (-12.641778440826338, '0011'), (-12.641778440826338, '006s'), (-12.641778440826338, '007'), (-12.641778440826338, '007s'), (-12.641778440826338, '008s'), (-12.641778440826338, '0099'), (-12.641778440826338, '00am'), (-12.641778440826338, '00p'), (-12.641778440826338, '00pm'), (-12.641778440826338, '014'), (-12.641778440826338, '015'), (-12.641778440826338, '018'), (-12.641778440826338, '01am'), (-12.641778440826338, '020'), (-12.641778440826338, '023')]
REAL [(-6.790929954967984, 'states'), (-6.765360557845787, 'rubio'), (-6.751044290367751, 'voters'), (-6.701050756752027, 'house'), (-6.695547793099875, 'republicans'), (-6.670191249042969, 'bush'), (-6.661945235816139, 'percent'), (-6.589623788689861, 'people'), (-6.559670340096453, 'new'), (-6.489892292073902, 'party'), (-6.452319082422527, 'cruz'), (-6.452076515575875, '

