In [85]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re,collections
from math import log
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score, f1_score
from sklearn.cluster import KMeans

In [40]:
df_real = pd.read_csv('True.csv')
df_real['RealNews?'] = True
df_fake = pd.read_csv('Fake.csv')
df_fake['RealNews?'] = False
df = df_real.append(df_fake)

In [41]:
df.columns

Index(['title', 'text', 'subject', 'date', 'RealNews?'], dtype='object')

In [42]:
df['document']=df[['title','text']].agg(' '.join,axis=1).apply(lambda x:x.lower())
df['text']=df['text'].apply(lambda x:x.lower())

# Question 1

In [43]:
# CountVectorizer produces a feature matrix of token counts for text. 
tf_vectorizer = CountVectorizer(stop_words='english')
x = tf_vectorizer.fit_transform(df['text'])
#fit the lda model with 10 topics
lda = LatentDirichletAllocation(n_components=10,random_state=0)
lda.fit(x)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [44]:
#print the top_n words for each topic
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)

In [45]:
#print the topics of lda model
print("\nTopics in LDA model:")
n_top_words = 20
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: trump said republican house president senate republicans tax washington reuters white obama congress senator new democrats donald democratic year plan
Topic #1: police said year people city old told killed officers man family years arrested according death attack shooting group black officer
Topic #2: said state million 000 election government year money billion company new according percent companies reuters federal city public states vote
Topic #3: court said law trump president states obama federal order supreme immigration justice judge administration department united case legal ban climate
Topic #4: trump said russia russian president fbi clinton intelligence investigation campaign house news election information committee comey security washington director department
Topic #5: clinton party hillary election percent said democratic voters sanders new campaign vote poll political presidential candidate support democrats polls year
Topic #6: trump pe

In [46]:
print(lda.components_.shape)
len(tf_feature_names)

(10, 121690)


121690

## Most of the topics are related to politics. Topics such as fbi investigation, law on climate change, tax plan are found by the lda model. Its a good representation of real-world topics

# Question 2

In [47]:
#randomly sample 5 real news and 5 fake news
df_sampled = df[df['RealNews?']==True].sample(n=5).append(df[df['RealNews?']==False].sample(n=5))

In [48]:
df_sampled

Unnamed: 0,title,text,subject,date,RealNews?,document
20627,Trump visit to Britain still unfixed nine mont...,london (reuters) - nine months after prime min...,worldnews,"September 8, 2017",True,trump visit to britain still unfixed nine mont...
8423,Clinton leads Trump by eight points: Reuters/I...,new york (reuters) - democratic presidential c...,politicsNews,"August 19, 2016",True,clinton leads trump by eight points: reuters/i...
18904,Iraq sends delegation to Iran 'to coordinate m...,baghdad (reuters) - a top ranking delegation f...,worldnews,"September 27, 2017",True,iraq sends delegation to iran 'to coordinate m...
19052,China busts underground bank in Guangzhou: Chi...,shanghai (reuters) - chinese police have broke...,worldnews,"September 26, 2017",True,china busts underground bank in guangzhou: chi...
15771,Myanmar sees 'bad consequences' if U.S. impose...,yangon (reuters) - proposed u.s. sanctions tar...,worldnews,"November 3, 2017",True,myanmar sees 'bad consequences' if u.s. impose...
15507,https://100percentfedup.com/video-hillary-aske...,https://100percentfedup.com/video-hillary-aske...,politics,https://100percentfedup.com/video-hillary-aske...,False,https://100percentfedup.com/video-hillary-aske...
22173,BIGGER THAN SNOWDEN: Wikileaks ‘Vault 7’ Class...,he who controls the spice controls the univer...,US_News,"March 13, 2017",False,bigger than snowden: wikileaks ‘vault 7’ class...
19886,WHOA! DEMOCRATIC Strategist Gives Crooked Hill...,"the most unpopular, deplorable woman in americ...",left-news,"Oct 2, 2016",False,whoa! democratic strategist gives crooked hill...
5524,Sean Hannity Just Made A Bizarre Implication ...,sean hannity made a bizarre implication during...,News,"July 10, 2016",False,sean hannity just made a bizarre implication ...
20061,WHY TRUMP SUPPORTERS ARE LAUGHING After WikiLe...,fans of #crookedhillary are not going to like ...,left-news,"Aug 28, 2016",False,why trump supporters are laughing after wikile...


## Predict on test documents

In [56]:
#get topics for each document
x_sampled = tf_vectorizer.transform(df_sampled['text'])
topics_prob = lda.transform(x_sampled)
#print(topics_prob.shape)
docs_topics = np.argmax(topics_prob,axis=1)
for i in range(len(docs_topics)):
    print("document {} belongs to topic {}".format(i+1, docs_topics[i]))

document 1 belongs to topic 8
document 2 belongs to topic 5
document 3 belongs to topic 9
document 4 belongs to topic 9
document 5 belongs to topic 9
document 6 belongs to topic 6
document 7 belongs to topic 4
document 8 belongs to topic 5
document 9 belongs to topic 6
document 10 belongs to topic 4


## topic 9(US and North Korea nuclear war)is prevalent in real news articles. topics 4 and 6 are prevalent in fake news articles.

# Question 3

In [58]:
df_train, df_test = train_test_split(df, test_size=0.2, shuffle=True)
print(df_train.shape,df_test.shape)


(35918, 6) (8980, 6)


In [68]:
# CountVectorizer produces a feature matrix of token counts for text. 
tf_vectorizer = CountVectorizer(stop_words='english')
x_train = tf_vectorizer.fit_transform(df_train['text'])
#fit the lda model with 10 topics
lda = LatentDirichletAllocation(n_components=10,random_state=0)
lda.fit(x_train)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [69]:
#get lda vectors for train and test docs
x_train = lda.transform(x_train)
x_test = tf_vectorizer.transform(df_test['text'])
x_test = lda.transform(x_test)
y_train,y_test= df_train['RealNews?'],df_test['RealNews?']
print("x_train shape:",x_train.shape)
print("y_train shape:",y_train.shape)
print("x_test shape:",x_test.shape)
print("y_test shape:",y_test.shape)


x_train shape: (35918, 10)
y_train shape: (35918,)
x_test shape: (8980, 10)
y_test shape: (8980,)


In [70]:
#train the logistic regression clf
lr = LogisticRegression(random_state=0)
lr.fit(x_train,y_train)

#prediction
y_pred = lr.predict(x_test)
y_prob = lr.predict_proba(x_test)[:,1]

#model evaluation
print("Logistic regression model performance on lda vectors:")
print("Accuracy score is {}".format(accuracy_score(y_test,y_pred)))
print("Precision score is {}".format(precision_score(y_test, y_pred)))
print("Recall score is {}".format(recall_score(y_test, y_pred)))
print("F1 score is {}".format(f1_score(y_test, y_pred)))

print("Area Under the Curve(ROC curve) is {}".format(roc_auc_score(y_test,y_prob)))
tn, fp, fn, tp = confusion_matrix(y_test,y_pred).ravel()

print("true negative", tn)
print("false positive", fp)
print("false negative", fn)
print("true positive", tp)
specificity = tn / (tn+fp)
print("specificity is {}".format(specificity))



Logistic regression model performance on lda vectors:
Accuracy score is 0.884075723830735
Precision score is 0.8751450452541193
Recall score is 0.8823116518483856
F1 score is 0.8787137364557847
Area Under the Curve(ROC curve) is 0.9532763508825242
true negative 4168
false positive 538
false negative 503
true positive 3771
specificity is 0.8856778580535487


In [81]:
#get the most useful topics for classification
topics_importance_score = lr.coef_
useful_topics = np.flip(np.argsort(topics_importance_score))
print("useful topics ranked from most useful to least useful", useful_topics)
print("scores for corresponding topics",topics_importance_score)

useful topics ranked from most useful to least useful [[2 3 9 1 4 0 8 6 7 5]]
scores for corresponding topics [[ 1.38789583  2.53022141  5.22121393  4.06925796  1.45932434 -7.8030476
  -2.16746468 -6.43829936 -1.63321257  3.21623314]]


## The top 3 most useful topics in classification of real or fake news is topic 2, topic 3 and topic 9.

# Question 4

In [82]:
#take only real news
df_real = df[df['RealNews?']==True]

In [84]:
#get the lda vectors for real news docs
# CountVectorizer produces a feature matrix of token counts for text. 
tf_vectorizer = CountVectorizer(stop_words='english')
x = tf_vectorizer.fit_transform(df_real['text'])
#fit the lda model with 10 topics
lda = LatentDirichletAllocation(n_components=10,random_state=0)
lda.fit(x)
x = lda.transform(x)
print("shape of real news lda vectors is", x.shape)

shape of real news lda vectors is (21417, 10)


In [101]:
#cluster the docs using k-means
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(x)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

In [103]:
#get 5 documents from each cluster
y_pred = kmeans.labels_
n_clusters=10
#stores docs index for each cluster 
docs_index_by_clusters=[]
for i in range(n_clusters):
    #get all docs index for each cluster
    docs_index = np.where(y_pred==i)[0]
    #consider only 5 docs for each cluster
    docs_index = docs_index[:5]
    docs_index_by_clusters.append(docs_index)
print(docs_index_by_clusters)    
    

[array([0, 4, 5, 7, 8]), array([ 919, 1227, 1345, 1349, 1351]), array([ 15,  34,  66, 175, 178]), array([ 2,  3,  6,  9, 10]), array([43, 47, 61, 64, 65]), array([348, 363, 504, 571, 765]), array([ 33, 101, 105, 121, 128]), array([ 1, 18, 20, 21, 22]), array([ 24, 452, 456, 461, 485]), array([447, 448, 549, 552, 572])]


In [106]:
#print the titles of documents belonging to each cluster to find similarity
for i in range(len(docs_index_by_clusters)):
    print("document titles belonging to cluster",i)
    print("----------------------------------------")
    docs_indexes = docs_index_by_clusters[i]
    for j in docs_indexes:
        print(df_real.iloc[j]['title'])
    print("-------------------------------------")    
        

document titles belonging to cluster 0
----------------------------------------
As U.S. budget fight looms, Republicans flip their fiscal script
Trump wants Postal Service to charge 'much more' for Amazon shipments
White House, Congress prepare for talks on spending, immigration
Factbox: Trump on Twitter (Dec 29) - Approval rating, Amazon
Trump on Twitter (Dec 28) - Global Warming
-------------------------------------
document titles belonging to cluster 1
----------------------------------------
North Korean defector pushes diplomatic solution in U.S. Congress
North Korea not ready to meet with South Korea in Russia: agencies
Turkey urges U.S. to review visa suspension as lira, stocks tumble
Turkey's Erdogan says U.S. decision to suspend visa services 'upsetting'
Turkey summons U.S. consulate worker for questioning: Anadolu
-------------------------------------
document titles belonging to cluster 2
----------------------------------------
Virginia officials postpone lottery drawing t

## There is some similarity between documents in each cluster. For example, docs in cluster 9 are about US opposition on Myanmar's ethnic cleansing, cluster 2 is about elections 