In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from functools import reduce
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct
import ftfy
import re


In [2]:
#First three datasets come from https://www.kaggle.com/datasets/stevenpeutz/misinformation-fake-news-text-dataset-79k

In [3]:
fn1 = pd.read_csv('DataSet_Misinfo_FAKE.csv', index_col=0)

In [4]:
fn1.head()

Unnamed: 0,text
0,Donald Trump just couldn t wish all Americans ...
1,House Intelligence Committee Chairman Devin Nu...
2,"On Friday, it was revealed that former Milwauk..."
3,"On Christmas day, Donald Trump announced that ..."
4,Pope Francis used his annual Christmas Day mes...


In [5]:
#Adding a column to label it as real news or fake news as the datasets themselves were not labeled excpt for filename

In [6]:
fn1['real'] = 0

In [7]:
fn1.head(2)

Unnamed: 0,text,real
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0


In [8]:
fn1['text'][0][:300]

'Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger a'

In [9]:
tn1 = pd.read_csv('DataSet_Misinfo_TRUE.csv', index_col=0)

In [10]:
tn1.head(2)

Unnamed: 0,text
0,The head of a conservative Republican faction ...
1,Transgender people will be allowed for the fir...


In [11]:
tn1['real'] = 1

In [12]:
tn1.head(2)

Unnamed: 0,text,real
0,The head of a conservative Republican faction ...,1
1,Transgender people will be allowed for the fir...,1


In [13]:
tn1['text'][0][:300]

'The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S.'

In [14]:
rus_news = pd.read_csv('EXTRA_RussianPropagandaSubset.csv', index_col=0)

In [15]:
rus_news.head(2)

Unnamed: 0,text
0,Ukraine has put itself in a situation when ext...
1,Regardless who was behind the recent attack on...


In [16]:
#Russian propaganda extra dataset from Kaggle, again fake and needs labelled

In [17]:
rus_news['real'] = 0

In [18]:
rus_news.head(2)

Unnamed: 0,text,real
0,Ukraine has put itself in a situation when ext...,0
1,Regardless who was behind the recent attack on...,0


In [19]:
rus_news['text'][0][:300]

'Ukraine has put itself in a situation when external forces dictate how it should solve its internal problems. Kyiv expected that its Western allies will put pressure on Russia in the Normandy Four format. However, the allies realised that their confrontation with Russia will cost them more. Emmanuel'

In [20]:
# Remaining datasets come from https://www.kaggle.com/datasets/emineyetm/fake-news-detection-datasets

In [21]:
fn2 = pd.read_csv('FAKE.csv')

In [22]:
fn2.head(2)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"


In [23]:
fn2['real'] = 0

In [24]:
fn2.head(2)

Unnamed: 0,title,text,subject,date,real
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0


In [25]:
fn2.isnull().values.any()

False

In [26]:
#As datasets have close duplicates, These should be removed

In [27]:
fn2['text'][0][:300]

'Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger a'

In [28]:
tn2 = pd.read_csv('TRUE.csv')

In [29]:
tn2.head(2)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"


In [30]:
tn2['real'] = 1

In [31]:
tn2.head(2)

Unnamed: 0,title,text,subject,date,real
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1


In [32]:
#Concerning that the datasets seem to have to first articles as nearly identical. 

In [33]:
tn2['text'][0][:300]

'WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way '

In [34]:
#Combining the first three datasets as they have similar columns.

In [35]:
news_df = reduce(lambda  left,right: pd.merge(left,right,on=['text', 'real'],
                                            how='outer'), [fn1, tn1, rus_news])


In [36]:
#This checks the first three datasets for empty values. Surprisingly there are some. I will eliminate them

In [37]:
news_df.isnull().values.any()

True

In [38]:
null_counts = news_df.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

text    32
dtype: int64

In [39]:
news_df.dropna(inplace = True)


In [40]:
news_df.isnull().values.any()

False

In [41]:
#Inserting these columns for ease of the reduce method of joining dataframes. 

In [42]:
news_df['subject'] = np.nan
news_df['title'] = np.nan
news_df['date'] = np.nan

In [43]:
news_df.isnull().values.any()

True

In [44]:
news_df = news_df.fillna('None')

In [45]:
#Combining all the data

In [46]:
news_df = reduce(lambda  left,right: pd.merge(left,right,on=['text', 'real', 'subject', 'title', 'date'],
                                            how='outer'), [news_df, tn2, fn2])

In [47]:
news_df.head(5)

Unnamed: 0,text,real,subject,title,date
0,Donald Trump just couldn t wish all Americans ...,0,,,
1,House Intelligence Committee Chairman Devin Nu...,0,,,
2,"On Friday, it was revealed that former Milwauk...",0,,,
3,"On Christmas day, Donald Trump announced that ...",0,,,
4,Pope Francis used his annual Christmas Day mes...,0,,,


In [48]:
#Obviously there are duplicates, what might be a good cut off to eliminate the duplicates. Also are the datasets basically the
#same? Might be a limitation on Fake News training in general. 
#Partial was chosen as topics may be similar and reword certain ideas, but different news sources. 

In [49]:
#Traditional fuzzy matching was taking too long and 
#code adapted from https://towardsdatascience.com/fuzzy-matching-at-scale-84f2bfd0c536 and
#https://bergvca.github.io/2017/10/14/super-fast-string-matching.html
#citation for ftfy: Robyn Speer. (2021). ftfy (Version 6.0).


In [50]:
def ngrams(string, n = 3):
    string = ftfy.fix_text(string)
    string = string.encode('ascii', errors='ignore').decode()
    string = string.lower()
    chars_to_remove = [')',')','.','|','[',']','{','}',"'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string)
    string = string.replace('&', 'and')
    string = string.replace(',',' ')
    string = string.replace('-', ' ')
    string = string.replace('\n', '')
    string = string.title()
    string = re.sub(' +',' ',string).strip()
    string = ' '+ string +' '
    #string = re.sub(r'[,-./]|\sBD',r'', string) (test this later)
    ngrams = zip(*[string[i:40] for i in range (n)])
    return [''.join(ngram) for ngram in ngrams]

In [51]:
news_arts = news_df['text'].unique()
vectorizer = TfidfVectorizer(min_df=1, analyzer = ngrams)
tf_idf_matrix = vectorizer.fit_transform(news_arts)

In [52]:
#print(tf_idf_matrix)

In [53]:
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)
    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)
    return csr_matrix((data,indices,indptr),shape=(M,N))

In [54]:
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(),1000,.9)


In [55]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                          'right_side': right_side,
                           'similairity': similairity})

In [56]:
#Nothing below 1.0 seems to be an exact match, lets look at it sorted

In [57]:
matches_df = get_matches_df(matches, news_df['text'], top=1000)
matches_df = matches_df[matches_df['similairity'] < 0.99999]
matches_df.sample(20)

Unnamed: 0,left_side,right_side,similairity
87,Former reality show star Donald Trump has repe...,Former reality show star Donald Trump launched...,0.941648
573,Former Attorney General Eric Holder responded ...,"Former Attorney General Eric Holder, who serve...",0.932853
72,Former reality show star Donald Trump just can...,Former reality show star Donald Trump has alwa...,0.943554
6,House Intelligence Committee Chairman Devin Nu...,"The private investigator, Robert Mueller, had ...",0.952208
247,White House Chief of Staff John Kelly is a Fou...,"A Birmingham, Alabama ABC affiliate sought out...",0.978108
921,It doesn t take a rocket scientist to know tha...,It doesn t take a rocket scientist to figure o...,0.962476
249,White House Chief of Staff John Kelly is a Fou...,They call and promise (to join the European Un...,0.911876
685,Donald Trump s conflicts of interest were of m...,Donald Trump s conflicts of interest are final...,0.951586
241,White House Chief of Staff John Kelly is a Fou...,DUBAI (Reuters) - Iran said on Wednesday it ha...,0.978108
245,White House Chief of Staff John Kelly is a Fou...,-Onions -Red peppers \nNotice they are all hea...,0.978108


In [58]:
#It seems that we can reasonably assume that less than 1.0 are not duplicates

In [59]:
matches_df.sort_values(['similairity'], ascending=False).head(10)


Unnamed: 0,left_side,right_side,similairity
540,Milwaukee County Sheriff David Clarke seems li...,NATO intends to turn the entire country of Mon...,0.989785
539,Milwaukee County Sheriff David Clarke seems li...,Brussels and Washington accuse Russia of inter...,0.989785
247,White House Chief of Staff John Kelly is a Fou...,"A Birmingham, Alabama ABC affiliate sought out...",0.978108
241,White House Chief of Staff John Kelly is a Fou...,DUBAI (Reuters) - Iran said on Wednesday it ha...,0.978108
246,White House Chief of Staff John Kelly is a Fou...,Countering Hillary’s Coup With a Counter Coup ...,0.978108
245,White House Chief of Staff John Kelly is a Fou...,-Onions -Red peppers \nNotice they are all hea...,0.978108
244,White House Chief of Staff John Kelly is a Fou...,By LAURA MOWAT \nLocals are fuming because the...,0.978108
243,White House Chief of Staff John Kelly is a Fou...,BNI Store Nov 6 2016 Like a good little sharia...,0.978108
242,White House Chief of Staff John Kelly is a Fou...,The U.S. Justice Department on Monday said it ...,0.978108
5,House Intelligence Committee Chairman Devin Nu...,"If this keeps up, the Democrats will never win...",0.972838


In [60]:
#I am inclined for simplicity sake to remove all with a similarity of 1.0, but lets look at these first 5
#There are 226 'exact matches' much less than I was fearful of at the beginning
#But I have to remember that this is a sample of the dataset. And that this is potentially 25% of my data.
# Lets run the original 

In [61]:
matches_df = get_matches_df(matches, news_df['text'], top=1000)
matches_df = matches_df[matches_df['similairity'] ==1.0]
matches_df.sample(10)

Unnamed: 0,left_side,right_side,similairity
707,As her husband Donald Trump makes Twitter thre...,As her husband Donald Trump makes Twitter thre...,1.0
489,"Donald Trump, Jr. went before Congress today t...","Donald Trump, Jr. went before Congress today t...",1.0
505,Ever since Donald Trump rode down the escalato...,Ever since Donald Trump rode down the escalato...,1.0
319,Has anyone else noticed that the government so...,Has anyone else noticed that the government so...,1.0
794,No one takes Donald Trump seriously anymore. N...,No one takes Donald Trump seriously anymore. N...,1.0
718,There are no two crazy people ever mentioned m...,There are no two crazy people ever mentioned m...,1.0
110,Senator Jeff Flake (R-AZ) is liberated now tha...,ISTANBUL (Reuters) - U.S. Secretary of State R...,1.0
913,Because there s totally nothing else better fo...,Because there s totally nothing else better fo...,1.0
227,Donald Trump s Twitter habit is dangerous. Whe...,Donald Trump s Twitter habit is dangerous. Whe...,1.0
483,Donald Trump s administration finds new ways t...,Donald Trump s administration finds new ways t...,1.0


In [62]:
matches_df.head()

Unnamed: 0,left_side,right_side,similairity
9,"On Christmas day, Donald Trump announced that ...","On Christmas day, Donald Trump announced that ...",1.0
13,In the wake of yet another court decision that...,In the wake of yet another court decision that...,1.0
14,Many people have raised the alarm regarding th...,Many people have raised the alarm regarding th...,1.0
25,Senate Majority Whip John Cornyn (R-TX) though...,Senate Majority Whip John Cornyn (R-TX) though...,1.0
27,"In this #METOO moment, many powerful men are b...","In this #METOO moment, many powerful men are b...",1.0


In [63]:
matches_df.tail()

Unnamed: 0,left_side,right_side,similairity
984,Donald Trump just refuses to admit that most A...,Donald Trump just refuses to admit that most A...,1.0
988,Donald Trump and his vice-president are eager ...,Donald Trump and his vice-president are eager ...,1.0
990,Star Trek legend George Takei gloriously shred...,Star Trek legend George Takei gloriously shred...,1.0
993,"By now, everyone is used to the rank ignorance...","By now, everyone is used to the rank ignorance...",1.0
994,We all know that the GOP is the party of anti-...,We all know that the GOP is the party of anti-...,1.0


In [64]:
#It seems reasonable to assume that anything identified as similarity of 1.0 is indeed a duplicate. 
#Now to remove them. 
#I have to be careful as these were a sample of my datasets. 
#Let's confirm I don't have more than 8000 duplicates

In [65]:
matches_df = get_matches_df(matches, news_df['text'], top=8000)
matches_df = matches_df[matches_df['similairity'] ==1.0]
matches_df.reset_index(inplace = True)

In [66]:
if matches_df['left_side'][0] == news_df['text'][0]:
    print(True)

In [67]:
#2278 out of 130274 are duplicates
#Less than 2 percent, that is perfectly reasonable given that we are working with a few data sets. 
#Let's remove these from news_df

In [68]:
news_df_s = news_df.sample(n = 1000)
news_df_s.reset_index(inplace = True)


In [69]:
x = len(matches_df)
dups = []
for i in range(1000):
    for j in range(x):
        if news_df_s['text'][i] is matches_df['left_side'][j]:
            print(i,j)
            dups.append(i)


36 1324
98 1672
123 138
136 939
281 1601
462 820
565 1624
664 1037


In [70]:
news_df_s['text'][59][:100]

'OSLO (Reuters) - Backers of a global accord to fight climate change that formally comes into force o'

In [71]:
matches_df['left_side'][1005][:100]

'Following a Boston judge s refusal to renew the temporary stay on Trump s Muslim ban, a federal judg'

In [72]:
news_df_s['text'][79][:100]

'Hillary s such a champion of gay rights that her campaign is being funded by countries who throw gay'

In [73]:
matches_df['left_side'][500][:100]

'White House press secretary Sean Spicer literally denied the meaning of alleged president Donald Tru'

In [74]:
news_df_s['text'][436][:100]

'Republican presidential candidates are attacking President Obama’s plan to use his Oval Office power'

In [75]:
matches_df['left_side'][1156][:100]

'Israeli Prime Minister Benjamin Netanyahu threw a temper tantrum like a petulant child because the U'

In [76]:
news_df_s['text'][489][:100]

'The UN condemned the persecution of the Ukrainian Orthodox Church and concluded that there is a "rel'

In [77]:
#matches_df['left_side'][2087][:100]

In [78]:
news_df_s['text'][588][:100]

'WASHINGTON (Reuters) - A quest by Republicans to open Alaska’s Arctic National Wildlife Reserve was '

In [79]:
# This for loop seems to adequately find the indices of the duplicates lets do this and check on the full dataset. 


In [80]:
x = len(matches_df)
y = len(news_df)
verify = []
dups = []
for i in range(y):
    for j in range(x):
        if news_df['text'][i] is matches_df['left_side'][j]:
            verify.append(j)
            dups.append(i)

In [84]:
#verify is just the i of dups I want to look at some of the duplicates in dups e.g. 623

In [85]:
dups.index(623)

142

In [86]:
#code adapted from https://datagy.io/python-list-find-all-index/
def find_indices(list_to_check, item_to_find):
    indices = []
    for idx, value in enumerate(list_to_check):
        if value == item_to_find:
            indices.append(idx)
    return indices

print(find_indices(dups,623))

[142, 143, 144, 145, 146, 147, 148]


In [87]:
news_df['text'][623][:100]

'The current occupant of the White House has been quiet about his campaign promise to combat the opio'

In [88]:
matches_df['left_side'][142][:100]

'The current occupant of the White House has been quiet about his campaign promise to combat the opio'

In [89]:
matches_df['left_side'][143][:100]

'The current occupant of the White House has been quiet about his campaign promise to combat the opio'

In [90]:
matches_df['left_side'][144][:100]

'The current occupant of the White House has been quiet about his campaign promise to combat the opio'

In [91]:
matches_df['left_side'][145][:100]

'The current occupant of the White House has been quiet about his campaign promise to combat the opio'

In [92]:
matches_df['left_side'][146][:100]

'The current occupant of the White House has been quiet about his campaign promise to combat the opio'

In [93]:
matches_df['left_side'][147][:100]

'The current occupant of the White House has been quiet about his campaign promise to combat the opio'

In [94]:
matches_df['left_side'][148][:100]

'The current occupant of the White House has been quiet about his campaign promise to combat the opio'

In [95]:
#Everything seems to checkout so I am going to go ahead and drop the found duplicates

In [96]:
dups = list(set(dups))
len(dups)

1344

In [97]:
news_df.drop(index = dups, inplace = True)

In [98]:
news_df.to_pickle('News_df.pkl')