In [135]:
import numpy as np
import pandas as pd
import nltk
import re, string
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from scipy import spatial
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity



In [119]:
#reading the fake and true datasets
fake_news = pd.read_csv('D:\\bits\\sem4\\project_dataset\\Fake.csv')
true_news = pd.read_csv('D:\\bits\\sem4\\project_dataset\\True.csv')

#Target variable for fake news
fake_news['output']=0

#Target variable for true news
true_news['output']=1
#Concatenating and dropping for fake news
fake_news['news']=fake_news['title']+fake_news['text']
fake_news=fake_news.drop(['title', 'text'], axis=1)

#Concatenating and dropping for true news
true_news['news']=true_news['title']+true_news['text']
true_news=true_news.drop(['title', 'text'], axis=1)

#Rearranging the columns
fake_news = fake_news[['subject', 'date', 'news','output']]
true_news = true_news[['subject', 'date', 'news','output']]

#Removing links and the headline from the date column
fake_news=fake_news[~fake_news.date.str.contains("http")]
fake_news=fake_news[~fake_news.date.str.contains("HOST")]

#Converting the date to datetime format
fake_news['date'] = pd.to_datetime(fake_news['date'])
true_news['date'] = pd.to_datetime(true_news['date'])

#combine all news
frames = [fake_news, true_news]
news_dataset = pd.concat(frames)
news_dataset

#Creating a copy 
clean_news=news_dataset.copy()

# define punctuation
punctuations = '''!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~'''

''

def review_cleaning(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(punctuations), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('mr.', 'mr', text)
    text = re.sub('u.s.', 'usa', text)
    return text

In [120]:
clean_news['news']=clean_news['news'].apply(lambda x:review_cleaning(x))
clean_news.head()

Unnamed: 0,subject,date,news,output
0,News,2017-12-31,donald trump sends out embarrassing new year’...,0
1,News,2017-12-31,drunk bragging trump staffer started rusaan c...,0
2,News,2017-12-30,sheriff david clarke becomes an internet joke...,0
3,News,2017-12-29,trump is so obsessed he even has obama’s name...,0
4,News,2017-12-25,pope francis just called out donald trump dur...,0


In [121]:
sentence = list(clean_news['news'][1])

In [122]:
sentence

[' drunk bragging trump staffer started rusaan collusion investigationhouse intelligence committee chairman devin nunes is going to have a bad day. he s been under the assumption like many of us that the christopher steeledossier was what prompted the rusaa investigation so he s been lashing out at the department of justice and the fbi in order to protect trump. as it happens the dossier is not what started the investigation according to documents obtained by the new york times.former trump campaign adviser george papadopoulos was drunk in a wine bar when he revealed knowledge of rusaan opposition research on hillary clinton.on top of that papadopoulos wasn t just a covfefe boy for trump as his administration has alleged. he had a much larger role but none so damning as being a drunken fool in a wine bar. coffee boys  don t help to arrange a new york meeting between trump and president abdel fattah elsisi of egypt two months before the election. it was known before that the former aide

In [123]:
sentences = []
for s in clean_news['news'][1]:
  sentences.append(sent_tokenize(s))

sentences = [y for x in sentences for y in x] # flatten list

In [124]:
sentences

[' drunk bragging trump staffer started rusaan collusion investigationhouse intelligence committee chairman devin nunes is going to have a bad day.', 'he s been under the assumption like many of us that the christopher steeledossier was what prompted the rusaa investigation so he s been lashing out at the department of justice and the fbi in order to protect trump.', 'as it happens the dossier is not what started the investigation according to documents obtained by the new york times.former trump campaign adviser george papadopoulos was drunk in a wine bar when he revealed knowledge of rusaan opposition research on hillary clinton.on top of that papadopoulos wasn t just a covfefe boy for trump as his administration has alleged.', 'he had a much larger role but none so damning as being a drunken fool in a wine bar.', 'coffee boys  don t help to arrange a new york meeting between trump and president abdel fattah elsisi of egypt two months before the election.', 'it was known before that 

In [127]:
# remove punctuations, numbers and special characters
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")


  


In [129]:
clean_sentences[1]

'he s been under the assumption like many of us that the christopher steeledossier was what prompted the rusaa investigation so he s been lashing out at the department of justice and the fbi in order to protect trump '

In [130]:
stop_words = stopwords.words('english')
# function to remove stopwords
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new
# remove stopwords from the sentences
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

In [132]:
clean_sentences[1]

'assumption like many us christopher steeledossier prompted rusaa investigation lashing department justice fbi order protect trump'

In [133]:
# Extract word vectors
word_embeddings = {}
f = open('D:\\bits\\sem4\\project_dataset\\glove.6B.50d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [134]:
sentence_vectors = []
for i in clean_sentences:
  if len(i) != 0:
    v = sum([word_embeddings.get(w, np.zeros((50,))) for w in i.split()])/(len(i.split())+0.001)
  else:
    v = np.zeros((50,))
  sentence_vectors.append(v)

In [137]:
# similarity matrix
sim_mat = np.zeros([len(sentences), len(sentences)])
for i in range(len(sentences)):
  for j in range(len(sentences)):
    if i != j:
      sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,50), sentence_vectors[j].reshape(1,50))[0,0]

In [138]:
import networkx as nx

nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [141]:
ranked_sentences

[(0.033444490016717406, 'usa military to accept transgender recruits on monday pentagonwashington reuters  transgender people will be allowed for the first time to enlist in the usa military starting on monday as ordered by federal courts the pentagon said on friday after president donald trump’s administration decided not to appeal rulings that blocked his transgender ban.'), (0.03321784561799195, 'trump said on twitter at the time that the military “cannot be burdened with the tremendous medical costs and disruption that transgender in the military would entail.” four federal judges  in baltimore washington d.c. seattle and riverside california  have issued rulings blocking trump’s ban while legal challenges to the republican president’s policy proceed.'), (0.033211447098183064, 'two federal appeals courts one in washington and one in virginia last week rejected the administration’s requsa to put on hold orders by lower court judges requiring the military to begin accepting transgend

In [142]:
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
# Extract top 10 sentences as the summary
for i in range(3):
  print(ranked_sentences[i][1])

usa military to accept transgender recruits on monday pentagonwashington reuters  transgender people will be allowed for the first time to enlist in the usa military starting on monday as ordered by federal courts the pentagon said on friday after president donald trump’s administration decided not to appeal rulings that blocked his transgender ban.
trump said on twitter at the time that the military “cannot be burdened with the tremendous medical costs and disruption that transgender in the military would entail.” four federal judges  in baltimore washington d.c. seattle and riverside california  have issued rulings blocking trump’s ban while legal challenges to the republican president’s policy proceed.
two federal appeals courts one in washington and one in virginia last week rejected the administration’s requsa to put on hold orders by lower court judges requiring the military to begin accepting transgender recruits on jan. .
