<a href="https://colab.research.google.com/github/VirajChetanDesai/FakeNewsDetection/blob/main/Fake_News_Detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Background code for N.A.T.E

In [None]:

import math
import os
import numpy as np
from bs4 import BeautifulSoup as bs
import requests
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from torchtext.vocab import GloVe

import pickle

import requests, io, zipfile
# Download class resources...
r = requests.get("https://www.dropbox.com/s/2pj07qip0ei09xt/inspirit_fake_news_resources.zip?dl=1")
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()

basepath = '.'

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

with open(os.path.join(basepath, 'train_val_data.pkl'), 'rb') as f:
  train_data, val_data = pickle.load(f)

print('Number of train examples:', len(train_data))
print('Number of val examples:', len(val_data))



Number of train examples: 2002
Number of val examples: 309


In [None]:
def get_description_from_html(html):
  soup = bs(html)
  description_tag = soup.find('meta', attrs={'name':'og:description'}) or soup.find('meta', attrs={'property':'description'}) or soup.find('meta', attrs={'name':'description'})
  if description_tag:
    description = description_tag.get('content') or ''
  else:
    description = ''
  return description

def scrape_description(url):
  if not url.startswith('http'):
    url = 'http://' + url
  response = requests.get(url, timeout=10)
  html = response.text
  description = get_description_from_html(html)
  return description

In [None]:
def get_descriptions_from_data(data):
  descriptions = []
  d=''
  for site in tqdm(data):
    descriptions.append(get_description_from_html(site[1]))
  return descriptions


train_descriptions = get_descriptions_from_data(train_data)
train_urls = [url for (url, html, label) in train_data]

  soup = bs(html)
100%|██████████| 2002/2002 [02:07<00:00, 15.68it/s]


In [None]:
val_descriptions = get_descriptions_from_data(val_data)

100%|██████████| 309/309 [00:19<00:00, 15.48it/s]


In [None]:
vectorizer = CountVectorizer(max_features=300)

vectorizer.fit(train_descriptions)

def vectorize_data_descriptions(descriptions, vectorizer):
  X = vectorizer.transform(descriptions).todense()
  return X

print('\nPreparing train data...')
bow_train_X = vectorize_data_descriptions(train_descriptions, vectorizer)
bow_train_y = [label for url, html, label in train_data]

print('\nPreparing val data...')
bow_val_X=vectorize_data_descriptions(val_descriptions, vectorizer)
bow_val_y=[label for url, html, label in val_data]


Preparing train data...

Preparing val data...


In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model = LogisticRegression()

# Directly convert to numpy arrays in the fit method call
model.fit(np.asarray(bow_train_X), bow_train_y)
train_y_pred = model.predict(np.asarray(bow_train_X))
print('Train accuracy:', accuracy_score(bow_train_y, train_y_pred))

# Directly convert to numpy arrays in the predict method call for validation set
val_y_pred = model.predict(np.asarray(bow_val_X))
print('Val accuracy:', accuracy_score(bow_val_y, val_y_pred))

print('')

# Compute precision, recall, and F-score for the positive class
prf = precision_recall_fscore_support(bow_val_y, val_y_pred)
print('Precision:', prf[0][1])  # Precision for the positive class
print('Recall:', prf[1][1])     # Recall for the positive class
print('F-Score:', prf[2][1])    # F-Score for the positive class


Train accuracy: 0.8746253746253746
Val accuracy: 0.6634304207119741

Precision: 0.5844748858447488
Recall: 0.9078014184397163
F-Score: 0.7111111111111111


In [None]:
VEC_SIZE = 300
glove = GloVe(name='6B', dim=VEC_SIZE)

def get_word_vector(word):
    try:
      return glove.vectors[glove.stoi[word.lower()]].numpy()
    except KeyError:
      return None

.vector_cache/glove.6B.zip: 862MB [02:38, 5.42MB/s]                           
100%|█████████▉| 399999/400000 [00:52<00:00, 7601.21it/s]


In [None]:
def glove_transform_data_descriptions(descriptions):
    X = np.zeros((len(descriptions), VEC_SIZE))
    for i, description in enumerate(descriptions):
        found_words = 0.0
        description = description.strip()
        for word in description.split():
            vec = get_word_vector(word)
            if vec is not None:
                # Increment found_words and add vec to X[i].
                found_words += 1
                X[i] += vec
        # divide the sum by the number of words added, so we have the
        # average word vector.
        if found_words > 0:
            X[i] /= found_words

    return X

glove_train_X = glove_transform_data_descriptions(train_descriptions)
glove_train_y = [label for (url, html, label) in train_data]

glove_val_X = glove_transform_data_descriptions(val_descriptions)
glove_val_y = [label for (url, html, label) in val_data]

In [None]:
model = LogisticRegression()

model.fit(glove_train_X, glove_train_y)

train_y_pred = model.predict(glove_train_X)
print('Train accuracy', accuracy_score(glove_train_y, train_y_pred))
val_y_pred = model.predict(glove_val_X)
print('Val accuracy', accuracy_score(glove_val_y, val_y_pred))

print('')

prf = precision_recall_fscore_support(glove_val_y, val_y_pred)
print('Precision:', prf[0][1])
print('Recall:', prf[1][1])
print('F-Score:', prf[2][1])

Train accuracy 0.8656343656343657
Val accuracy 0.7702265372168284

Precision: 0.7011494252873564
Recall: 0.8652482269503546
F-Score: 0.7746031746031746


In [None]:
def train_model(train_X, train_y, val_X, val_y):
  model = LogisticRegression(solver='liblinear')
  model.fit(train_X, train_y)

  return model


def train_and_evaluate_model(train_X, train_y, val_X, val_y):
  model = train_model(train_X, train_y, val_X, val_y)


  train_y_pred = model.predict(train_X)
  print('Train accuracy', accuracy_score(train_y, train_y_pred))
  val_y_pred = model.predict(val_X)
  print('Val accuracy', accuracy_score(val_y, val_y_pred))

  print('')

  prf = precision_recall_fscore_support(val_y, val_y_pred)
  print('Precision:', prf[0][1])
  print('Recall:', prf[1][1])
  print('F-Score:', prf[2][1])


  return model

In [None]:
def prepare_data(data, featurizer):
    X = []
    y = []
    for datapoint in data:
        url, html, label = datapoint
        # convert all text in HTML to lowercase, so <p>Hello.</p> is mapped to
        # <p>hello</p>. This will help us later when we extract features from
        # the HTML, as we will be able to rely on the HTML being lowercase.
        html = html.lower()
        y.append(label)

        features = featurizer(url, html)

        # Gets the keys of the dictionary as descriptions, gets the values
        # as the numerical features. Don't worry about exactly what zip does!
        feature_descriptions, feature_values = zip(*features.items())

        X.append(feature_values)

    return X, y, feature_descriptions

# Gets the log count of a phrase/keyword in HTML (transforming the phrase/keyword
# to lowercase).
def get_normalized_count(html, phrase):
    return math.log(1 + html.count(phrase.lower()))

# Returns a dictionary mapping from plaintext feature descriptions to numerical
# features for a (url, html) pair.
def keyword_featurizer(url, html):
    features = {}

    # Same as before.
    features['.com domain'] = url.endswith('.com')
    features['.org domain'] = url.endswith('.org')
    features['.net domain'] = url.endswith('.net')
    features['.info domain'] = url.endswith('.info')
    features['.org domain'] = url.endswith('.org')
    features['.biz domain'] = url.endswith('.biz')
    features['.ru domain'] = url.endswith('.ru')
    features['.co.uk domain'] = url.endswith('.co.uk')
    features['.co domain'] = url.endswith('.co')
    features['.tv domain'] = url.endswith('.tv')
    features['.ac domain'] = url.endswith('.ac')
    features['.ml domain'] = url.endswith('.ml')
    features['.edu domain'] = url.endswith('edu.')



    keywords = ['federal','<ins','potato','trump','ww1','iframe', '<video','prayer', '<source','googlesyndication','client','<audio' ,'biden', 'clinton','sports', 'finance','awesome','high','corruption','fake news','opinion','memes','instagram','riots','save','shortcut','Rahul Gandhi','modi','lower','gotta','gimme','fact','god','holy','game','clinton','jesus','podesta','infowar','bummer','<i>','AdsbyGoogle','Advertisement','senate','whatsapp','feminism','pope','facebook','legalization','wall','weed','dogs','dog','nuclear','war','president','stupid','facebook','sold','drugs','disease','dumb','retard','asshole','comments','comment','help','Tracker','superior','link','fb','finest','nazi','jew','obama','christians','muslim','muslims','claims']

    for keyword in keywords:
      features[keyword + ' keyword'] = get_normalized_count(html, keyword)


    return features

keyword_train_X, train_y, _ = prepare_data(train_data, keyword_featurizer)
keyword_val_X, val_y, _ = prepare_data(val_data, keyword_featurizer)
train_and_evaluate_model(keyword_train_X, train_y, keyword_val_X, val_y)


'''
combined_train_X = combine_features([keyword_train_X, bow_train_X])
combined_val_X = combine_features([keyword_val_X, bow_val_X])
train_and_evaluate_model(combined_train_X, train_y, combined_val_X, val_y)

'''



Train accuracy 0.9365634365634365
Val accuracy 0.8705501618122977

Precision: 0.8258064516129032
Recall: 0.9078014184397163
F-Score: 0.8648648648648649


'\ncombined_train_X = combine_features([keyword_train_X, bow_train_X])\ncombined_val_X = combine_features([keyword_val_X, bow_val_X])\ntrain_and_evaluate_model(combined_train_X, train_y, combined_val_X, val_y)\n\n'

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Assuming train_descriptions and val_descriptions are defined elsewhere
vectorizer = CountVectorizer(max_features=300)
vectorizer.fit(train_descriptions)

def vectorize_data_descriptions(data_descriptions, vectorizer):
    X = vectorizer.transform(data_descriptions).todense()
    return np.asarray(X)  # Convert matrix to ndarray

# Vectorize the data
bow_train_X = vectorize_data_descriptions(train_descriptions, vectorizer)
bow_val_X = vectorize_data_descriptions(val_descriptions, vectorizer)

# Assuming train_and_evaluate_model and train_y, val_y are defined elsewhere
train_and_evaluate_model(bow_train_X, train_y, bow_val_X, val_y)


Train accuracy 0.8746253746253746
Val accuracy 0.6634304207119741

Precision: 0.5844748858447488
Recall: 0.9078014184397163
F-Score: 0.7111111111111111


In [None]:
VEC_SIZE = 300
glove = GloVe(name='6B', dim=VEC_SIZE)

def get_word_vector(word):
    try:
      return glove.vectors[glove.stoi[word.lower()]].numpy()
    except KeyError:
      return None

def glove_transform_data_descriptions(descriptions):
    X = np.zeros((len(descriptions), VEC_SIZE))
    for i, description in enumerate(descriptions):
        found_words = 0.0
        description = description.strip()
        for word in description.split():
            vec = get_word_vector(word)
            if vec is not None:
                # Increment found_words and add vec to X[i].
                found_words += 1
                X[i] += vec
        # divide the sum by the number of words added, so we have the
        # average word vector.
        if found_words > 0:
            X[i] /= found_words

    return X



glove_train_X = glove_transform_data_descriptions(train_descriptions)
glove_val_X = glove_transform_data_descriptions(val_descriptions)

train_and_evaluate_model(glove_train_X, train_y, glove_val_X, val_y)


Train accuracy 0.8656343656343657
Val accuracy 0.7702265372168284

Precision: 0.7011494252873564
Recall: 0.8652482269503546
F-Score: 0.7746031746031746


In [None]:
def combine_features(X_list):
  return np.concatenate(X_list, axis=1)

combined_train_X = combine_features([keyword_train_X, bow_train_X, glove_train_X])
combined_val_X = combine_features([keyword_val_X, bow_val_X, glove_val_X])

model = train_and_evaluate_model(combined_train_X, train_y, combined_val_X, val_y)


Train accuracy 0.9560439560439561
Val accuracy 0.889967637540453

Precision: 0.8451612903225807
Recall: 0.9290780141843972
F-Score: 0.8851351351351351


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

vectorizer = CountVectorizer()
vectorizer.fit(train_descriptions)

def vectorize_data_descriptions(descriptions, vectorizer):
    X = vectorizer.transform(descriptions).todense()
    return np.asarray(X)

print('\nPreparing train data...')
bow_train_X = vectorize_data_descriptions(train_descriptions, vectorizer)
bow_train_y = [label for url, html, label in train_data]

print('\nPreparing val data...')
bow_val_X = vectorize_data_descriptions(val_descriptions, vectorizer)
bow_val_y = [label for url, html, label in val_data]

train_and_evaluate_model(bow_train_X, bow_train_y, bow_val_X, bow_val_y)



Preparing train data...

Preparing val data...
Train accuracy 0.8761238761238761
Val accuracy 0.7087378640776699

Precision: 0.6231884057971014
Recall: 0.9148936170212766
F-Score: 0.7413793103448276


In [None]:
def prepare_data(data, featurizer):
    X = []
    y = []
    for datapoint in data:
        url, html, label = datapoint
        # We convert all text in HTML to lowercase, so <p>Hello.</p> is mapped to
        # <p>hello</p>. This will help us later when we extract features from
        # the HTML, as we will be able to rely on the HTML being lowercase.
        html = html.lower()
        y.append(label)

        features = featurizer(url, html)

        # Gets the keys of the dictionary as descriptions, gets the values
        # as the numerical features. Don't worry about exactly what zip does!
        feature_descriptions, feature_values = zip(*features.items())

        X.append(feature_values)

    return X, y, feature_descriptions

# Gets the log count of a phrase/keyword in HTML (transforming the phrase/keyword
# to lowercase).
def get_normalized_count(html, phrase):
    return math.log(1 + html.count(phrase.lower()))

# Returns a dictionary mapping from plaintext feature descriptions to numerical
# features for a (url, html) pair.
def keyword_featurizer(url, html):
    features = {}

    # Same as before.
    features['.com domain'] = url.endswith('.com')
    features['.org domain'] = url.endswith('.org')
    features['.net domain'] = url.endswith('.net')
    features['.info domain'] = url.endswith('.info')
    features['.org domain'] = url.endswith('.org')
    features['.biz domain'] = url.endswith('.biz')
    features['.ru domain'] = url.endswith('.ru')
    features['.co.uk domain'] = url.endswith('.co.uk')
    features['.co domain'] = url.endswith('.co')
    features['.tv domain'] = url.endswith('.tv')
    features['.news domain'] = url.endswith('.news')
    features['.ac domain'] = url.endswith('.ac')
    features['.ml domain'] = url.endswith('.ml')
    features['.edu domain'] = url.endswith('edu.')


    keywords = ['federal','wtf','marijuania','please','<ins','potato','trump','ww1','iframe', '<video','prayer', '<source','googlesyndication','client','<audio' ,'biden', 'clinton','sports', 'finance','awesome','high','corruption','fake news','opinion','memes','instagram','riots','save','shortcut','Rahul Gandhi','modi','lower','gotta','gimme','fact','god','holy','game','clinton','jesus','podesta','infowar','bummer','<i>','AdsbyGoogle','Advertisement','senate','whatsapp','feminism','pope','facebook','legalization','wall','weed','dogs','dog','nuclear','war','president','stupid','facebook','sold','drugs','disease','dumb','retard','asshole','comments','comment','help','Tracker','superior','link','fb','finest','nazi','jew','obama','christians','muslim','muslims','claims']

    for keyword in keywords:
      features[keyword + ' keyword'] = get_normalized_count(html, keyword)


    return features

keyword_train_X, train_y, _ = prepare_data(train_data, keyword_featurizer)
keyword_val_X, val_y, _ = prepare_data(val_data, keyword_featurizer)
'''
train_and_evaluate_model(keyword_train_X, train_y, keyword_val_X, val_y)
'''


combined_train_X = combine_features([keyword_train_X, bow_train_X])
combined_val_X = combine_features([keyword_val_X, bow_val_X])
train_and_evaluate_model(combined_train_X, train_y, combined_val_X, val_y)



Train accuracy 0.957042957042957
Val accuracy 0.9061488673139159

Precision: 0.8733333333333333
Recall: 0.9290780141843972
F-Score: 0.9003436426116839


In [None]:
#@title Live Fake News Classification Demo { run: "auto", vertical-output: true, display-mode: "both" }

def get_data_pair(url):
  if not url.startswith('http'):
      url = 'http://' + url
  url_pretty = url
  if url_pretty.startswith('http://'):
      url_pretty = url_pretty[7:]
  if url_pretty.startswith('https://'):
      url_pretty = url_pretty[8:]

  # Scrape website for HTML
  response = requests.get(url, timeout=10)
  htmltext = response.text

  return url_pretty, htmltext

curr_url = "google.com" #@param {type:"string"}

url, html = get_data_pair(curr_url)

# Call on the output of *keyword_featurizer* or something similar
# to transform it into a format that allows for concatenation. See
# example below.
def dict_to_features(features_dict):
  X = np.array(list(features_dict.values())).astype('float')
  X = X[np.newaxis, :]
  return X
def featurize_data_pair(url, html):
  # Approach 1.
  keyword_X = dict_to_features(keyword_featurizer(url, html))


  # Approach 2.
  description = get_description_from_html(html)

  bow_X = vectorize_data_descriptions([description], vectorizer)

  # Approach 3.
  '''
  glove_X = glove_transform_data_descriptions([description])
  '''
  X = combine_features([keyword_X, bow_X])

  return X

curr_X = featurize_data_pair(url, html)

model = train_model(combined_train_X, train_y, combined_val_X, val_y)

curr_y = model.predict(curr_X)[0]




if curr_y < 0.5 :
  print(curr_url, 'appears to be real.')
else:
  print(curr_url, 'appears to be fake.')


google.com appears to be real.


In [None]:

with open(os.path.join(basepath, 'test_data.pkl'), 'rb') as f:
  test_data = pickle.load(f)

model = train_model(combined_train_X, train_y, combined_val_X, val_y)

print('Loading test data...')
test_X = []
for url, html, label in test_data:
  curr_X = np.array(featurize_data_pair(url, html))
  test_X.append(curr_X[0])

test_X = np.array(test_X)

test_y = [label for url, html, label in test_data]

print('Done loading test data...')

test_y_pred = model.predict(test_X)

print('Test accuracy', accuracy_score(test_y, test_y_pred))

print('Confusion matrix:')
print(confusion_matrix(test_y, test_y_pred))

prf = precision_recall_fscore_support(test_y, test_y_pred)

print('Precision:', prf[0][1])
print('Recall:', prf[1][1])
print('F-Score:', prf[2][1])

Loading test data...
Done loading test data...
Test accuracy 0.7886178861788617
Confusion matrix:
[[ 83  51]
 [  1 111]]
Precision: 0.6851851851851852
Recall: 0.9910714285714286
F-Score: 0.8102189781021897


# Runtime code for N.A.T.E

Example
https://www.dazeddigital.com/life-culture/article/60896/1/lab-tests-prove-that-the-alien-corpses-are-real-if-you-want-them-to-be



In [None]:
#@title :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: N.A.T.E :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: { run: "auto", vertical-output: true, display-mode: "form" }

import time
import sys
'''
def my_except_hook(exctype, value, traceback):
        print('There has been an error in the system')
sys.excepthook = my_except_hook
'''
print('Hello I am N.A.T.E')
print('I am here to help you discern fake news websites from real news websites...')
time.sleep(2)
print('')
print('Copy and paste the website url here!')



def get_data_pair(url):
  if not url.startswith('http'):
      url = 'http://' + url
  url_pretty = url
  if url_pretty.startswith('http://'):
      url_pretty = url_pretty[7:]
  if url_pretty.startswith('https://'):
      url_pretty = url_pretty[8:]
  # Scrape website for HTML
  response = requests.get(url, timeout=10)
  htmltext = response.text

  return url_pretty, htmltext

curr_url = input(str(' '))
print('')
url, html = get_data_pair(curr_url)

# Call on the output of *keyword_featurizer* or something similar
# to transform it into a format that allows for concatenation. See
# example below.
def dict_to_features(features_dict):
  X = np.array(list(features_dict.values())).astype('float')
  X = X[np.newaxis, :]
  return X
def featurize_data_pair(url, html):
  # Approach 1.
  keyword_X = dict_to_features(keyword_featurizer(url, html))


  # Approach 2.
  description = get_description_from_html(html)

  bow_X = vectorize_data_descriptions([description], vectorizer)

  # Approach 3.
  '''
  glove_X = glove_transform_data_descriptions([description])
  '''
  X = combine_features([keyword_X, bow_X])

  return X

curr_X = featurize_data_pair(url, html)

model = train_model(combined_train_X, train_y, combined_val_X, val_y)

curr_y = model.predict(curr_X)[0]

print('processing...')
time.sleep(0.5)
print('')
print('As far as I can detect the content in the url: ')

if curr_y < 0.5 :
  print(curr_url, 'appears to be real.')
else:
  print(curr_url, 'appears to be fake.')

time.sleep(3)
print('')
print('                           Always stay vigilant for fake news...')
time.sleep(1)
print('                                 And for now... Goodbye!')

Hello I am N.A.T.E
I am here to help you discern fake news websites from real news websites...

Copy and paste the website url here!
 https://www.dazeddigital.com/life-culture/article/60896/1/lab-tests-prove-that-the-alien-corpses-are-real-if-you-want-them-to-be

processing...

As far as I can detect the content in the url: 
https://www.dazeddigital.com/life-culture/article/60896/1/lab-tests-prove-that-the-alien-corpses-are-real-if-you-want-them-to-be appears to be fake.

                           Always stay vigilant for fake news...
                                 And for now... Goodbye!
