In [None]:
# Mount Google Drive so that the files referenced below under /content/drive
# (the dataset CSV and the stop-word list) can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
! pip install psutil -q
! pip install pandarallel -q
! pip install -U sentence-transformers -q

# Importing Libraries

* psutil and pandarallel for parallel processing of a dataframe
* tqdm for tracking progress
* Sentence Transformers for generating embeddings
* Pandas and NumPy for dataframe processing

In [None]:
import psutil
# One pandarallel worker per CPU reported by psutil.
workers = psutil.cpu_count()

# from pandarallel.utils import progress_bars
# progress_bars.is_notebook_lab = lambda : True

from pandarallel import pandarallel
# Must run before any `.parallel_apply(...)` call further down.
# use_memory_fs=False: data is exchanged with the workers through pipes
# (see the INFO lines printed by this cell).
pandarallel.initialize(progress_bar=True, nb_workers=workers, use_memory_fs=False)

from tqdm.auto import tqdm
# Hook tqdm into pandas (enables the `progress_*` variants of apply/map).
tqdm.pandas()

from sentence_transformers import SentenceTransformer ,  util
import torch, spacy, string, re, pickle
from collections import Counter
import pandas as pd
import numpy as np

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# **Creating the Lexicon Using WELFake**

Defining the cleaning/preprocessing pipeline.

In [None]:
def lower(text):
  """Return `text` converted to lowercase."""
  lowered = text.lower()
  return lowered

def remove_urls(text):
    """Delete every http(s):// or www. URL (up to the next whitespace) from `text`."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

def remove_nonascii(sent):
  """Return `sent` with every non-ASCII character dropped."""
  ascii_chars = filter(str.isascii, sent)
  return "".join(ascii_chars)

def remove_punctuations(text):
  """Return `text` with every punctuation character deleted.

  Anything that is neither a word character nor whitespace is removed.
  The underscore is listed explicitly because `\\w` matches it, so the
  plain `[^\\w\\s]` class would let it through.
  """
  return re.sub(r'[^\w\s]|_', '', text)

def remove_num(text):
  """Return `text` without any character for which `str.isdigit()` is true."""
  kept = []
  for char in text:
    if char.isdigit():
      continue
    kept.append(char)
  return "".join(kept)

def remove_mul_space(text):
  """Collapse each whitespace run in `text` to one space and trim both ends."""
  pieces = text.split()
  return " ".join(pieces)

def clean(text):
  """Run the whole preprocessing pipeline over `text` and return the result.

  Steps, in order: lowercase, strip URLs, drop non-ASCII characters,
  strip punctuation, drop digits, collapse whitespace.
  """
  pipeline = (
    lower,
    remove_urls,
    remove_nonascii,
    remove_punctuations,
    remove_num,
    remove_mul_space,
  )
  for step in pipeline:
    text = step(text)
  return text

Importing the WELFake dataset for lexicon creation

In [None]:
# Read the simplified WELFake CSV from the shared Drive and print its schema.
welfake_csv_path = r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/WELFake/WELFake_Simplified.csv"
df = pd.read_csv(welfake_csv_path)
df.info()

In [None]:
# ---- Stage 1: split the dataset by label and clean the text ----
print("-----------------")
print("| Cleaning Data |")
print("-----------------")
print()

# Deep copies, so the per-class edits below never write back into `df`.
# Rows with label 0 are treated as fake news, rows with label 1 as true news.
df_fake = df.loc[df['label'] == 0].copy(deep=True)
df_true = df.loc[df['label'] == 1].copy(deep=True)

for heading, frame in (("True News", df_true), ("Fake News", df_fake)):
  print(heading)
  print("=========")
  frame['total_text'] = frame['total_text'].parallel_apply(clean)
  print()

print("Done !!!\n")

# ---- Stage 2: replace every token by its lemma ----
print("--------------------")
print("| Lemmatizing Data |")
print("--------------------")
print()

# Only the lemmas are needed here, so the other pipeline components are disabled.
nlp_lemmatize = spacy.load("en", disable = ['parser', 'ner', 'tagger', 'textcat'])

for heading, frame in (("True News", df_true), ("Fake News", df_fake)):
  print(heading)
  print("=========")
  frame["total_text"] = frame["total_text"].parallel_apply(
    lambda row: " ".join(w.lemma_ for w in nlp_lemmatize(row))
  )
  print()

print("Done !!!\n")

# Plain Python lists of the processed articles, one string per article.
true_data = df_true['total_text'].tolist()
fake_data = df_fake['total_text'].tolist()

In [None]:
# Report how many articles ended up in each class.
for kind, articles in (('true', true_data), ('fake', fake_data)):
  print('Total number of {} articles is {}'.format(kind, len(articles)))

In [None]:
def _count_ngram_totals(documents):
  """Return the total number of (monograms, bigrams, trigrams) in `documents`.

  Each document is split on whitespace. A document of `n` words contributes
  `n` monograms, `n - 1` bigrams and `n - 2` trigrams, floored at zero so
  that empty or very short documents cannot subtract from the totals.
  """
  monograms = bigrams = trigrams = 0
  for text in documents:
    n_words = len(text.split())
    monograms += n_words
    bigrams += max(n_words - 1, 0)
    trigrams += max(n_words - 2, 0)
  return monograms, bigrams, trigrams


# Corpus-wide n-gram totals; generateLexicon reads these module-level names
# as the denominators of its occurrence scores.
fake_monograms, fake_bigrams, fake_trigrams = _count_ngram_totals(fake_data)
true_monograms, true_bigrams, true_trigrams = _count_ngram_totals(true_data)

In [None]:
# Part-of-speech tags that mark a token as a proper noun.
_PROPER_NOUN_TAGS = ('NNP', 'NNPS')

# Location of the stop-word list (one word per line).
_STOP_WORDS_PATH = "/content/drive/Shareddrives/FYP - knk/Resources/SMART_STOP_WORDS.txt"


def _load_stop_words(path):
  """Read one stop word per line from `path` and return them as a set."""
  with open(path, "r") as f:
    # Here we remove the apostrophe as well (presumably so the entries match
    # text whose punctuation has already been stripped).
    return {line.strip().replace("'", "") for line in f}


def _strip_stop_words(documents, stop_words):
  """Remove every token found in `stop_words` from each document, in place."""
  from tqdm import trange

  for i in trange(len(documents)):
    documents[i] = " ".join(w for w in documents[i].split() if w not in stop_words)


def _drop_proper_nouns(nlp, documents):
  """Return one token list per document, without NNP/NNPS-tagged tokens."""
  from tqdm import tqdm

  disabled = ["tok2vec", "parser", "ner", "textcat", "attribute_ruler", "lemmatizer"]
  kept = []
  for doc in tqdm(nlp.pipe(documents, n_process=2, disable=disabled), total=len(documents)):
    kept.append([str(token) for token in doc if token.tag_ not in _PROPER_NOUN_TAGS])
  return kept


def _score_ngrams(token_lists, k, common, total_ngrams):
  """Score the `common` most frequent k-grams of one corpus.

  Returns a dict mapping each k-gram (tokens joined by one space) to
  `(occurrence_score, document_score)`: its occurrence count divided by
  `total_ngrams`, and the fraction of documents that contain it.
  """
  from collections import Counter
  from nltk.util import ngrams
  from tqdm import tqdm

  occurrences, doc_frequency = Counter(), Counter()
  for tokens in tqdm(token_lists):
    # N-grams are built per document so none spans two documents.
    grams_in_doc = list(ngrams(tokens, k))
    occurrences.update(grams_in_doc)
    # A set, so each document is counted at most once per k-gram.
    doc_frequency.update(set(grams_in_doc))

  n_docs = float(len(token_lists))
  return {
    " ".join(gram): (count / float(total_ngrams), doc_frequency[gram] / n_docs)
    for gram, count in occurrences.most_common(common)
  }


def generateLexicon(fake_data, true_data, max_length, common, grams):
  """Build a lexicon of frequent n-grams scored on the fake and true corpora.

  Pipeline: remove stop words, drop proper nouns (spaCy tags NNP/NNPS), then
  for every n-gram size in `grams` take the `common` most frequent n-grams of
  each corpus and record an occurrence score and a document score for them.

  Parameters
  ----------
  fake_data, true_data : list of str
      One whitespace-tokenised article per element. NOTE: both lists are
      modified in place (stop words are removed from their elements).
  max_length : int
      Assigned to `nlp.max_length`.
  common : int
      Number of most frequent n-grams kept per corpus and per n-gram size.
  grams : iterable of int
      N-gram sizes to extract; each must be 1, 2 or 3.

  Returns
  -------
  pandas.DataFrame or None
      Columns `word`, `true_occ_score`, `true_doc_score`, `fake_occ_score`,
      `fake_doc_score`. A score is 0 when the n-gram was not among the
      `common` most frequent ones of that corpus. `None` is returned (after
      printing a message) when spaCy cannot be imported.

  Raises
  ------
  ValueError
      If `grams` contains a size other than 1, 2 or 3.

  Notes
  -----
  The occurrence-score denominators are the module-level totals
  `fake_monograms`, `fake_bigrams`, `fake_trigrams` and their `true_*`
  counterparts, which must be defined before this function is called.
  """
  grams = list(grams)
  unsupported = [k for k in grams if k not in (1, 2, 3)]
  if unsupported:
    raise ValueError("Unsupported n-gram sizes {}; expected 1, 2 or 3.".format(unsupported))

  print("---------------------")
  print("| Configuring SpaCy |")
  print("---------------------")
  print()

  # Only a missing spaCy is reported this way; any other missing package
  # raises its own ImportError instead of being mislabelled.
  try:
    import spacy
  except ImportError:
    print("SpaCy is not installed/ cannot be found.")
    return None

  import pandas as pd

  # NOTE(review): the "en" shortcut is a spaCy v2 convention; newer releases
  # expect the full package name -- confirm the installed version.
  nlp = spacy.load("en", disable = ['parser','ner', 'lemmatizer', 'textcat'])
  nlp.max_length = max_length

  print("Done !!!\n")

  print("-------------------")
  print("| Generating Data |")
  print("-------------------")
  print()

  # A set gives constant-time membership tests in the filtering loops.
  stop = _load_stop_words(_STOP_WORDS_PATH)
  _strip_stop_words(fake_data, stop)
  _strip_stop_words(true_data, stop)

  print("\nDone !!!\n")

  print("----------------------------------")
  print("| Filtering out the Proper Nouns |")
  print("----------------------------------")
  print()

  fake_tokens = _drop_proper_nouns(nlp, fake_data)
  true_tokens = _drop_proper_nouns(nlp, true_data)

  print("\nDone !!!\n")

  print("----------------------------------")
  print("| Extracting and Scoring N-grams |")
  print("----------------------------------")
  print()

  # n-gram size -> (heading, fake denominator, true denominator)
  totals = {
    1: ("Monograms", fake_monograms, true_monograms),
    2: ("Bigrams", fake_bigrams, true_bigrams),
    3: ("Trigrams", fake_trigrams, true_trigrams),
  }

  dictionary = {}
  for k in grams:
    title, fake_length, true_length = totals[k]

    print(title)
    print("=" * len(title))
    print()

    corpora = (
      ('fake', fake_tokens, fake_length),
      ('true', true_tokens, true_length),
    )
    for prefix, token_lists, length in corpora:
      for term, (occ_score, doc_score) in _score_ngrams(token_lists, k, common, length).items():
        entry = dictionary.setdefault(term, {})
        entry[prefix + '_occ_score'] = occ_score
        entry[prefix + '_doc_score'] = doc_score

    # '-PRON-' is a lemmatizer placeholder for pronouns, not a real word.
    dictionary.pop('-PRON-', None)

    print("\nDone !!!\n")

  print("----------------------")
  print("| Creating Dataframe |")
  print("----------------------")
  print()

  columns = ['word', 'true_occ_score', 'true_doc_score', 'fake_occ_score', 'fake_doc_score']
  rows = [
    {
      'word': term,
      'true_occ_score': scores.get('true_occ_score', 0),
      'true_doc_score': scores.get('true_doc_score', 0),
      'fake_occ_score': scores.get('fake_occ_score', 0),
      'fake_doc_score': scores.get('fake_doc_score', 0),
    }
    for term, scores in dictionary.items()
  ]
  df = pd.DataFrame(rows, columns=columns)

  print("\nDone !!!\n")

  print("-------------------")
  print("| Created Lexicon |")
  print("-------------------")
  print()

  print("Details")
  print("========")
  print()
  # DataFrame.info() prints its report itself and returns None.
  df.info()

  print("\nSample")
  print("=======")
  print()
  print(df.head(10))

  return df

In [None]:
# Lexicon-generation settings.
MAX_DOC_LEN_IN_CHAR = 1000000  # forwarded as `max_length`
MOST_COMMON_WORDS = 1000       # forwarded as `common`
N_GRAMS = [1]                  # forwarded as `grams`

lexicon = generateLexicon(
  fake_data,
  true_data,
  max_length=MAX_DOC_LEN_IN_CHAR,
  common=MOST_COMMON_WORDS,
  grams=N_GRAMS,
)