# Libraries and Functions

In [None]:
! pip install psutil -q
! pip install pandarallel -q
! pip install -U transformers -q

In [None]:
import psutil
workers = psutil.cpu_count()

# from pandarallel.utils import progress_bars
# progress_bars.is_notebook_lab = lambda : True

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=workers, use_memory_fs=False)
from transformers import AutoTokenizer, AutoModel, DistilBertTokenizerFast

from tqdm.auto import tqdm
tqdm.pandas()
from tqdm import trange 

import torch, spacy, string, re, pickle
from collections import Counter
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

In [None]:
def lower(text):
  """Return `text` converted to lowercase."""
  lowered = text.lower()
  return lowered

def remove_urls(text):
    """Delete every URL-looking token from `text`.

    A token counts as a URL when it starts with `http://`, `https://` or
    `www.`; it is removed up to (not including) the next whitespace
    character, so the whitespace around it is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

def remove_nonascii(sent):
  """Return `sent` with every non-ASCII character dropped."""
  ascii_only = filter(str.isascii, sent)
  return "".join(ascii_only)

def remove_punctuations(text):
  """Delete every character that is neither a word character nor whitespace."""
  not_word_or_space = re.compile(r'[^\w\s]')
  return not_word_or_space.sub('', text)

def remove_num(text):
  """Return `text` without the characters for which `str.isdigit` is true."""
  kept = []
  for ch in text:
    if ch.isdigit():
      continue
    kept.append(ch)
  return "".join(kept)

def remove_mul_space(text):
  """Collapse each run of whitespace into one space and trim both ends."""
  tokens = text.split()
  return " ".join(tokens)

def clean(text):
  """Normalise a raw text string before it is embedded.

  Applies, in order: lowercasing, URL removal, non-ASCII removal and
  whitespace collapsing, and returns the resulting string.
  """
  # remove_punctuations and remove_num are currently not part of the pipeline.
  for step in (lower, remove_urls, remove_nonascii, remove_mul_space):
    text = step(text)

  return text

def convert_label(label):
  """Map a dataset-specific veracity label onto a binary class.

  Returns 0 for the "real" family of labels, 1 for the "fake" family, and
  None for any label that belongs to neither group.
  """
  real_labels = ('true', 'mostly-true', 'half-true', 'real', 'Real', 0, 'REAL')
  fake_labels = ('false', 'pants-fire', 'barely-true', 'fake', 'Fake', 1, 'FAKE')

  if label in real_labels:
    return 0
  return 1 if label in fake_labels else None

# Target device for the model and its inputs: the first CUDA GPU when torch
# reports one is available, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Mean Pooling - average the token embeddings, weighting each token by its attention mask
def mean_pooling(model_output, attention_mask):
    """Return the attention-masked mean of the token embeddings.

    `model_output[0]` is taken to hold the per-token embeddings. The mask is
    broadcast to the same size, masked embeddings are summed over dimension 1
    and divided by the per-position mask total.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_sum = (token_embeddings * mask).sum(1)
    # Clamp so a fully masked row divides by 1e-9 instead of 0.
    mask_total = mask.sum(1).clamp(min=1e-9)
    return weighted_sum / mask_total

# Load the stock 'distilbert-base-uncased' fast tokenizer from the HuggingFace Hub
# and the fine-tuned encoder weights from the shared Drive folder.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# Earlier checkpoint, kept for reference:
# model = AutoModel.from_pretrained(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Kogul_Language_Modelling/Fine tuning WELFake/Fine-tuned Model Improved")
model = AutoModel.from_pretrained(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Kogul_Language_Modelling/Fine tuning WELFake/Fine-tuned Model Improved V3")
# Move the weights to the device chosen above so they match the tokenized inputs.
model = model.to(device)

# Generate Embeddings - returns one mean-pooled embedding per input text
def generate_embeddings(sentences, model, tokenizer):
  """Embed `sentences` with `model` and return the vectors as nested lists.

  The texts are tokenized as one padded, truncated batch of PyTorch tensors
  and moved to the module-level `device`. The model runs without gradient
  tracking, its output is mean-pooled with the attention mask, and the
  result is copied to the CPU and converted to a list of lists.
  """
  batch = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
  batch = batch.to(device)

  with torch.no_grad():
    outputs = model(**batch)

  pooled = mean_pooling(outputs, batch['attention_mask'])
  as_array = pooled.detach().cpu().numpy()

  return as_array.tolist()

# Creating Lexicon

In [None]:
# Read the lexicon CSVs from Drive, assemble the working lexicon and attach
# one embedding per lexicon word.
print("-----------------------")
print("| Reading the Lexicon |")
print("-----------------------")
print()

# WELFake lexicon tables.
WELFAKE_MONOGRAMS = pd.read_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Lexicons/WELFake_Lexicon_Monograms.csv")
WELFAKE_BIGRAMS_TRIGRAMS_WITH_STOP = pd.read_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Lexicons/WELFake_Lexicon_Bigrams_Trigrams_With_Stop.csv")
WELFAKE_BIGRAMS_TRIGRAMS_WITHOUT_STOP = pd.read_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Lexicons/WELFake_Lexicon_Bigrams_Trigrams_Without_Stop.csv")

# COLING lexicon tables.
# NOTE(review): in the visible code these three frames are loaded but not used
# when the lexicon is assembled below; the first name is spelled "MONGRAMS".
COLING_MONGRAMS = pd.read_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Lexicons/COLING_Lexicon_Monograms.csv")
COLING_BIGRAMS_TRIGRAMS_WITH_STOP = pd.read_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Lexicons/COLING_Lexicon_Bigrams_Trigrams_With_Stop.csv")
COLING_BIGRAMS_TRIGRAMS_WITHOUT_STOP = pd.read_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Lexicons/COLING_Lexicon_Bigrams_Trigrams_Without_Stop.csv")

# Working lexicon: WELFake monograms followed by the WELFake bi/tri-grams that
# keep stop words, renumbered 0..n-1 by ignore_index=True.
lexicon = pd.concat([WELFAKE_MONOGRAMS, WELFAKE_BIGRAMS_TRIGRAMS_WITH_STOP], ignore_index=True)

print("Done !!!\n")

print("-----------------------")
print("| Creating Embeddings |")
print("-----------------------")
print()

# Number of lexicon words sent to the model per call.
EMBEDDING_RANGE = 300

all_words = lexicon['word'].tolist()
all_words_embeddings = []

# Embed the words slice by slice; extend() keeps the embeddings in row order.
for i in trange(0, len(all_words), EMBEDDING_RANGE):
  all_words_embeddings.extend(generate_embeddings(all_words[i:i+EMBEDDING_RANGE], model, tokenizer))

# One embedding (a list of floats) per lexicon row.
lexicon['embedding'] = all_words_embeddings

print("\nDone !!!\n")

In [None]:
# Summarise the two source tables that make up the lexicon.
WELFAKE_MONOGRAMS.info()
WELFAKE_BIGRAMS_TRIGRAMS_WITH_STOP.info()

In [None]:
# Summarise the combined lexicon, including the new 'embedding' column.
lexicon.info()

In [None]:
def _plot_embedding_space(embeddings):
  """Scatter-plot `embeddings` projected onto their first two principal components."""
  import matplotlib.pyplot as plt
  from sklearn.decomposition import PCA

  x_pca = PCA(n_components=2).fit_transform(embeddings)

  plt.figure(figsize=(16,14))
  plt.scatter(x_pca[:,0],x_pca[:,1])

  plt.xlabel('First principal component')
  plt.ylabel('Second Principal Component')

  # Render right away so the figure appears under the banner printed by the caller.
  plt.show()


def _make_outlier_detector(algorithm):
  """Return an unfitted scikit-learn detector for `algorithm`, or None if the name is unknown."""
  if algorithm == "SVM":
    from sklearn.svm import OneClassSVM
    return OneClassSVM(nu=0.01)

  if algorithm == "IsolationForest":
    from sklearn.ensemble import IsolationForest
    return IsolationForest(contamination=0.1)

  if algorithm == "LocalOutlierFactor":
    from sklearn.neighbors import LocalOutlierFactor
    return LocalOutlierFactor()

  if algorithm == "EllipticEnvelope":
    from sklearn.covariance import EllipticEnvelope
    return EllipticEnvelope(contamination=0.01)

  return None


def removing_outliers(lexicon, algorithm):
  """Drop lexicon rows whose embedding is flagged as an outlier.

  Plots the embedding space (2-component PCA) before and after the removal
  and prints how many rows were dropped.

  Args:
    lexicon: DataFrame with an 'embedding' column holding one vector per row.
    algorithm: one of "SVM", "IsolationForest", "LocalOutlierFactor" or
      "EllipticEnvelope".

  Returns:
    A new DataFrame containing only the inlier rows. Its index is renumbered
    from 0 and the previous index is kept as a column, because reset_index is
    called without drop=True. If `algorithm` is not recognised, a message is
    printed and the string "Error" is returned before any plotting is done.
  """
  # Validate first so a mistyped algorithm name does not cost a PCA fit and a plot.
  detector = _make_outlier_detector(algorithm)
  if detector is None:
    print("Algorithm not defined properly !!!")
    return "Error"

  try:
    # Plain-Python equivalent of the notebook magic `%matplotlib inline`.
    get_ipython().run_line_magic("matplotlib", "inline")
  except NameError:
    # Not running under IPython: keep matplotlib's default backend.
    pass

  print("---------------------------")
  print("| Current Embedding Space |")
  print("---------------------------")
  print()

  data = lexicon['embedding'].tolist()
  _plot_embedding_space(data)

  print("Done !!!\n")

  print("---------------------")
  print("| Removing Outliers |")
  print("---------------------")
  print()

  # fit_predict labels outliers with -1.
  yhat = detector.fit_predict(data)
  inlier_mask = yhat != -1
  outlier_count = int((~inlier_mask).sum())

  # Select rows by position with the boolean mask. Dropping the positions as
  # index labels would only be correct for a default 0..n-1 index.
  df = lexicon[inlier_mask].reset_index()

  print("Done !!!\n")

  print("The number of words in the Lexicon is {}".format(len(lexicon)))
  print("The number of outliers in the Lexicon is {}".format(outlier_count))
  print("Final number of words in the Lexicon is {}".format(len(df)))
  print()

  print("----------------------------")
  print("| Improved Embedding Space |")
  print("----------------------------")
  print()

  _plot_embedding_space(df['embedding'].tolist())

  print("Done !!!\n")

  return df

In [None]:
# Replace the lexicon with its inlier subset, using the IsolationForest branch
# of removing_outliers (configured there with contamination=0.1).
lexicon = removing_outliers(lexicon, "IsolationForest")

In [None]:
# Build an Annoy nearest-neighbour index over the lexicon embeddings; item i in
# the index corresponds to position i of lexicon['embedding'].
print("------------------------")
print("| Generate Annoy Graph |")
print("------------------------")
print()

print("Installing Annoy")
print("================")

! pip install annoy -q
from annoy import AnnoyIndex

print("\nDone !!!\n")

print("Creating the Annoy graph")
print("========================")

word_embeddings = lexicon['embedding'].tolist()

# Vector length given to AnnoyIndex; presumably the hidden size of the embedding
# model - TODO confirm it matches len(word_embeddings[0]).
f = 768
ann = AnnoyIndex(f, 'angular')  # Length of item vector that will be indexed
for i in range(len(word_embeddings)):
    ann.add_item(i, word_embeddings[i])

status = ann.build(1000) # 1000 trees

# Report whether build() returned True.
if (status == True):
  print("\nTree built successfully !!!\n")

else:
  print("\nTree was not built, ERROR !!!\n")

In [None]:
def get_scores(embedding):
  """Score an embedding as fake vs. true from its nearest lexicon entries.

  Looks up the 100 nearest neighbours of `embedding` in the module-level
  Annoy index `ann` and reads each neighbour's 'fake_doc_score',
  'true_doc_score', 'fake_occ_score' and 'true_occ_score' from the
  module-level `lexicon` frame. Each score is scaled by its share of the
  fake+true pair, and the occurrence/document ratio of each neighbour is
  accumulated with an inverse-distance weight.

  Args:
    embedding: query vector passed to `ann.get_nns_by_vector`.

  Returns:
    list: [fake percentage, true percentage, total_fake_occ, total_fake_doc,
    total_true_occ, total_true_doc].

  Notes:
    The percentages divide by the sum of the accumulated fake and true
    scores, so they are undefined when that sum is zero (for example when no
    neighbours are returned).
    NOTE(review): a neighbour whose fake and true scores are both zero yields
    a 0/0 share; confirm the lexicon cannot contain such rows.
  """
  # Smallest distance used as a divisor. An exact match has distance 0, which
  # would otherwise turn the inverse-distance weight into inf and the result
  # into NaN.
  min_distance = 1e-9

  indexes, distances = ann.get_nns_by_vector(embedding, 100, include_distances=True, search_k=-1)

  # Fetch the score columns once instead of once per neighbour.
  fake_doc_scores, true_doc_scores = lexicon['fake_doc_score'], lexicon['true_doc_score']
  fake_occ_scores, true_occ_scores = lexicon['fake_occ_score'], lexicon['true_occ_score']

  true, fake = 0, 0
  total_fake_doc, total_fake_occ = 0, 0
  total_true_doc, total_true_occ = 0, 0

  for index, distance in zip(indexes, distances):

    distance = max(distance, min_distance)

    fake_doc, true_doc = fake_doc_scores[index], true_doc_scores[index]
    fake_occ, true_occ = fake_occ_scores[index], true_occ_scores[index]

    # Scale each score by its share of the fake+true pair.
    fake_occ_this, true_occ_this = fake_occ * ((fake_occ) / (fake_occ + true_occ)), true_occ * ((true_occ) / (fake_occ + true_occ))
    fake_doc_this, true_doc_this = fake_doc * ((fake_doc) / (fake_doc + true_doc)), true_doc * ((true_doc) / (fake_doc + true_doc))

    total_fake_occ += fake_occ_this
    total_fake_doc += fake_doc_this

    total_true_occ += true_occ_this
    total_true_doc += true_doc_this

    if fake_doc_this != 0 and true_doc_this != 0:
      fake += (fake_occ_this / fake_doc_this) / distance
      true += (true_occ_this / true_doc_this) / distance

    elif fake_doc_this == 0:
      # No fake-document weight: only the true side contributes.
      true += (true_occ_this / true_doc_this) / distance

    else:
      # No true-document weight: only the fake side contributes.
      fake += (fake_occ_this / fake_doc_this) / distance

  return [fake*100/(fake+true), true*100/(fake+true), total_fake_occ, total_fake_doc, total_true_occ, total_true_doc]

# Analysis using LIAR

In [None]:
# Load the LIAR CSV from Drive and print its column summary.
df = pd.read_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/LIAR/Liar_all.csv")
df.info()

In [None]:
# Disabled: restrict the frame to its 'test' split when such a column exists.
# try:
#   df = df.loc[df['split'] == 'test']
# except KeyError:
#   pass

# Normalise the 'statement' column with clean() into 'cleaned_statement'.
df['cleaned_statement'] = df['statement'].apply(clean)

In [None]:
# Disabled: de-duplicate on the cleaned text.
# df = df.drop_duplicates(subset=["cleaned_statement"]).reset_index(drop=True)

In [None]:
all_words = df['cleaned_statement'].tolist()
all_words_embeddings = []

# Number of statements sent to the model per call.
EMBEDDING_RANGE = 200

# Embed the statements slice by slice; extend() keeps them in row order.
for i in trange(0, len(all_words), EMBEDDING_RANGE):
  all_words_embeddings.extend(generate_embeddings(all_words[i:i+EMBEDDING_RANGE], model, tokenizer))

df['embedding'] = all_words_embeddings

In [None]:
# Score every embedding against the lexicon; each cell holds the list returned by get_scores.
df["scores"] = df["embedding"].progress_apply(get_scores)

In [None]:
# Write the scored frame to the Results folder on Drive.
df.to_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/LIAR/Liar_with_WELFAKE_Lexicon_Scores_Modified.csv")

In [None]:
# Preview the first rows of the scored frame.
df.head()

# Analysis using CodaLab

In [None]:
# Load the CodaLab Covid (Constraint English) CSV from Drive and print its column summary.
df = pd.read_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/CodaLab Covid/Constraint_English_All.csv")
df.info()

In [None]:
# Disabled: restrict the frame to its 'test' split when such a column exists.
# try:
#   df = df.loc[df['split'] == 'test']
# except KeyError:
#   pass

# Normalise the 'tweet' column with clean() into 'cleaned_statement'.
df['cleaned_statement'] = df['tweet'].apply(clean)

In [None]:
# Disabled: de-duplicate on the cleaned text.
# df = df.drop_duplicates(subset=["cleaned_statement"]).reset_index(drop=True)

In [None]:
all_words = df['cleaned_statement'].tolist()
all_words_embeddings = []

# Number of tweets sent to the model per call.
EMBEDDING_RANGE = 100

# Embed the tweets slice by slice; extend() keeps them in row order.
for i in trange(0, len(all_words), EMBEDDING_RANGE):
  all_words_embeddings.extend(generate_embeddings(all_words[i:i+EMBEDDING_RANGE], model, tokenizer))

df['embedding'] = all_words_embeddings

In [None]:
# Score every embedding against the lexicon; each cell holds the list returned by get_scores.
df["scores"] = df["embedding"].progress_apply(get_scores)

In [None]:
# Write the scored frame to the Results folder on Drive.
df.to_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/CodaLab Covid/CodaLab_with_WELFAKE_Lexicon_Scores_Modified.csv")

In [None]:
# Preview the first rows of the scored frame.
df.head()

# Analysis using Kaggle RealFake

In [None]:
# Load the Kaggle real/fake news CSV from Drive and print its column summary.
df = pd.read_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/Kaggle_real_fake/fake_or_real_news.csv")
df.info()

In [None]:
# Disabled: restrict the frame to its 'test' split when such a column exists.
# try:
#   df = df.loc[df['split'] == 'test']
# except KeyError:
#   pass

# Join title and body into one text per row.
# NOTE(review): a missing title or text would make the sum NaN, which clean()
# cannot lower-case - confirm the dataset has no nulls in these columns.
df['total_text'] = df['title']+ ' ' + df['text']

# Normalise the joined text with clean() into 'cleaned_statement'.
df['cleaned_statement'] = df['total_text'].apply(clean)

In [None]:
# Disabled: de-duplicate on the cleaned text.
# df = df.drop_duplicates(subset=["cleaned_statement"]).reset_index(drop=True)

In [None]:
all_words = df['cleaned_statement'].tolist()
all_words_embeddings = []

# Number of articles sent to the model per call.
EMBEDDING_RANGE = 200

# Embed the articles slice by slice; extend() keeps them in row order.
for i in trange(0, len(all_words), EMBEDDING_RANGE):
  all_words_embeddings.extend(generate_embeddings(all_words[i:i+EMBEDDING_RANGE], model, tokenizer))

df['embedding'] = all_words_embeddings

In [None]:
# Score every embedding against the lexicon; each cell holds the list returned by get_scores.
df["scores"] = df["embedding"].progress_apply(get_scores)

In [None]:
# Write the scored frame to the Results folder on Drive.
df.to_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Kaggle_real_fake/Kaggle_real_fake_with_WELFAKE_Lexicon_Scores_Modified.csv")

In [None]:
# Preview the first rows of the scored frame.
df.head()

# Analysis using FakeNewsNet

In [None]:
# Load the FakeNewsNet CSV from Drive and print its column summary.
df = pd.read_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/FakeNewsNet/FakeNewsNet_All.csv")
df.info()

In [None]:
# Disabled: restrict the frame to its 'test' split when such a column exists.
# try:
#   df = df.loc[df['split'] == 'test']
# except KeyError:
#   pass


# Only the 'title' column is cleaned and scored for this dataset.
df['cleaned_statement'] = df['title'].apply(clean)

In [None]:
# Disabled: de-duplicate on the cleaned text.
# df = df.drop_duplicates(subset=["cleaned_statement"]).reset_index(drop=True)

In [None]:
all_words = df['cleaned_statement'].tolist()
all_words_embeddings = []

# Number of titles sent to the model per call.
EMBEDDING_RANGE = 300

# Embed the titles slice by slice; extend() keeps them in row order.
for i in trange(0, len(all_words), EMBEDDING_RANGE):
  all_words_embeddings.extend(generate_embeddings(all_words[i:i+EMBEDDING_RANGE], model, tokenizer))

df['embedding'] = all_words_embeddings

In [None]:
# Score every embedding against the lexicon; each cell holds the list returned by get_scores.
df["scores"] = df["embedding"].progress_apply(get_scores)

In [None]:
# Write the scored frame to the Results folder on Drive.
df.to_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/FakeNewsNet/FakeNewsNet_with_WELFAKE_Lexicon_Scores_Modified.csv")

In [None]:
# Preview the first rows of the scored frame.
df.head()

# Analysis using ISOT

In [None]:
# Load the ISOT CSV from Drive and print its column summary.
df = pd.read_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/ISOT/ISOT.csv")
df.info()

In [None]:
# Disabled: restrict the frame to its 'test' split when such a column exists.
# try:
#   df = df.loc[df['split'] == 'test']
# except KeyError:
#   pass

# Join title and body into one text per row.
# NOTE(review): a missing title or text would make the sum NaN, which clean()
# cannot lower-case - confirm the dataset has no nulls in these columns.
df['total_text'] = df['title']+ ' ' + df['text']

# Normalise the joined text with clean() into 'cleaned_statement'.
df['cleaned_statement'] = df['total_text'].apply(clean)

In [None]:
# Disabled: de-duplicate on the cleaned text.
# df = df.drop_duplicates(subset=["cleaned_statement"]).reset_index(drop=True)

In [None]:
all_words = df['cleaned_statement'].tolist()
all_words_embeddings = []

# Number of articles sent to the model per call.
EMBEDDING_RANGE = 150

# Embed the articles slice by slice; extend() keeps them in row order.
for i in trange(0, len(all_words), EMBEDDING_RANGE):
  all_words_embeddings.extend(generate_embeddings(all_words[i:i+EMBEDDING_RANGE], model, tokenizer))

df['embedding'] = all_words_embeddings

In [None]:
# Score every embedding against the lexicon; each cell holds the list returned by get_scores.
df["scores"] = df["embedding"].progress_apply(get_scores)

In [None]:
# Write the scored frame to the Results folder on Drive.
df.to_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/ISOT/ISOT_with_WELFAKE_Lexicon_Scores_Modified.csv")

In [None]:
# Preview the first rows of the scored frame.
df.head()

In [None]:
# Column summary of the scored frame, now including 'embedding' and 'scores'.
df.info()