# framework imports and install



In [None]:
!pip install transformers==4.10.2
!pip install sentencepiece==0.1.96
!pip install rouge
!pip install langid
!pip install huggingface_hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import requests
import json
import operator
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
from tqdm import tqdm
from huggingface_hub import notebook_login
from tqdm import tqdm


In [None]:
# Folder holding the input spreadsheet (presumably the Colab Google Drive mount).
# Named DATA_DIR instead of `dir` so the built-in dir() is not shadowed.
DATA_DIR = '/content/drive/MyDrive/working_directory/tfm/data'
hoaxdf = pd.read_excel(DATA_DIR + '/Cancer_en.xlsx')

In [None]:
hoaxdf

Unnamed: 0,"Fact checker, institution",Normalized Hoax-checked,Original hoax Text from source or article tittle
0,PolitiFact,huge spike in soft-tissue cancers diagnoses in...,Soft-tissue cancer diagnoses have “climbed thr...
1,PolitiFact,Cancer institute finally admits marijuana kill...,"""Cancer institute finally admits marijuana kil..."
2,PolitiFact,No one has died of cancer or heart disease sin...,“No one has died of cancer or heart disease si...
3,PolitiFact,80% of children born in developing countries d...,"""80% of children born in developing countries ..."
4,PolitiFact,Cutting out sugar and drinking hot lemon water...,Says cutting out sugar and drinking hot lemon ...
5,PolitiFact,Abortion increases the risk of breast cancer.,Says abortion increases the risk of breast can...
6,PolitiFact,The flu shot is designed to spread cancer.,"The flu shot is ""designed to spread cancer."""
7,PolitiFact,Dandelion root is able to kill 98% of cancer c...,"""Dandelion root is able to kill 98% of cancer ..."
8,PolitiFact,A molecule found in a Himalayan fungus kills c...,"A molecule found in a Himalayan fungus ""kills ..."
9,PolitiFact,Cancer and matters to do with kidney failure k...,"""Cancer and matters to do with kidney failure ..."


# Defining filters 

## Semantic filters

### Entailment filter

In [None]:
def xnli_response(reference,candidate_list,threshold):
    """Query the entailment service and return its verdicts ordered by index.

    Sends a GET request carrying a JSON payload (reference text, candidate
    texts and threshold) and reads the JSON answer.

    Args:
        reference: text sent as ``queried_text``.
        candidate_list: texts sent as ``comparison_text``.
        threshold: value sent as ``th``.

    Returns:
        A tuple ``(labels, scores, texts)`` of three parallel lists, where each
        score is a ``(contradiction_p, entailment_p, neutral_p)`` triple.
    """
    payload = {'queried_text': reference,
               'comparison_text': candidate_list,
               'th': threshold}
    answer = requests.get('private_request', json=payload).json()

    probability_triples = zip(answer["contradiction_p"],
                              answer["entailment_p"],
                              answer["neutral_p"])

    # Each row is (label, probability triple, text, index). Sorting by the
    # service's "indices" field presumably restores the original candidate
    # order (the service is said to return results sorted by entailment).
    rows = list(zip(answer["entailment"], probability_triples,
                    answer["texts"], answer["indices"]))
    rows.sort(key=operator.itemgetter(3))

    labels = [row[0] for row in rows]
    scores = [row[1] for row in rows]
    texts = [row[2] for row in rows]
    return labels, scores, texts

In [None]:
def semantic_filter(reference,candidate_list, threshold=None):
  """Keep the candidates that the entailment service labels 'Entailment'.

  Args:
    reference: text the candidates are compared against.
    candidate_list: candidate texts to check.
    threshold: forwarded unchanged to ``xnli_response``.

  Returns:
    A tuple ``(filtered_phrases, scores)``: the texts labelled 'Entailment'
    and their matching score entries, in the order given by ``xnli_response``.
  """
  labels, label_scores, texts = xnli_response(reference,candidate_list,threshold)
  entailed = [position for position, label in enumerate(labels) if label == 'Entailment']
  filtered_phrases = [texts[position] for position in entailed]
  scores = [label_scores[position] for position in entailed]
  return filtered_phrases, scores

## Lexical filters

In [None]:
def rouge_score(candidate,reference,ngram=1 ,metric='f'):
  """Return one ROUGE figure for a lower-cased candidate/reference pair.

  Args:
    candidate: hypothesis text.
    reference: reference text.
    ngram: 1, 2 or 'l' select 'rouge-1', 'rouge-2' or 'rouge-l'; any other
      value is used directly as the key into the score dictionary.
    metric: key of the value to return inside the selected ROUGE entry
      (default 'f').

  Returns:
    The value found at ``[selected rouge key][metric]`` for the pair.
  """
  from rouge import Rouge

  # Translate the shorthand into the key names used in the score dictionary.
  shorthand_to_key = {1: 'rouge-1', 2: 'rouge-2', 'l': 'rouge-l'}
  rouge_key = shorthand_to_key.get(ngram, ngram)

  pair_scores = Rouge().get_scores(candidate.lower(), reference.lower())
  return pair_scores[0][rouge_key][metric]

In [None]:
def bleu_score(candidate, reference, ngram=1):
  """Return the sentence-level BLEU of ``candidate`` against ``reference``.

  Both texts are lower-cased and tokenized with ``nltk.word_tokenize``.

  Args:
    candidate: hypothesis text.
    reference: reference text.
    ngram: 1-4 selects a weight tuple that puts all the weight on that n-gram
      order; any other value is passed to ``sentence_bleu`` as ``weights``
      unchanged.

  Returns:
    The value returned by ``nltk.translate.bleu_score.sentence_bleu``.
  """
  # nltk is imported here because the function uses nltk.word_tokenize and the
  # notebook has no module-level nltk import (the call used to raise NameError).
  import nltk
  from nltk.translate.bleu_score import sentence_bleu

  # Same tokenizer resource that jaccard_similarity downloads.
  nltk.download('punkt', quiet=True)

  if ngram == 1:
    weights = (1,0,0,0)
  elif ngram == 2:
    weights = (0,1,0,0)
  elif ngram == 3:
    weights = (0,0,1,0)
  elif ngram == 4:
    weights = (0,0,0,1)
  else:
    # Caller supplied something else (e.g. a custom weights tuple).
    weights = ngram

  candidate_tokens = nltk.word_tokenize(candidate.lower())
  reference_tokens = nltk.word_tokenize(reference.lower())

  return sentence_bleu([reference_tokens], candidate_tokens, weights=weights)
  

In [None]:
def jaccard_similarity(candidate, reference, ngram=1):
  """Return the Jaccard similarity of the token (or n-gram) sets of two texts.

  Both texts are lower-cased and tokenized with ``nltk.word_tokenize``; for
  ``ngram != 1`` the token lists are turned into n-grams first.

  Args:
    candidate: first text.
    reference: second text.
    ngram: size of the n-grams compared (1 compares single tokens).

  Returns:
    ``len(intersection) / len(union)`` as a float, or 0.0 when both sets are
    empty (empty texts, or texts shorter than ``ngram`` tokens), a case that
    previously raised ZeroDivisionError.
  """
  import nltk
  nltk.download('punkt', quiet=True)

  candidate_items = nltk.word_tokenize(candidate.lower())
  reference_items = nltk.word_tokenize(reference.lower())

  if ngram != 1:
    candidate_items = nltk.ngrams(candidate_items, n=ngram)
    reference_items = nltk.ngrams(reference_items, n=ngram)

  candidate_set = set(candidate_items)
  reference_set = set(reference_items)
  intersection = len(candidate_set & reference_set)
  union = len(candidate_set | reference_set)

  if union == 0:
    # Nothing to compare on either side: report no overlap instead of dividing by zero.
    return 0.0

  return float(intersection) / union
  

In [None]:
def lexical_score(candidate, reference, method='rouge', ngram = 1, **kwargs):
  """Score ``candidate`` against ``reference`` with the chosen lexical metric.

  Args:
    candidate: text being scored.
    reference: text it is compared with.
    method: 'rouge', 'bleu' or 'jaccard'.
    ngram: forwarded positionally to the selected scoring function.
    **kwargs: extra keyword arguments forwarded to the scoring function.

  Returns:
    Whatever the selected scoring function returns.

  Raises:
    ValueError: if ``method`` is not one of the supported names.
  """
  scorers={'rouge':rouge_score, 'bleu':bleu_score , 'jaccard':jaccard_similarity}
  try:
      scorer=scorers[method]
  except KeyError:
      raise ValueError('invalid input')
  return scorer(candidate,reference,ngram, **kwargs)

# Filter pipeline

In [None]:
import torch
torch.cuda.empty_cache()

In [None]:
class paraphraser():
  """Generate paraphrases with a seq2seq model and rank them lexically.

  ``generate`` produces a batch of paraphrases for a phrase and remembers both
  the phrase and the batch; ``filter`` scores a batch against a reference with
  ``lexical_score`` and returns the candidates ordered by ascending score.
  """
  phrase='' # last phrase passed to generate(); default reference of filter()
  paraphrases=[] # most recent batch produced by generate(); default input of filter()

  def __init__(self, paraphrase_model="ramsrigouthamg/t5-large-paraphraser-diverse-high-quality"):
    """Load the tokenizer and seq2seq model named ``paraphrase_model``.

    ``generate`` only implements the default T5 checkpoint and
    'tuner007/pegasus_paraphrase'.
    """
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

    self.paraphrase_model = paraphrase_model
    #load model
    self.model = AutoModelForSeq2SeqLM.from_pretrained(paraphrase_model)
    self.tokenizer = AutoTokenizer.from_pretrained(paraphrase_model)
    # Per-instance state, so instances never share the class-level list.
    self.phrase = ''
    self.paraphrases = []

  def generate(self, phrase, num_return_sequences=50):
    """Generate ``num_return_sequences`` paraphrases of ``phrase`` by beam search.

    The phrase and the resulting list are stored on the instance so that
    ``filter()`` can be called afterwards without arguments.

    Args:
      phrase: text to paraphrase.
      num_return_sequences: number of beams and of returned sequences.

    Returns:
      The list of decoded paraphrases, or None when ``paraphrase_model`` is
      not one of the two supported checkpoints.
    """
    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.model = self.model.to(device)

    #generate with T5 Large
    if self.paraphrase_model == "ramsrigouthamg/t5-large-paraphraser-diverse-high-quality":
      self.phrase = phrase
      text = "paraphrase: "+ phrase + " </s>"

      encoding = self.tokenizer.encode_plus(text,max_length = 128, padding=True, return_tensors="pt")
      input_ids,attention_mask  = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

      self.model.eval()
      beam_outputs = self.model.generate(
          input_ids=input_ids,attention_mask=attention_mask,
          max_length=128,
          early_stopping=True,
          num_beams=num_return_sequences,
          num_return_sequences=num_return_sequences,
          top_p=0.95,
          repetition_penalty = 1.5)

      # [19:] drops the first 19 characters of every decoded string, presumably
      # the "paraphrasedoutput: " prefix this checkpoint emits. TODO confirm.
      self.paraphrases = [
          self.tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)[19:]
          for beam_output in beam_outputs
      ]
      return self.paraphrases

    #generate with Pegasus
    if self.paraphrase_model ==  'tuner007/pegasus_paraphrase':
      # Remember the phrase here too; otherwise filter() would compare the new
      # paraphrases against a stale (or empty) reference.
      self.phrase = phrase
      text = phrase
      encoding =  self.tokenizer(text, max_length=128, padding=True, return_tensors='pt')
      input_ids,attention_mask  = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

      self.model.eval()
      beam_outputs = self.model.generate(
          input_ids=input_ids,attention_mask=attention_mask,
          max_length=128,
          early_stopping=True,
          num_beams=num_return_sequences,
          num_return_sequences=num_return_sequences,
          temperature=1.5)

      self.paraphrases = [
          self.tokenizer.decode(beam_output, skip_special_tokens=True)
          for beam_output in beam_outputs
      ]
      return self.paraphrases

  def filter(self, reference=None,paraphrases = None, lexical_filter= 'rouge', ngram=1 , semantic_threshold=None, lexical_threshold=None, top_return='All', get_score=False, **kwargs):
    """Score paraphrases against a reference and return them by ascending score.

    Args:
      reference: text to compare with; defaults to the last generated phrase.
      paraphrases: candidates to score; defaults to the last generated batch.
      lexical_filter: metric name passed to ``lexical_score`` as ``method``.
      ngram: n-gram setting passed to ``lexical_score``.
      semantic_threshold: currently unused (the semantic filter is disabled).
      lexical_threshold: only candidates scoring strictly below this value are
        kept; None means 1.
      top_return: 'All' or the maximum number of candidates to return.
      get_score: when True return ``(candidate, score)`` tuples instead of
        bare candidate strings.
      **kwargs: forwarded to ``lexical_score``.

    Returns:
      A list with at most ``top_return`` entries, lowest score first; an empty
      list when there is nothing to filter.
    """
    if reference is None:
      reference = self.phrase
    if paraphrases is None:
      paraphrases = self.paraphrases

    # The semantic filter is currently disabled. To re-enable it:
    #   paraphrases_filtered = semantic_filter(reference,paraphrases,threshold=semantic_threshold)[0]
    paraphrases_filtered = paraphrases
    if len(paraphrases_filtered) == 0:
      return []

    if lexical_threshold is None:
      lexical_threshold = 1

    #lexical filter
    scored = []
    for candidate in paraphrases_filtered:
      score = lexical_score(candidate=candidate, reference=reference, method=lexical_filter, ngram=ngram, **kwargs)
      if score < lexical_threshold:
        scored.append((candidate,score))
    scored.sort(key=lambda pair: pair[1])

    if top_return == 'All' or top_return > len(scored):
      print(len(scored))  # number of candidates that passed the lexical filter
      top_return = len(scored)

    # Slice once so both return shapes honour top_return. get_score=True used to
    # index scored[top_return], which raised IndexError for 'All' and otherwise
    # returned a single tuple.
    top_scored = scored[:top_return]
    if get_score:
      return top_scored
    return [candidate for candidate, _ in top_scored]

# Running filter pipeline (T5-large / Rouge-1 / Entailment)



In [None]:
t5 = paraphraser()

## Testing with the first hoax

In [None]:
hoaxdf['Normalized Hoax-checked'][0]

'huge spike in soft-tissue cancers diagnoses in 2021 as a result of COVID-19 vaccines'

In [None]:
t5.generate(hoaxdf['Normalized Hoax-checked'][0])

['In 2021, there was a dramatic rise in soft-tissue cancer diagnoses as a result of COVID-19 vaccines.',
 'In 2021, there was a significant rise in soft-tissue cancer diagnoses as a result of COVID-19 vaccines.',
 'As a result of COVID-19 vaccines, there was a significant rise in soft-tissue cancer diagnoses in 2021.',
 'As a result of COVID-19 vaccines, there was a dramatic rise in soft-tissue cancer diagnoses in 2021.',
 'As a result of the COVID-19 vaccines, there was a significant rise in soft-tissue cancer diagnoses in 2021.',
 'As a result of the COVID-19 vaccines, there was a dramatic rise in soft-tissue cancer diagnoses in 2021.',
 'In 2021, there was a surge in soft-tissue cancer diagnoses as a result of COVID-19 vaccines.',
 'In 2021, there was a rise in soft-tissue cancer diagnoses as a result of COVID-19 vaccines.',
 'In 2021, there was a dramatic rise in soft-tissue cancer diagnoses as a result of the COVID-19 vaccines.',
 'In 2021, there was a dramatic rise in soft-tissue

In [None]:
filtered_sorted = t5.filter()

In [None]:
filtered_sorted

['As a result of the COVID-19 vaccines, there was a significant rise in soft tissue cancer diagnoses in 2021.',
 'As a result of the COVID-19 vaccines, there was a dramatic rise in soft tissue cancer diagnoses in 2021.',
 'In 2021, there was a dramatic rise in soft tissue cancer diagnoses as a result of COVID-19 vaccines.',
 'In 2021, there was a significant rise in soft tissue cancer diagnoses as a result of COVID-19 vaccines.',
 'As a result of COVID-19 vaccines, there was a significant rise in soft tissue cancer diagnoses in 2021.',
 'As a result of COVID-19 vaccines, there was a dramatic rise in soft tissue cancer diagnoses in 2021.',
 'In 2021, there was a dramatic rise in soft-tissue cancer diagnoses as a result of COVID-19 vaccinations.',
 'In 2021, there was a significant rise in soft-tissue cancer diagnoses as a result of COVID-19 vaccinations.',
 'In 2021, there was a dramatic rise in soft-tissue cancer diagnosis as a result of COVID-19 vaccines.',
 'In 2021, there was a sign

## Paraphrasing hoaxes

In [None]:
t5 = paraphraser()

In [None]:
# For every hoax: generate 50 candidate paraphrases with T5 and keep those that
# pass the ROUGE-1 lexical filter (returned ordered by ascending score).
filtered_sorted_top_paraphrases = []
for n in tqdm(range(len(hoaxdf['Normalized Hoax-checked']))):
  t5.generate(hoaxdf['Normalized Hoax-checked'][n],num_return_sequences=50)
  filtered_sorted = t5.filter(lexical_filter= 'rouge', ngram=1 , semantic_threshold=None, lexical_threshold=None, top_return='All')
  # Append to the list initialised above; the old name
  # `filtered_sorted_top30_paraphrases` was never defined and raised NameError.
  filtered_sorted_top_paraphrases.append(filtered_sorted)

  2%|▏         | 1/52 [00:01<01:12,  1.42s/it]

50


  4%|▍         | 2/52 [00:02<00:50,  1.00s/it]

50


  6%|▌         | 3/52 [00:03<00:49,  1.01s/it]

50


  8%|▊         | 4/52 [00:03<00:44,  1.07it/s]

50


 10%|▉         | 5/52 [00:04<00:42,  1.12it/s]

50


 12%|█▏        | 6/52 [00:05<00:37,  1.23it/s]

50


 13%|█▎        | 7/52 [00:06<00:34,  1.32it/s]

50


 15%|█▌        | 8/52 [00:07<00:35,  1.23it/s]

50


 17%|█▋        | 9/52 [00:08<00:40,  1.06it/s]

50


 19%|█▉        | 10/52 [00:09<00:49,  1.19s/it]

50


 21%|██        | 11/52 [00:10<00:41,  1.02s/it]

50


 23%|██▎       | 12/52 [00:11<00:40,  1.00s/it]

50


 25%|██▌       | 13/52 [00:12<00:34,  1.12it/s]

50


 27%|██▋       | 14/52 [00:13<00:35,  1.06it/s]

50


 29%|██▉       | 15/52 [00:13<00:31,  1.19it/s]

50


 31%|███       | 16/52 [00:14<00:28,  1.27it/s]

50


 33%|███▎      | 17/52 [00:15<00:27,  1.28it/s]

50


 35%|███▍      | 18/52 [00:16<00:27,  1.22it/s]

50


 37%|███▋      | 19/52 [00:16<00:26,  1.25it/s]

50


 38%|███▊      | 20/52 [00:17<00:24,  1.28it/s]

50


 40%|████      | 21/52 [00:18<00:23,  1.33it/s]

50


 42%|████▏     | 22/52 [00:19<00:21,  1.41it/s]

50


 44%|████▍     | 23/52 [00:19<00:19,  1.52it/s]

50


 46%|████▌     | 24/52 [00:20<00:19,  1.44it/s]

50


 48%|████▊     | 25/52 [00:20<00:17,  1.57it/s]

50


 50%|█████     | 26/52 [00:21<00:16,  1.59it/s]

50


 52%|█████▏    | 27/52 [00:22<00:15,  1.61it/s]

50


 54%|█████▍    | 28/52 [00:22<00:14,  1.67it/s]

50


 56%|█████▌    | 29/52 [00:23<00:14,  1.64it/s]

50


 58%|█████▊    | 30/52 [00:24<00:16,  1.35it/s]

50


 60%|█████▉    | 31/52 [00:24<00:14,  1.41it/s]

50


 62%|██████▏   | 32/52 [00:25<00:13,  1.45it/s]

50


 63%|██████▎   | 33/52 [00:26<00:12,  1.57it/s]

50


 65%|██████▌   | 34/52 [00:26<00:10,  1.68it/s]

50


 67%|██████▋   | 35/52 [00:27<00:10,  1.67it/s]

50


 69%|██████▉   | 36/52 [00:27<00:09,  1.63it/s]

50


 71%|███████   | 37/52 [00:28<00:08,  1.67it/s]

50


 73%|███████▎  | 38/52 [00:29<00:08,  1.64it/s]

50


 75%|███████▌  | 39/52 [00:29<00:07,  1.64it/s]

50


 77%|███████▋  | 40/52 [00:30<00:07,  1.53it/s]

50


 79%|███████▉  | 41/52 [00:30<00:07,  1.57it/s]

50


 81%|████████  | 42/52 [00:31<00:07,  1.42it/s]

50


 83%|████████▎ | 43/52 [00:32<00:06,  1.32it/s]

50


 85%|████████▍ | 44/52 [00:33<00:05,  1.36it/s]

50


 87%|████████▋ | 45/52 [00:33<00:04,  1.47it/s]

50


 88%|████████▊ | 46/52 [00:34<00:04,  1.47it/s]

50


 90%|█████████ | 47/52 [00:35<00:03,  1.40it/s]

50


 92%|█████████▏| 48/52 [00:36<00:02,  1.48it/s]

50


 94%|█████████▍| 49/52 [00:36<00:02,  1.37it/s]

50


 96%|█████████▌| 50/52 [00:37<00:01,  1.47it/s]

50


 98%|█████████▊| 51/52 [00:38<00:00,  1.39it/s]

50


100%|██████████| 52/52 [00:38<00:00,  1.34it/s]

50





In [None]:
# Column name must match the one read by the keyword-query cell below
# ('filtered_sorted_top_paraphrases'); the previous spelling lacked an underscore.
hoaxdf['filtered_sorted_top_paraphrases'] = filtered_sorted_top_paraphrases

# Generating keyword queries with the query generator (private tool)

In [None]:
from query_builder_v3_1 import get_kw_comb_v2

In [None]:
# Build one keyword query per filtered paraphrase, grouped by hoax.
# Iterating over the column directly means it is named once; the loop used to
# take its length from a non-existent 'filtered_sorted_top30_paraphrases' column.
query_builder_paraphrases_filtered = []
for paraphrases_list in tqdm(hoaxdf['filtered_sorted_top_paraphrases']):
  query_builder_paraphrases = []
  for paraphrase in paraphrases_list:
    statement = get_kw_comb_v2(statement_text=paraphrase ,
                              language_code="en", # <-- Important
                              lemmatized=False,
                              level = 0, # <- keywords per combination =  n - level
                              top_n = 7,
                              diversity = 0.2,
                              max_query_len = 1024
                              )[0]
    # The first result may itself be a list; keep only its first entry.
    if isinstance(statement, list):
      statement = statement[0]
    query_builder_paraphrases.append(statement)
  query_builder_paraphrases_filtered.append(query_builder_paraphrases)


[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The 

In [None]:
query_builder_paraphrases_filtered

[['(((2021 OR 2.021)) rise cancer tissue vaccines soft covid-19)',
  '(((2021 OR 2.021)) rise cancer tissue vaccines soft covid-19)',
  '(((2021 OR 2.021)) rise cancer tissue vaccines soft covid-19)',
  '(((2021 OR 2.021)) rise cancer tissue vaccines soft covid-19)',
  '(((2021 OR 2.021)) rise cancer tissue vaccines soft covid-19)',
  '(((2021 OR 2.021)) rise cancer tissue vaccines soft covid-19)',
  '(((2021 OR 2.021)) rise diagnoses cancer vaccinations covid-19)',
  '(((2021 OR 2.021)) rise significant cancer vaccinations covid-19)',
  '(((2021 OR 2.021)) rise vaccines cancer diagnosis covid-19)',
  '(((2021 OR 2.021)) rise vaccines cancer diagnosis covid-19)',
  '(((2021 OR 2.021)) cancer tissue vaccines surge soft covid-19)',
  '(((2021 OR 2.021)) diagnoses cancer surge vaccinations covid-19)',
  '(((2021 OR 2.021)) rise diagnoses cancer vaccinations covid-19)',
  '(((2021 OR 2.021)) rise vaccines significant cancer covid-19)',
  '(((2021 OR 2.021)) diagnoses vaccines cancer rise c

In [None]:
hoaxdf['keyword_paraphrases'] = query_builder_paraphrases_filtered

In [None]:
# Build one keyword query for each original (normalized) hoax.
kw_settings = dict(language_code="en", # <-- Important
                   lemmatized=False,
                   level = 0, # <- keywords per combination =  n - level
                   top_n = 7,
                   diversity = 0.2,
                   max_query_len = 1024)

hoax_key_queries = []
for hoax_text in tqdm(hoaxdf['Normalized Hoax-checked']):
  first_result = get_kw_comb_v2(statement_text=hoax_text, **kw_settings)[0]
  # The first result may itself be a list; keep only its first entry.
  hoax_key_queries.append(first_result[0] if isinstance(first_result, list) else first_result)

  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern' will not be used"
  "The parameter 'token_pattern

In [None]:
hoaxdf['hoax_key_queries'] = hoax_key_queries 

In [None]:
 hoaxdf

Unnamed: 0,"Fact checker, institution",Normalized Hoax-checked,Original hoax Text from source or article tittle,filtered_sorted_top_paraphrases,keyword_paraphrases,hoax_key_queries
0,PolitiFact,huge spike in soft-tissue cancers diagnoses in...,Soft-tissue cancer diagnoses have “climbed thr...,"[As a result of the COVID-19 vaccines, there w...",[(((2021 OR 2.021)) rise cancer tissue vaccine...,(((2021 OR 2.021)) spike vaccines huge cancers...
1,PolitiFact,Cancer institute finally admits marijuana kill...,"""Cancer institute finally admits marijuana kil...",[The marijuana research has now confirmed that...,[(cancer research (cause OR confirmed) marijua...,(marijuana finally (kills OR admits) (cancer O...
2,PolitiFact,No one has died of cancer or heart disease sin...,“No one has died of cancer or heart disease si...,"[Since the COVID-19 thing was in existence, no...",[(heart existence since one cancer died covid-...,(since one cancer heart (started OR died) covi...
3,PolitiFact,80% of children born in developing countries d...,"""80% of children born in developing countries ...","[Every year, 80 percent of children in develop...","[(((""eighty"" OR 80)) percent nations cancer ye...",((80%) countries cancer children (developing O...
4,PolitiFact,Cutting out sugar and drinking hot lemon water...,Says cutting out sugar and drinking hot lemon ...,[Cancer can be reversed by a diet change and h...,[(hot water diet cancer lemon change reversed)...,(hot water sugar cancer lemon (drinking OR cure))
5,PolitiFact,Abortion increases the risk of breast cancer.,Says abortion increases the risk of breast can...,[Breast cancer prevention goes hand in hand wi...,[(hand abortions cancer breast prevention goes...,(abortion risk breast cancer increases)
6,PolitiFact,The flu shot is designed to spread cancer.,"The flu shot is ""designed to spread cancer.""",[The flu shot is supposed to cause disease in ...,[(humans disease flu shot (supposed OR cause))...,(flu cancer shot (designed OR spread))
7,PolitiFact,Dandelion root is able to kill 98% of cancer c...,"""Dandelion root is able to kill 98% of cancer ...","[In less than a week, Dandelion root can destr...",[((98%) less dandelion week tumor root destroy...,"(((""forty-eight"" OR 48) OR 98%) hours dandelio..."
8,PolitiFact,A molecule found in a Himalayan fungus kills c...,"A molecule found in a Himalayan fungus ""kills ...",[Cancer cells are killed by a molecule isolate...,"[(((""forty"" OR 40)) molecule cells fungus pote...","(((""forty"" OR 40)) molecule cells fungus poten..."
9,PolitiFact,Cancer and matters to do with kidney failure k...,"""Cancer and matters to do with kidney failure ...",[More people die in Kenya today from cancer an...,[(kidney tuberculosis cancer (killed OR aids O...,(tuberculosis malaria cancer kidney aids kill ...


Columns:

Original hoax = Normalized Hoax-checked

Original hoax keyword query = hoax_key_queries

Paraphrases filtered = filtered_sorted_top_paraphrases 	

Paraphrases keyword queries = keyword_paraphrases

# Generating final query

Removing redundant keyword paraphrases

In [None]:
# For each hoax, start from its own keyword query and add every paraphrase
# query that is not a token-for-token duplicate (Jaccard == 1) of one already kept.
lexically_enriched_final_query_list = []
for n in tqdm(range(len(hoaxdf))):
  final_query = [hoaxdf['hoax_key_queries'][n]]
  for paraphrase in hoaxdf['keyword_paraphrases'][n]:
    overlaps = [jaccard_similarity(kept, paraphrase, ngram=1) for kept in final_query]
    # The extra 0 mirrors the starting value of the running maximum.
    if max(overlaps + [0]) != 1:
      final_query.append(paraphrase)
  lexically_enriched_final_query_list.append(final_query)

100%|██████████| 52/52 [00:12<00:00,  4.24it/s]


In [None]:
lexically_enriched_final_query_list[0]

['(((2021 OR 2.021)) spike vaccines huge cancers covid-19)',
 '(((2021 OR 2.021)) rise cancer tissue vaccines soft covid-19)',
 '(((2021 OR 2.021)) rise diagnoses cancer vaccinations covid-19)',
 '(((2021 OR 2.021)) rise significant cancer vaccinations covid-19)',
 '(((2021 OR 2.021)) rise vaccines cancer diagnosis covid-19)',
 '(((2021 OR 2.021)) cancer tissue vaccines surge soft covid-19)',
 '(((2021 OR 2.021)) diagnoses cancer surge vaccinations covid-19)',
 '(((2021 OR 2.021)) rise vaccines significant cancer covid-19)',
 '(((2021 OR 2.021)) diagnoses vaccines cancer rise covid-19)',
 '(((2021 OR 2.021)) rise vaccines cancer large covid-19)',
 '(((2021 OR 2.021)) rise vaccines major cancer covid-19)',
 '(((2021 OR 2.021)) rise vaccines drastic cancer covid-19)',
 '(((2021 OR 2.021)) rise big cancer vaccines covid-19)',
 '(((2021 OR 2.021)) vaccines big cancer increase covid-19)',
 '(((2021 OR 2.021)) diagnoses vaccines cancer surge covid-19)',
 '(((2021 OR 2.021)) vaccines cancer i

Concatenating final query

In [None]:
# Join each hoax's queries with ' OR ', keeping the result within 1000 characters.
# A query that would exceed the limit is skipped; later, shorter ones can still be added.
lexically_enriched_final_query = []
for query_list in lexically_enriched_final_query_list:
  final_query = query_list[0]
  for paraphrase_query in query_list[1:]:
    extended_query = final_query+' OR '+paraphrase_query
    if len(extended_query) > 1000:
      continue
    final_query = extended_query
  lexically_enriched_final_query.append(final_query)

In [None]:
lexically_enriched_final_query

['(((2021 OR 2.021)) spike vaccines huge cancers covid-19) OR (((2021 OR 2.021)) rise cancer tissue vaccines soft covid-19) OR (((2021 OR 2.021)) rise diagnoses cancer vaccinations covid-19) OR (((2021 OR 2.021)) rise significant cancer vaccinations covid-19) OR (((2021 OR 2.021)) rise vaccines cancer diagnosis covid-19) OR (((2021 OR 2.021)) cancer tissue vaccines surge soft covid-19) OR (((2021 OR 2.021)) diagnoses cancer surge vaccinations covid-19) OR (((2021 OR 2.021)) rise vaccines significant cancer covid-19) OR (((2021 OR 2.021)) diagnoses vaccines cancer rise covid-19) OR (((2021 OR 2.021)) rise vaccines cancer large covid-19) OR (((2021 OR 2.021)) rise vaccines major cancer covid-19) OR (((2021 OR 2.021)) rise vaccines drastic cancer covid-19) OR (((2021 OR 2.021)) rise big cancer vaccines covid-19) OR (((2021 OR 2.021)) vaccines big cancer increase covid-19) OR (((2021 OR 2.021)) diagnoses vaccines cancer surge covid-19) OR (((2021 OR 2.021)) vaccines big cancer jump covid-1

In [None]:
hoaxdf['final_lexical_enriched_query'] = lexically_enriched_final_query

In [None]:
hoaxdf.to_excel('/content/drive/MyDrive/working_directory/tfm/data/cancer_en_queries.xlsx', index=False)