In [None]:
!pip install emoji
!pip install -q pandarallel

In [None]:
!sudo apt-get -o Dpkg::Options::="--force-confmiss" install --reinstall netbase
!pip install git+https://github.com/crazyfrogspb/RedditScore.git

In [None]:
import pandas as pd
import sys
import json
from google.colab import drive
from redditscore.tokenizer import CrazyTokenizer
import nltk
import re
import emoji
import requests
from tqdm.notebook import tqdm
tqdm.pandas()
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)
import warnings
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('punkt') # if necessary...
pd.set_option('display.max_colwidth', 255)
warnings.filterwarnings("ignore")
nltk.download('stopwords')
drive.mount('/content/drive')

In [None]:
# Root folder (on the mounted Google Drive) that holds the competition CSVs.
path = '/content/drive/My Drive/COVID19 Fake News Detection in English/input/'
# The official train split becomes `train_df`; the validation split is used as `test_df`.
train_df = pd.read_csv(f"{path}Constraint_English_Train.csv")
test_df = pd.read_csv(f"{path}Constraint_English_Val.csv")
# Only the training frame drops its `id` column; the test frame keeps it.
train_df = train_df.drop(columns=['id'])
#test_df = test_df.drop(columns=['id'])
# `target` starts as a copy of the string label; rename_column() later
# overwrites it with 0 (real) / 1 (anything else).
train_df['target'] = train_df['label']
test_df['target'] = test_df['label']

In [None]:
len(test_df), len(train_df)

In [None]:
def rename_column(df):
  """Encode the string ``label`` column as a binary ``target`` column.

  ``"real"`` maps to 0 and every other value (including missing labels)
  maps to 1, matching the original row-by-row loop.

  The assignment is vectorized, so it no longer relies on the frame having
  a default 0..n-1 index; the previous ``df['label'][index]`` lookup raised
  ``KeyError`` on any filtered or re-indexed frame.

  Args:
    df: DataFrame with a ``label`` column. It is modified in place.

  Returns:
    The same DataFrame object, with an integer ``target`` column.
  """
  df['target'] = (df['label'] != "real").astype(int)
  return df
train_df = rename_column(train_df)
test_df = rename_column(test_df)

In [None]:
train_df.head()

In [None]:
stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

def stem_tokens(tokens):
    """Stem every token with the notebook-level ``stemmer``, keeping order."""
    return list(map(stemmer.stem, tokens))

def normalize(text):
    """Remove punctuation, lowercase, tokenize and stem ``text``.

    Returns the list of stemmed tokens produced by ``stem_tokens``.
    """
    lowered = text.lower()
    without_punct = lowered.translate(remove_punctuation_map)
    words = nltk.word_tokenize(without_punct)
    return stem_tokens(words)

vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

def cosine_sim(text1, text2):
    """Return the TF-IDF cosine similarity between two strings.

    The notebook-level ``vectorizer`` is re-fitted on just these two
    documents, so the score depends only on ``text1`` and ``text2``.

    Args:
        text1: First document.
        text2: Second document.

    Returns:
        The off-diagonal entry of the 2x2 document similarity matrix.

    Raises:
        ValueError: propagated from ``fit_transform`` when no token survives
            normalisation/stop-word removal (empty vocabulary).
    """
    tfidf = vectorizer.fit_transform([text1, text2])
    # `@` is matrix multiplication for both sparse matrices and sparse arrays,
    # and `.toarray()` replaces the `.A` shorthand that newer SciPy releases
    # no longer provide on sparse matrices.
    return (tfidf @ tfidf.T).toarray()[0, 1]
print(cosine_sim('politically correct woman (almost) uses pandemic as excuse not to reuse plastic bag  #coronavirus #nashville', 'politically correct woman almost uses pandemic as excuse not to reuse plastic bag | the spoof'))
# print(cosine_sim('a little bird', 'a little bird chirps'))
# print(cosine_sim('a little bird', 'a big dog barks'))

In [None]:
def url_replace(df):
  """Replace redirecting URLs in ``df['tweet']`` with their redirect target.

  For every ``http(s)://`` URL found in a tweet, a HEAD request is issued and,
  if the response carries a ``Location`` header, the URL is replaced by that
  header value. URLs that fail to resolve or have no ``Location`` header are
  left untouched (best effort).

  Changes from the previous version:
    * the URL is replaced as a literal string; it used to be passed to
      ``re.sub`` as a pattern, so URLs containing ``?``, ``+`` etc. silently
      failed to match;
    * the column is written back in one assignment instead of chained
      ``df['tweet'][index] = ...`` indexing, which is unreliable and needs a
      default 0..n-1 index;
    * requests have a timeout so one dead host cannot hang the whole run;
    * ``except Exception`` replaces the bare ``except`` so Ctrl-C still works.

  Args:
    df: DataFrame with a string ``tweet`` column. It is modified in place.

  Returns:
    The same DataFrame object.
  """
  expanded = []
  for tweet in df['tweet']:
    for url in re.findall(r'(https?://[^\s]+)', tweet):
      try:
        resp = requests.head(url, timeout=10)
        orginal_location = resp.headers["Location"]
      except Exception:
        # Network failure or no redirect header: keep the URL as it is.
        continue
      tweet = tweet.replace(url, orginal_location)
    expanded.append(tweet)
  df['tweet'] = expanded
  return df
# Resolve redirecting links in both splits. url_replace() issues one HTTP HEAD
# request per URL, so this cell is slow and needs network access.
train_df = url_replace(train_df)
test_df = url_replace(test_df)
# Checkpoint the expanded tweets; the next cell reloads these two files, so
# the network pass above does not have to be repeated.
train_df.to_csv(f'{path}train_sec.csv',index=False)
test_df.to_csv(f'{path}test_sec.csv',index=False)

In [None]:
train_df = pd.read_csv(f'{path}train_sec.csv')
test_df = pd.read_csv(f'{path}test_sec.csv')

In [None]:
import os
def _store_data(load_data,link):
  path = f"/content/drive/My Drive/COVID19 Fake News Detection in English/input/{link}.json"
  if os.path.exists(path) == False:
      store_data = []
      store_data.append(load_data)
      with open(path, 'w') as fp:
          json.dump(store_data, fp)
  else:
      with open(path, 'r') as fp:
          data = json.load(fp)
      data.append(load_data)
      with open(path, 'w') as fp:
          json.dump(data, fp)
  return

def text_replace(df,link):
  """Tokenize every URL of every tweet and persist the result per row.

  For each row a record ``{'index': <row position>, <url>: <joined tokens>}``
  is appended to ``<link>.json`` through ``_store_data``. The tokenizer is
  built with ``urls='title'`` (presumably it fetches each page's title;
  confirm against the RedditScore documentation). URLs whose tokenization
  fails are simply omitted from the record.

  Changes from the previous version:
    * rows are addressed by position, so a non-default index no longer breaks
      the loop (``link_replace`` also matches records by position);
    * ``except Exception`` replaces the bare ``except`` so the long-running
      loop can still be interrupted with Ctrl-C;
    * a tqdm progress bar replaces printing every row number.

  Note: records are appended, so running this twice for the same ``link``
  duplicates every record in the JSON file.

  Args:
    df: DataFrame with a string ``tweet`` column.
    link: Base name (without ``.json``) of the output file.
  """
  tokenizer = CrazyTokenizer(urls='title')
  for index, tweet in enumerate(tqdm(df['tweet'], total=len(df))):
    data = {'index': index}
    for url in re.findall(r'(https?://[^\s]+)', tweet):
      try:
        data[url] = ' '.join(tokenizer.tokenize(url))
      except Exception:
        # Best effort: an unreachable or unparsable link is left out.
        continue
    _store_data(data,link)

text_replace(train_df,"train_link1")
text_replace(test_df,"test_link1")

In [None]:
def without_link_tweet(text):
  """Return ``text`` with @mentions, #hashtags and http(s) URLs deleted.

  Only the matched spans are removed; surrounding whitespace is kept.
  """
  noise = re.compile(r'(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|(https?://[^\s]+)', flags=re.MULTILINE)
  return noise.sub('', text)
  
train_df['raw_tweet'] = train_df.parallel_apply(lambda x: without_link_tweet(x['tweet']), axis=1)
test_df['raw_tweet'] = test_df.parallel_apply(lambda x: without_link_tweet(x['tweet']), axis=1)

In [None]:
train_df['clean_tweet'] = train_df['tweet']
test_df['clean_tweet'] = test_df['tweet']
#train_df[6379:6382]

In [None]:
def link_replace(df, link, threshold=0.7,
                 base_dir="/content/drive/My Drive/COVID19 Fake News Detection in English/input"):
  """Replace or drop URLs in ``df['clean_tweet']`` using stored link texts.

  ``<base_dir>/<link>.json`` is expected to hold the records written by
  ``text_replace``: one dict per row with an ``'index'`` key (row position)
  and one ``url -> text`` entry per resolved URL. For each URL in a tweet:

    * if the stored text is near-identical to the tweet itself (cosine
      similarity above ``threshold`` against the tweet stripped of mentions,
      hashtags and URLs), the URL is removed;
    * otherwise the URL is replaced by the stored text.

  URLs without a stored text are left untouched, as are URLs for which the
  similarity cannot be computed because the vectorizer raises ``ValueError``
  (empty vocabulary).

  Changes from the previous version:
    * records are grouped by row once instead of rescanning the whole list
      for every tweet;
    * the URL is replaced as a literal string; it used to be a regex pattern,
      with the stored text interpreted as a regex replacement template;
    * the column is written back in one assignment instead of chained
      indexing;
    * only ``ValueError`` from the similarity computation is tolerated, where
      a bare ``except: pass`` used to hide every failure.

  Args:
    df: DataFrame with a string ``clean_tweet`` column; modified in place.
    link: Base name (without ``.json``) of the record file.
    threshold: Similarity above which the link text is treated as a
      duplicate of the tweet. Defaults to the original 0.7.
    base_dir: Directory holding the record file. Defaults to the notebook's
      Drive input folder.

  Returns:
    The same DataFrame object.
  """
  path = f"{base_dir}/{link}.json"
  with open(path, 'r') as fp:
    data = json.load(fp)

  # Row position -> all records stored for that row, in file order.
  records_by_index = {}
  for obj in data:
    records_by_index.setdefault(obj['index'], []).append(obj)

  cleaned = []
  for index, tweet in enumerate(df['clean_tweet']):
    urls = re.findall(r'(https?://[^\s]+)', tweet)
    if urls:
      # The reference text is taken from the tweet before any substitution.
      raw_text = re.sub(r'(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|(https?://[^\s]+)', '', tweet, flags=re.MULTILINE).lower()
      for obj in records_by_index.get(index, ()):
        for url in urls:
          location_text = obj.get(url)
          if location_text is None:
            continue
          try:
            similarity = cosine_sim(raw_text, location_text.lower())
          except ValueError:
            # Empty vocabulary (e.g. only stop words): leave the URL as it is.
            continue
          replacement = '' if similarity > threshold else location_text
          tweet = tweet.replace(url, replacement)
    cleaned.append(tweet)
  df['clean_tweet'] = cleaned
  return df

train_df = link_replace(train_df, "train_link1")
test_df = link_replace(test_df,"test_link1")

In [None]:
train_df.iloc[6380]

In [None]:
train_df['target'].value_counts()

In [None]:
test_df['target'].value_counts()

In [None]:
# Collect every emoji that occurs in the raw tweets of both splits.
# demojize() turns emoji into ":name:" codes, the regex picks those codes out,
# and emojize() converts each code back into the emoji character.
emojis = []
for frame in (train_df, test_df):
  for txt in frame['tweet']:
    text = re.findall(r'(:[!_\-\w]+:)', emoji.demojize(txt))
    list_emoji = [emoji.emojize(x) for x in text]
    emojis.extend(list_emoji)

In [None]:
print(emojis)

In [None]:
#emoji.demojize('\U0001f9ea ')
# Ordered (emoji, replacement) pairs. They are applied top to bottom, and the
# order matters: an earlier entry can consume characters that a later entry
# would otherwise match.
_EMOJI_REPLACEMENTS = (
    ("📍", "update"),
    ("✅", "verified"),
    ("📢", "update"),
    ("📣", "update"),
    ("🔴", "warning"),
    ("👍", "approved"),
    ("🔰", "driving sign"),
    ("🙏", "thank you"),
    ("😷", "social distance"),
    ("❗", "excitement"),
    ("🌍", "europe africa"),
    ("🌎", "america"),
    ("🌏", "asia australia"),  # was misspelled "austrila"
    ("🌐", "globe"),
    ("🏫", "school"),
    ("📺", "television"),
    ("👏", "Hands"),
    ("🤔", "thinking"),
    ("🏠", "stay at home"),
    ("🏡", "stay at home"),
    ("🕌", "mosque"),
    ("📈", "graph increasing"),
    ("📉", "graph decreasing"),
    ("🙌", "Hands"),
    ("👐", "Hands"),
    ("👦", "boy"),
    ("❌", "not"),
    ("🙄", "rolling eye"),
    ("📏", "keep distance"),
    ("👥", "social distance"),
    ("💉", "vaccination"),
    ("\U0001f9ea", "vaccination"),  # test tube
    ("💊", "pill"),
    ("🗣️", "speak"),
    ("🐍", "snake"),
    # NOTE(review): unreachable, the pill emoji was already replaced above.
    # Probably meant to be a microphone emoji; confirm before changing.
    ("💊", "mike"),
    ("🏥", "hospital"),
    ("🚑", "ambulance"),
    ("🚨", "emergency"),
    ("💧", "droplet"),
    ("💦", "droplet"),
    ("🧴", "sanitizer"),
    ("🧼", "Washing"),
    ("🚫", "not"),
    ("🚰", "water"),
    # NOTE(review): "water" for the handshake / people-holding-hands emoji
    # looks like a copy-paste slip, and the second entry can never match
    # because the handshake inside it is replaced first. Kept as-is.
    ("🤝", "water"),
    ("🧑‍🤝‍🧑", "water"),
    # NOTE(review): unreachable, the test tube already became "vaccination".
    ("🧪", "test"),
    ("🌡️", "test"),
    ("🔬", "test"),
    ("📝", "contact tracing"),
    ("🧑‍⚕️", "health worker"),
    ("🐦", "safe hands"),
    ("🛡", "shielding required"),
    ("♿️", "accessibility requirements"),
    ("🆘", "emergency help required"),
)

_COUNTRY_EMOJI_PATH = '/content/drive/My Drive/COVID19 Fake News Detection in English/country_emoji.json'
# Parsed country-flag records, keyed by file path, so the JSON file is read
# once per kernel instead of once per tweet.
_COUNTRY_EMOJI_CACHE = {}


def _load_country_emoji(json_path):
    """Return the parsed flag records from ``json_path``, reading it only once."""
    if json_path not in _COUNTRY_EMOJI_CACHE:
        with open(json_path) as json_file:
            _COUNTRY_EMOJI_CACHE[json_path] = json.load(json_file)
    return _COUNTRY_EMOJI_CACHE[json_path]


def replace_emoji(text, country_emoji_path=_COUNTRY_EMOJI_PATH):
    """Lowercase ``text`` and replace known emoji with descriptive words.

    The fixed emoji table is applied first, then every record of the country
    file (a JSON list of dicts with ``'emoji'`` and ``'name'`` keys) is
    applied in file order. Leading and trailing spaces are stripped last.

    Changes from the previous version:
      * the country file is parsed once and cached; it used to be re-opened
        and re-parsed on every call, i.e. once per tweet;
      * replacements are literal ``str.replace`` calls rather than regex
        substitutions;
      * the "austrila" typo is corrected to "australia".

    Args:
        text: Input string.
        country_emoji_path: Path of the country-flag JSON file. Defaults to
            the notebook's Drive location.

    Returns:
        The lowercased text with emoji replaced.
    """
    text = text.lower()
    for symbol, word in _EMOJI_REPLACEMENTS:
        text = text.replace(symbol, word)
    for val in _load_country_emoji(country_emoji_path):
        text = text.replace(val['emoji'], val['name'])
    return text.strip(' ')
def digit_remove(text):
  """Replace every run of consecutive digits in ``text`` with one space.

  The pattern is now a raw string; the previous ``"\\d+"`` written as a plain
  string relied on an invalid escape sequence, which newer Python versions
  warn about.
  """
  return re.sub(r"\d+", " ", text)

In [None]:
def preprocess(text):
  """Normalise COVID hashtags, strip digits and translate emoji in a tweet.

  Steps:
    1. Lowercase and split on single spaces.
    2. Any token containing ``#covid`` becomes ``covid-19``. Whatever follows
       the hashtag is kept as a separate token: the text after the first
       ``19`` if there is one, otherwise the text after the first six
       characters of the token.
    3. Every remaining ``19`` is shielded by a placeholder, all other digit
       runs are removed by ``digit_remove``, emoji are translated by
       ``replace_emoji``, and the placeholder is turned back into ``19``.

  The placeholder is now a private-use Unicode character. The previous
  ``"$$"`` placeholder turned any literal "$$" in a tweet into "19".

  Args:
    text: Raw tweet text.

  Returns:
    The cleaned, lowercased text.
  """
  placeholder = "\ue000"  # private-use code point; not a digit, emoji or whitespace
  tok_words = []
  for tok_word in text.lower().split(' '):
    if "#covid" not in tok_word:
      tok_words.append(tok_word)
      continue
    tok_words.append("covid-19")
    sub_split_pos = tok_word.find('19')
    if sub_split_pos == -1:
      # e.g. "#coviduk" -> "covid-19", "uk"; a bare "#covid" adds nothing more.
      if tok_word.strip() != "#covid":
        tok_words.append(tok_word[6:])
    elif sub_split_pos + 2 != len(tok_word):
      # e.g. "#covid-19uganda" -> "covid-19", "uganda"
      tok_words.append(tok_word[sub_split_pos+2:])
  text = ' '.join(tok_words)
  text = text.replace("19", placeholder)
  text = digit_remove(text)
  text = replace_emoji(text)
  text = text.strip()
  text = text.replace(placeholder, "19")
  return text

In [None]:
t = "9/11 out of 13 people 🇯🇵 🇳🇴 🇱🇦 🆘 #COVID--19Updates #COVID19 25th Sep 2569 #CovidUK #COVID19Pandemic #covid19ireland #COVID19NZ #covid__19 #covid--19 #covid 19 #COVID-19Nigeria #COVID-19"
t = preprocess(t)
t

In [None]:
train_df['clean_tweet'] = train_df['clean_tweet'].apply(lambda s : preprocess(s))
test_df['clean_tweet'] = test_df['clean_tweet'].apply(lambda s : preprocess(s))

In [None]:
train_df.tail(10)

In [None]:
# Notebook-level tokenizer used by concat_tok() for the final cleaning pass.
# NOTE: a later scratch cell rebinds `tokenizer` to CrazyTokenizer(urls='title');
# re-run this cell before calling concat_tok() again after that cell has run.
# Option meanings are defined by RedditScore's CrazyTokenizer; the values below
# were chosen by the author (stemming is deliberately left disabled).
tokenizer = CrazyTokenizer(
    keepcaps=True,
    ignore_stopwords='english',
    #stem = 'stem',
    remove_punct=True, 
    remove_breaks=True,
    decontract=True,
    hashtags='split',
    twitter_handles='realname',
    urls='title',
    normalize=4
)

def concat_tok(text):
  """Tokenize ``text`` with the notebook-level ``tokenizer`` and return the
  tokens joined by single spaces, with surrounding whitespace stripped."""
  return ' '.join(tokenizer.tokenize(text)).strip()

In [None]:
train_df['clean_tweet'] = train_df.parallel_apply(lambda x: concat_tok(x['clean_tweet']), axis=1)

In [None]:
test_df['clean_tweet'] = test_df.parallel_apply(lambda x: concat_tok(x['clean_tweet']), axis=1)

In [None]:
train_df.head()

In [None]:
train_df.to_csv(f'{path}train.csv',index=False)
test_df.to_csv(f'{path}test.csv',index=False)

### Check data after cleaning

In [None]:
path = "/content/drive/My Drive/COVID19 Fake News Detection in English/input/"
train_df = pd.read_csv(f"{path}train.csv")
test_df = pd.read_csv(f"{path}test.csv")
pred_df = pd.read_csv(f"{path}pred_test.csv")
pred_df.iloc[4]

id                                                                                                                                                                                                                                  4
tweet           6/10 Sky's @EdConwaySky explains the latest #COVID19 data and government announcement. Get more on the #coronavirus data here👇 https://trib.al/KM495gs https://twitter.com/SkyNews/status/1308050648160665601/video/1
label                                                                                                                                                                                                                            real
target                                                                                                                                                                                                                              0
raw_tweet                                                                       

In [None]:
pred_df[(pred_df["y_real"] == 1) & (pred_df["y_pred"] == 0)]

Unnamed: 0,id,tweet,label,target,raw_tweet,clean_tweet,y_real,y_pred,y_pred_probs
1,1,11 out of 13 people (from the Diamond Princess Cruise ship) who had intially tested negative in tests in Japan were later confirmed to be positive in the United States.,fake,1,11 out of 13 people (from the Diamond Princess Cruise ship) who had intially tested negative in tests in Japan were later confirmed to be positive in the United States.,people diamond princess cruise ship intially tested negative tests japan later confirmed positive united states,1.0,0.0,-1.133727
17,17,"Gov. Andrew Cuomo “was simply saying if we can share 20 percent of your excess your non-used ventilators to help people in other parts of the state on a voluntary basis that would be great. Of course there was a reaction to that which was not positive.""",fake,1,"Gov. Andrew Cuomo “was simply saying if we can share 20 percent of your excess your non-used ventilators to help people in other parts of the state on a voluntary basis that would be great. Of course there was a reaction to that which was not positive.""",gov andrew cuomo simply saying share percent excess non used ventilators help people parts state voluntary basis would great course reaction positive,1.0,0.0,-1.403269
44,44,"It’s been over six months since the first confirmed case of COVID-19 in the United States, and President Trump still doesn't have an effective plan to contain its spread. It's an unjustifiable failure of leadership that costs lives every day.",fake,1,"It’s been over six months since the first confirmed case of COVID-19 in the United States, and President Trump still doesn't have an effective plan to contain its spread. It's an unjustifiable failure of leadership that costs lives every day.",six months since first confirmed case covid-19 united states president trump still effective plan contain spread unjustifiable failure leadership costs lives every day,1.0,0.0,-0.378094
61,61,The Global #dietarysupplements market is anticipated to grow at a #CAGR of around 7.50% during 2020-25. \nCheck the full report:- https://bit.ly/30375Ww\n\n#food #beverage #marketresearch #marketstrategy #USA #animalhusbandry #covid_19 #kerrygroup #nu...,fake,1,The Global #dietarysupplements market is anticipated to grow at a #CAGR of around during 2020-25. \nCheck the full report:- \n\n#food #beverage #marketresearch #marketstrategy #USA #animalhusbandry #covid_19 #kerrygroup #nutrients #Immunity #Energy,global dietary supplements market anticipated grow cagr around check full report:- dietary supplements market analysis global report forecast food beverage marketresearch market strategy usa animal husbandry covid-19 kerry group nutrients immunity energy,1.0,0.0,-4.693911
145,145,There is no evidence that children have died because of a COVID-19 vaccine. No vaccine currently in development has been approved for widespread public use. https://bit.ly/2O2hFXh,fake,1,There is no evidence that children have died because of a COVID-19 vaccine. No vaccine currently in development has been approved for widespread public use.,evidence children died covid-19 vaccine vaccine currently development approved widespread public use,1.0,0.0,-4.063961
270,270,“Nearly half of (Missouri) counties have not reported positive (COVID-19) cases.”,fake,1,“Nearly half of (Missouri) counties have not reported positive (COVID-19) cases.”,nearly half missouri counties reported positive covid-19 cases,1.0,0.0,-3.713356
308,308,19 clarifications so that the population is no longer misled about COVID-19,fake,1,19 clarifications so that the population is no longer misled about COVID-19,19 clarifications population longer misled covid-19,1.0,0.0,-0.211989
437,437,The N95 respirator offers the most protection against viral particles for the wearer. But other masks are effective in reducing the reach of COVID-19 because they help stop asymptomatic individuals from unknowingly spreading the disease. https://bit.l...,fake,1,The N95 respirator offers the most protection against viral particles for the wearer. But other masks are effective in reducing the reach of COVID-19 because they help stop asymptomatic individuals from unknowingly spreading the disease.,n respirator offers protection viral particles wearer masks effective reducing reach covid-19 help stop asymptomatic individuals unknowingly spreading disease,1.0,0.0,-4.262012
452,452,An article from April announcing a new study “connects” the flu vaccine and an increased risk of coronavirus is based on data from 2017-18 and doesn’t include COVID-19. https://bit.ly/3jb1Iwl,fake,1,An article from April announcing a new study “connects” the flu vaccine and an increased risk of coronavirus is based on data from 2017-18 and doesn’t include COVID-19.,article april announcing new study connects flu vaccine increased risk coronavirus based data include covid-19,1.0,0.0,-0.79299
487,487,"Visit a hospital if you are experiencing cough, high fever",fake,1,"Visit a hospital if you are experiencing cough, high fever",visit hospital experiencing cough high fever,1.0,0.0,-0.549543


In [None]:
train_df

Unnamed: 0,tweet,label,target,raw_tweet,clean_tweet
0,The CDC currently reports 99031 deaths. In general the discrepancies in death counts between different sources are small and explicable. The death toll stands at roughly 100000 people today.,real,0,The CDC currently reports 99031 deaths. In general the discrepancies in death counts between different sources are small and explicable. The death toll stands at roughly 100000 people today.,cdc currently reports deaths general discrepancies death counts different sources small explicable death toll stands roughly people today
1,States reported 1121 deaths a small rise from last Tuesday. Southern states reported 640 of those deaths. https://twitter.com/COVID19Tracking/status/1288242179756847105/photo/1,real,0,States reported 1121 deaths a small rise from last Tuesday. Southern states reported 640 of those deaths.,states reported deaths small rise last tuesday southern states reported deaths
2,Politically Correct Woman (Almost) Uses Pandemic as Excuse Not to Reuse Plastic Bag https://thespoof.com/4OJz #coronavirus #nashville,fake,1,Politically Correct Woman (Almost) Uses Pandemic as Excuse Not to Reuse Plastic Bag #coronavirus #nashville,politically correct woman almost uses pandemic excuse reuse plastic bag coronavirus nashville
3,#IndiaFightsCorona: We have 1524 #COVID testing laboratories in India and as on 25th August 2020 36827520 tests have been done : @ProfBhargava DG @ICMRDELHI #StaySafe #IndiaWillWin https://twitter.com/COVIDNewsByMIB/status/1298220752190070786/photo/1,real,0,#IndiaFightsCorona: We have 1524 #COVID testing laboratories in India and as on 25th August 2020 36827520 tests have been done : @ProfBhargava DG @ICMRDELHI #StaySafe #IndiaWillWin,india fights corona covid-19 testing laboratories india th august tests done dg stay safe india will win
4,Populous states can generate large case counts but if you look at the new cases per million today 9 smaller states are showing more cases per million than California or Texas: AL AR ID KS KY LA MS NV and SC. https://twitter.com/COVID19Tracking/status/...,real,0,Populous states can generate large case counts but if you look at the new cases per million today 9 smaller states are showing more cases per million than California or Texas: AL AR ID KS KY LA MS NV and SC.,populous states generate large case counts look new cases per million today smaller states showing cases per million california texas al ar ks ky la ms nv sc
...,...,...,...,...,...
6415,A tiger tested positive for COVID-19 please stay away from your pets and birds.,fake,1,A tiger tested positive for COVID-19 please stay away from your pets and birds.,tiger tested positive covid-19 please stay away pets birds
6416,"???Autopsies prove that COVID-19 is??� a blood clot, not pneumonia, ???and ought to be fought with antibiotics??� and the whole world has been wrong in treating the ???so-called??� pandemic.",fake,1,"???Autopsies prove that COVID-19 is??� a blood clot, not pneumonia, ???and ought to be fought with antibiotics??� and the whole world has been wrong in treating the ???so-called??� pandemic.",autopsies prove covid-19 � blood clot pneumonia ought fought antibiotics � whole world wrong treating called � pandemic
6417,_A post claims a COVID-19 vaccine has already been developed and will cause widespread infertility._,fake,1,_A post claims a COVID-19 vaccine has already been developed and will cause widespread,post claims covid-19 vaccine already developed cause widespread infertility
6418,Aamir Khan Donate 250 Cr. In PM Relief Cares Fund,fake,1,Aamir Khan Donate 250 Cr. In PM Relief Cares Fund,aamir khan donate cr pm relief cares fund


In [None]:
tokenizer = CrazyTokenizer(urls='title')
#text = "Mike Pence in RNC speech praises Donald Trump’s COVID-19 “seamless” partnership with governors and leaves out the president's state feuds: https://t.co/qJ6hSewtgB #RNC2020 https://t.co/OFoeRZDfyY"
text = ""
' '.join(tokenizer.tokenize("https://www.politifact.com/factchecks/2020/jul/16/facebook-posts/2017-18-flu-season-study-does-not-include-covid-19/"))
#['where', 'is', 'my', 'job', 'then', 'bloomberg_domain']

In [None]:
import requests
resp = requests.head("https://to/38LZQDW")# ")https://t.co/OC6eR6H6lg
print(resp.status_code)
resp.headers["Location"]
#'https://www.bbc.co.uk/news/blogs-trending-47975564'

In [None]:
def hash_tag_count(df):
  """Count the hashtags that remain in ``df['clean_tweet']``.

  A hashtag is ``#`` followed by everything up to the next whitespace or the
  end of the string. Rows with a missing ``clean_tweet`` are skipped; they
  used to crash the ``len()`` call.

  Args:
    df: DataFrame with a ``clean_tweet`` column.

  Returns:
    A ``(Counter, int)`` tuple: the per-hashtag counts and the total number
    of hashtag occurrences.
  """
  # Imported here because the notebook's own `Counter` import sits in a later
  # cell, so this function raised NameError on a fresh top-to-bottom run.
  from collections import Counter

  hash_tag_words = []
  for found in df.clean_tweet.str.findall(r'#.*?(?=\s|$)').dropna():
    hash_tag_words.extend(found)
  return Counter(hash_tag_words), len(hash_tag_words)
hash_tag_words, count = hash_tag_count(train_df)
print(hash_tag_words)
print(count)
hash_tag_words, count = hash_tag_count(test_df)
print(hash_tag_words)
print(count)

In [None]:
from collections import Counter
# Count the emoji that survived cleaning in both splits: demojize() yields
# ":name:" codes, the regex extracts them, emojize() maps them back to emoji.
emojis = []
for frame in (train_df, test_df):
  for txt in frame['clean_tweet']:
    text = re.findall(r'(:[!_\-\w]+:)', emoji.demojize(txt))
    list_emoji = [emoji.emojize(x) for x in text]
    emojis.extend(list_emoji)
print(Counter(emojis))

In [None]:
#pip install extraction
import extraction
import requests
#url = "https://www.politifact.com/factchecks/2020/jul/16/facebook-posts/2017-18-flu-season-study-does-not-include-covid-19/"
#url = "https://twitter.com/SkyNews/status/1308050648160665601/video/1"
#url = "https://news.sky.com/story/coronavirus-uk-could-see-49-000-cases-a-day-by-mid-october-if-surge-continues-govt-advisers-warn-12077805"
url = "https://t.co/vIH89d6GZJ"
html = requests.get(url).text
extracted = extraction.Extractor().extract(html, source_url=url)
#extracted.descriptions
# >>> "Social Hierarchies in Engineering Organizations - Irrational Exuberance"
# >>> print extracted.title, extracted.description, extracted.image, extracted.url
print(extracted.titles)
print(extracted.descriptions)
print(extracted.images)
print(extracted.urls)

In [None]:
There is no evidence that children have died because of a COVID-19 vaccine. 
No vaccine currently in development has been approved for widespread public use. 