In [1]:
import re
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [2]:
df_data_covid_19 = pd.read_csv('data/corona_fake.csv')
print(df_data_covid_19.shape)
print(df_data_covid_19.columns)

(1164, 4)
Index(['title', 'text', 'source', 'label'], dtype='object')


In [3]:
df_data_covid_19.head(10)

Unnamed: 0,title,text,source,label
0,Due to the recent outbreak for the Coronavirus...,"You just need to add water, and the drugs and ...",coronavirusmedicalkit.com,Fake
1,,Hydroxychloroquine has been shown to have a 10...,RudyGiuliani,Fake
2,,Fact: Hydroxychloroquine has been shown to hav...,CharlieKirk,Fake
3,,The Corona virus is a man made virus created i...,JoanneWrightForCongress,Fake
4,,Doesn’t @BillGates finance research at the Wuh...,JoanneWrightForCongress,Fake
5,CORONA UNMASKED: Chinese Intelligence Officer ...,,,
6,,Urgent: Health Bulletin to the Public. Ministr...,Ministry of Health,Fake
7,,"Pls tell ur families, relatives and friendsMOH...",NWLLAB,Fake
8,,SERIOUS EXCELLENT ADVICE by Japanese doctors t...,Japanese doctors treating COVID-19 cases,Fake
9,Basic protective measures against the new coro...,Stay aware of the latest information on the CO...,https://www.who.int/emergencies/diseases/novel...,TRUE


In [4]:
def remove_none_column(df):
    # the number of those missing values from each column.
    print("number of missing title\t:", df[df['title'].isna()].shape[0])
    print("number of missing text\t:", df[df['text'].isna()].shape[0])
    print("number of missing source\t:", df[df['source'].isna()].shape[0])
    print("number of missing label\t:", df[df['label'].isna()].shape[0])
    # Remove all columns that contain None
    return df.dropna()

df_data_covid_19=remove_none_column(df_data_covid_19)
print(df_data_covid_19['label'].unique())

number of missing title	: 82
number of missing text	: 10
number of missing source	: 20
number of missing label	: 5
['Fake' 'TRUE' 'fake']


In [6]:
def arranging_label_column(df):
    # Convert the word fake to the word FAKE
    df.loc[df['label'] == 'fake', 'label'] = 'FAKE'
    df.loc[df['label'] == 'Fake', 'label'] = 'FAKE'
    print('number_of_fakes: ', df.loc[df['label'] == 'FAKE'].count()[0])
    print('number_of_trues: ', df.loc[df['label'] == 'TRUE'].count()[0])
    return df

df_data_covid_19=arranging_label_column(df_data_covid_19)
print(df_data_covid_19['label'].unique())

number_of_fakes:  480
number_of_trues:  579
['FAKE' 'TRUE']


In [7]:
df_data_covid_19.head(10)

Unnamed: 0,title,text,source,label
0,Due to the recent outbreak for the Coronavirus...,"You just need to add water, and the drugs and ...",coronavirusmedicalkit.com,FAKE
9,Basic protective measures against the new coro...,Stay aware of the latest information on the CO...,https://www.who.int/emergencies/diseases/novel...,TRUE
14,Exposing yourself to the sun or to temperature...,"You can catch COVID-19, no matter how sunny or...",https://www.who.int/emergencies/diseases/novel...,TRUE
16,Being able to hold your breath for 10 seconds ...,The most common symptoms of COVID-19 are dry c...,https://www.who.int/emergencies/diseases/novel...,TRUE
17,Drinking alcohol does not protect you against ...,Frequent or excessive alcohol consumption can ...,https://www.who.int/emergencies/diseases/novel...,TRUE
18,COVID-19 virus can be transmitted in areas wit...,"From the evidence so far, the COVID-19 virus c...",https://www.who.int/emergencies/diseases/novel...,TRUE
19,Cold weather and snow CANNOT kill the new coro...,There is no reason to believe that cold weathe...,https://www.who.int/emergencies/diseases/novel...,TRUE
20,Taking a hot bath does not prevent the new cor...,Taking a hot bath will not prevent you from ca...,https://www.who.int/emergencies/diseases/novel...,TRUE
21,The new coronavirus CANNOT be transmitted thro...,To date there has been no information nor evid...,https://www.who.int/emergencies/diseases/novel...,TRUE
22,Are hand dryers effective in killing the new c...,No. Hand dryers are not effective in killing t...,https://www.who.int/emergencies/diseases/novel...,TRUE


## Text preprocessing

In [8]:
def data_clearing(text):
    # Lowering letters
    text = text.lower()
    # Removing html tags
    text = re.sub(r'<[^>]*>', '', text)
    # Removing twitter usernames
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    # Removing urls
    text = re.sub('https?://[A-Za-z0-9]', '', text)
    # Removing numbers
    text = re.sub('[^a-zA-Z]', ' ', text)
    # distribution of tokens
    word_tokens = word_tokenize(text)
    # remove stopwords
    filtered_sentence = [word_token for word_token in word_tokens if word_token not in stop_words ]
    # Joining words
    text = (' '.join(filtered_sentence))
    
    return text

In [9]:
# Implement a cline function on the entire dataframe
for col_name in df_data_covid_19.columns:
    df_data_covid_19[col_name] = df_data_covid_19[col_name].apply(data_clearing)


In [10]:
df_data_covid_19.head(10)

Unnamed: 0,title,text,source,label
0,due recent outbreak coronavirus covid world he...,need add water drugs vaccines ready administer...,coronavirusmedicalkit com,fake
9,basic protective measures new coronavirus,stay aware latest information covid outbreak a...,ww int emergencies diseases novel coronavirus ...,true
14,exposing sun temperatures higher c degrees pre...,catch covid matter sunny hot weather countries...,ww int emergencies diseases novel coronavirus ...,true
16,able hold breath seconds without coughing feel...,common symptoms covid dry cough tiredness feve...,ww int emergencies diseases novel coronavirus ...,true
17,drinking alcohol protect covid dangerous,frequent excessive alcohol consumption increas...,ww int emergencies diseases novel coronavirus ...,true
18,covid virus transmitted areas hot humid climates,evidence far covid virus transmitted areas inc...,ww int emergencies diseases novel coronavirus ...,true
19,cold weather snow kill new coronavirus,reason believe cold weather kill new coronavir...,ww int emergencies diseases novel coronavirus ...,true
20,taking hot bath prevent new coronavirus disease,taking hot bath prevent catching covid normal ...,ww int emergencies diseases novel coronavirus ...,true
21,new coronavirus transmitted mosquito bites,date information evidence suggest new coronavi...,ww int emergencies diseases novel coronavirus ...,true
22,hand dryers effective killing new coronavirus,hand dryers effective killing ncov protect new...,ww int emergencies diseases novel coronavirus ...,true


In [12]:
df_data_covid_19.to_csv ('data/preprocessing.csv', index = False, header=True)