# This Notebook Preprocessed Tweets by Lowering Casing, Standardizing Covid-19 Vocabulary, Lemmatizing Words, and Removing Punctuation and StopWords and Emoji

#### It outputs a file in the data folder called `covid_lies_preprocessed.csv`
##### Techniques From: https://www.kaggle.com/sudalairajkumar/getting-started-with-text-preprocessing/data#Lemmatization

In [1]:
# Allow Python to find our own project modules
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import pandas as pd
import string
import re
import nltk

In [3]:
df = pd.read_csv('../data/covid_lies_data.csv')
df['tweet_orig'] = df.tweet

## Lower Casing

In [4]:
df.tweet = df.tweet.str.lower()
df.head()

Unnamed: 0,misconception_id,misconception,tweet,tweet_id,label,tweet_orig
0,3,Coronavirus is genetically engineered.,how the covid-19 outbreak is changing global p...,1233965490948591616,na,How the COVID-19 outbreak is changing global p...
1,30,Blowing conch shells destroys coronavirus pote...,getting coronavirus and then coughing on peopl...,1233907923765559296,na,Getting coronavirus and then coughing on peopl...
2,57,Swans and dolphins swimming in Venice canals f...,disturbing letter about life in covid-19 ward ...,1233911842910720000,na,Disturbing letter about life in COVID-19 ward ...
3,22,Cocaine cures coronavirus.,how to prevent corona virus?ðŸ¤” use cowdung cake...,1233947734094290944,na,How to prevent corona virus?ðŸ¤” Use cowdung cake...
4,32,Observing janata curfew will result in the red...,this is concerning - they must self-insure for...,1233937085297332224,na,This is concerning - They must self-insure for...


## Remove Punctuation

In [5]:
PUNCT_TO_REMOVE = string.punctuation

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

df.tweet = df.tweet.apply(lambda tweet: remove_punctuation(tweet))
df.head()

Unnamed: 0,misconception_id,misconception,tweet,tweet_id,label,tweet_orig
0,3,Coronavirus is genetically engineered.,how the covid19 outbreak is changing global po...,1233965490948591616,na,How the COVID-19 outbreak is changing global p...
1,30,Blowing conch shells destroys coronavirus pote...,getting coronavirus and then coughing on peopl...,1233907923765559296,na,Getting coronavirus and then coughing on peopl...
2,57,Swans and dolphins swimming in Venice canals f...,disturbing letter about life in covid19 ward i...,1233911842910720000,na,Disturbing letter about life in COVID-19 ward ...
3,22,Cocaine cures coronavirus.,how to prevent corona virusðŸ¤” use cowdung cakes...,1233947734094290944,na,How to prevent corona virus?ðŸ¤” Use cowdung cake...
4,32,Observing janata curfew will result in the red...,this is concerning they must selfinsure for w...,1233937085297332224,na,This is concerning - They must self-insure for...


## Remove Emoji

In [6]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

df.tweet = df.tweet.apply(lambda tweet: remove_emoji(tweet))

## Change common Covid-19 synonyms to "coronavirus"

In [7]:
covid_synonyms = set(['covid19', 'covid', 'corona virus', '^corona$'])

def convert_covid_synonyms(tweet):
    for covid_synonym in covid_synonyms:
        tweet = re.sub(covid_synonym, "coronavirus", tweet)
    return tweet

df.tweet = df.tweet.apply(lambda tweet: convert_covid_synonyms(tweet))

## Remove Stop Words

### Download Stop Words if needed

In [8]:
try:
    from nltk.corpus import stopwords
except Exception as e:
    print(e)
    print("Downloading Stopwords")
    nltk.download('stopwords')

In [9]:
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(tweet):
    return " ".join([word for word in tweet.split() if word not in STOPWORDS])

df.tweet = df.tweet.apply(lambda tweet: remove_stopwords(tweet))
df.head()

Unnamed: 0,misconception_id,misconception,tweet,tweet_id,label,tweet_orig
0,3,Coronavirus is genetically engineered.,coronavirus19 outbreak changing global politics,1233965490948591616,na,How the COVID-19 outbreak is changing global p...
1,30,Blowing conch shells destroys coronavirus pote...,getting coronavirus coughing people accelerati...,1233907923765559296,na,Getting coronavirus and then coughing on peopl...
2,57,Swans and dolphins swimming in Venice canals f...,disturbing letter life coronavirus19 ward chin...,1233911842910720000,na,Disturbing letter about life in COVID-19 ward ...
3,22,Cocaine cures coronavirus.,prevent coronavirus use cowdung cakes holika d...,1233947734094290944,na,How to prevent corona virus?ðŸ¤” Use cowdung cake...
4,32,Observing janata curfew will result in the red...,concerning must selfinsure workers comp health...,1233937085297332224,na,This is concerning - They must self-insure for...


## Lemmatization

In [10]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

def lemmatize_words(tweet):
    pos_tagged_text = nltk.pos_tag(tweet.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

try:
    df.tweet= df.tweet.apply(lambda tweet: lemmatize_words(tweet))
except Exception as e:
    print(e)
    nltk.download('wordnet')
    nltk.download('averaged_perceptron_tagger')
    
df.head()

Unnamed: 0,misconception_id,misconception,tweet,tweet_id,label,tweet_orig
0,3,Coronavirus is genetically engineered.,coronavirus19 outbreak change global politics,1233965490948591616,na,How the COVID-19 outbreak is changing global p...
1,30,Blowing conch shells destroys coronavirus pote...,get coronavirus cough people accelerationist p...,1233907923765559296,na,Getting coronavirus and then coughing on peopl...
2,57,Swans and dolphins swimming in Venice canals f...,disturb letter life coronavirus19 ward china a...,1233911842910720000,na,Disturbing letter about life in COVID-19 ward ...
3,22,Cocaine cures coronavirus.,prevent coronavirus use cowdung cake holika da...,1233947734094290944,na,How to prevent corona virus?ðŸ¤” Use cowdung cake...
4,32,Observing janata curfew will result in the red...,concern must selfinsure worker comp healthcare...,1233937085297332224,na,This is concerning - They must self-insure for...


## Save processed data

In [11]:
FILENAME = 'covid_lies_processed.csv'

with open(f'../data/{FILENAME}', 'w', encoding='utf-8') as fp:
    df.to_csv(fp)

# Evaluation

## Most Common Words

In [12]:
from collections import Counter
cnt = Counter()
for text in df.tweet.values:
    for word in text.split():
        cnt[word] += 1
        
cnt.most_common(30)

[('coronavirus', 4186),
 ('coronavirus19', 2665),
 ('people', 1122),
 ('virus', 838),
 ('china', 787),
 ('u', 670),
 ('flu', 669),
 ('case', 652),
 ('say', 637),
 ('get', 590),
 ('spread', 574),
 ('hand', 555),
 ('use', 545),
 ('amp', 533),
 ('prevent', 482),
 ('die', 472),
 ('test', 466),
 ('death', 458),
 ('pandemic', 432),
 ('health', 425),
 ('outbreak', 410),
 ('kill', 400),
 ('go', 388),
 ('disease', 385),
 ('first', 383),
 ('mask', 377),
 ('make', 376),
 ('take', 365),
 ('like', 353),
 ('new', 339)]

In [13]:
# for row in df[['tweet', 'tweet_orig']].iterrows():
#     print(row[1].tweet)
#     print("--------")
#     print(row[1].tweet_orig)
#     print("############")
