In [None]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import re
import emoji


In [None]:
# Read each split from its Excel workbook, then stack the three frames into
# one with a fresh 0..n-1 index.
split_paths = (
    "data/Constraint_English_Train.xlsx",
    "data/Constraint_English_Val.xlsx",
    "data/Constraint_English_Test.xlsx",
)
train, val, test = map(pd.read_excel, split_paths)
data = pd.concat([train, val, test], ignore_index=True)


# Data preprocessing


Here, we will clean the data and prepare it for the NLP tasks.
In this process, we will:

1. Make an initial cleaning and data extraction
2. Tokenize
3. Clean exhaustively
4. Lemmatize
5. Tag the parts of speech
6. Recognize named entities


In the initial cleaning, we'll work with the raw tweet text and add some features to the data, such as the number of words and characters in the tweet, as well as its hashtags, mentions, and number of links.
Finally, we'll calculate the share of uppercase characters in the tweet (stored in `uppercase_percentage` as a fraction between 0 and 1).


In [None]:
def _uppercase_ratio(text):
    """Return the fraction of characters in ``text`` that are uppercase.

    The result lies between 0 and 1. An empty string yields 0.0 instead of
    raising ZeroDivisionError.
    """
    if not text:
        return 0.0
    return sum(1 for char in text if char.isupper()) / len(text)


# Hashtags are stored without the leading "#" and lowercased.
data["hashtags"] = data["tweet"].apply(
    lambda text: [tag[1:].lower() for tag in re.findall(r"#\w+", text)]
)
# Mentions are kept exactly as matched, including the leading "@".
data["users"] = data["tweet"].apply(lambda text: re.findall(r"@\w+", text))
# Despite the column name, the value is a fraction (0-1), not a percentage.
data["uppercase_percentage"] = data["tweet"].apply(_uppercase_ratio)


In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

# Box plot of the uppercase-character share, one box per label value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='label', y='uppercase_percentage', ax=ax)
ax.set_title('Uppercase Percentage by Label')
ax.set_xlabel('Label')
ax.set_ylabel('Uppercase Percentage')
plt.show()

In [None]:
# Count the emojis in each raw tweet.
data["num_emojis"] = data["tweet"].apply(emoji.emoji_count)


In [None]:
# Box plot of the per-tweet emoji count, one box per label value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='label', y='num_emojis', ax=ax)
ax.set_title('Number of Emojis by Label')
ax.set_xlabel('Label')
ax.set_ylabel('Number of Emojis')
plt.show()

In [None]:
# Start the cleaned text from the raw tweet, passing each one through emoji.demojize.
data['clean_tweet'] = data['tweet'].apply(emoji.demojize)


In [None]:
# Swap every http:// or https:// URL for the placeholder word 'link'.
url_pattern = re.compile(r'(https?://\S+)')
data['clean_tweet'] = data['clean_tweet'].apply(
    lambda text: url_pattern.sub('link', text)
)


Now, let's start the tokenization process.
This is a simple process: we'll split the text into tokens.
Those tokens are mostly words, but they can also be things like punctuation marks, numbers, etc.

We'll remove some of those unwanted tokens in the next step, the exhaustive cleaning.


In [None]:
# Split each cleaned tweet into a list of tokens with NLTK's word_tokenize.
data['tokens'] = data['clean_tweet'].map(word_tokenize)


This cleaning process is a bit more complex.
It will try to keep all the words it can, but it will remove unwanted tokens, such as punctuation marks, numbers, and other things that are not words.
Also, those words may contain unwanted characters, and we'll remove them as well.


In [None]:
# A token is kept when it *starts* with a word character or one of . ' : -
# (re.match only anchors at the beginning of the token).
keep_pattern = re.compile(r"[\.':\-\w]+")
# Characters deleted from the kept tokens: dot, apostrophe, colon and the
# zero-width space (U+200B).
strip_pattern = re.compile(r"[\.':\u200b]")


def _clean_tokens(tokens):
    """Filter ``tokens`` with ``keep_pattern`` and strip unwanted characters.

    A kept token may become an empty string after stripping; it is not
    removed here.
    """
    return [
        strip_pattern.sub("", token)
        for token in tokens
        if keep_pattern.match(token)
    ]


data['tokens'] = data['tokens'].apply(_clean_tokens)


Making the text lowercase is a good practice, as it will help us to avoid having the same word with different cases being treated as different words.


In [None]:
# Lowercase every token so that case variants collapse into one word.
data['tokens'] = data['tokens'].apply(lambda tokens: list(map(str.lower, tokens)))


Stopwords are words that are very common in the language and don't add much value to the text.
We'll remove them in this step. However, "not" and "no" are also stopwords, and since they are important for sentiment analysis, we'll keep them.


In [None]:
stop_words = set(stopwords.words('english'))
# Everything that must disappear from the token lists: the stopwords except
# "no" and "not" (which are kept), plus empty strings and the bare "-" and
# "_" tokens.
dropped_tokens = (stop_words - {"no", "not"}) | {"", "-", "_"}


def _drop_stopwords(tokens):
    """Return ``tokens`` without the entries listed in ``dropped_tokens``."""
    return [token for token in tokens if token not in dropped_tokens]


data['tokens'] = data['tokens'].apply(_drop_stopwords)


Now, we'll lemmatize the words.


In [None]:
lemmatizer = WordNetLemmatizer()
# Replace each token with the lemma returned by the WordNet lemmatizer
# (called with its default arguments).
data['tokens'] = data['tokens'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)


In [None]:
# Spellings that are all collapsed into the single token "coronavirus".
covid_aliases = ['coronavirus', 'covid', 'covid19', 'covid-19', 'corona', 'covid_19', 'covid__19']


def _unify_covid_terms(tokens):
    """Return ``tokens`` with every entry of ``covid_aliases`` replaced by "coronavirus"."""
    unified = []
    for token in tokens:
        if token in covid_aliases:
            unified.append("coronavirus")
        else:
            unified.append(token)
    return unified


data['tokens'] = data['tokens'].apply(_unify_covid_terms)

We'll also change the numbers to the word "number", as they don't add much information to the text and they increase the vocabulary size.

In [None]:
# Every run of digits inside a token is replaced by the word "number";
# the non-digit parts of the token are left in place.
digits_pattern = re.compile(r"\d+")


def _mask_numbers(tokens):
    """Return ``tokens`` with each digit run substituted by "number"."""
    return [digits_pattern.sub("number", token) for token in tokens]


data['tokens'] = data['tokens'].apply(_mask_numbers)

Then, we'll tag the parts of speech of the words.


In [None]:
def _tag_universal(tokens):
    """Tag ``tokens`` with ``nltk.pos_tag`` using the 'universal' tagset."""
    return nltk.pos_tag(tokens, tagset='universal')


data['pos'] = data['tokens'].apply(_tag_universal)


And finally, we would recognize the named entities in the text (this step is currently disabled: the cell below is commented out).


In [None]:
#data['ner'] = data['tweet'].apply(lambda sentence: nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence))))


In [None]:
#data.to_excel("data/cleaned_data.xlsx", index=False)

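Rebuild each tweet as a single string from its tokens (leaving out the uninformative 'coronavirus', 'number' and 'link' tokens) and save it to a new column in the dataframe.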


In [None]:
def _rebuild_text(tokens):
    """Join ``tokens`` with single spaces, skipping 'coronavirus', 'number' and 'link'."""
    kept = []
    for token in tokens:
        if token in ['coronavirus', 'number', 'link']:
            continue
        kept.append(token)
    return " ".join(kept)


# The new column holds one plain string per tweet (not a list of tokens).
data['cleanest_tweet'] = data['tokens'].apply(_rebuild_text)


Visualize a word cloud with the most common words in the tweets.
We'll remove the words like 'coronavirus', 'number' and 'link' from the word cloud, as they are not very informative.


In [None]:
from wordcloud import WordCloud

# 'cleanest_tweet' already holds one space-joined string per tweet, so the
# tweets are concatenated directly. Applying " ".join to each string again
# would split it into single characters and the cloud would show letters
# instead of words.
fake_text = " ".join(data.loc[data['label'] == 'fake', 'cleanest_tweet'])

plt.imshow(WordCloud().generate(fake_text), interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
from wordcloud import WordCloud

# 'cleanest_tweet' already holds one space-joined string per tweet, so the
# tweets are concatenated directly. Applying " ".join to each string again
# would split it into single characters and the cloud would show letters
# instead of words.
real_text = " ".join(data.loc[data['label'] == 'real', 'cleanest_tweet'])

plt.imshow(WordCloud().generate(real_text), interpolation="bilinear")
plt.axis("off")
plt.show()