### __Importing Libraries__

In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from textblob import Word
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO
from autocorrect import Speller
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
import seaborn as sns
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

### __Data Loading__

In [None]:
# loading the raw tweets and keeping only the tweet text and its sentiment label
df = pd.read_csv('../../data/Tweets.csv')
# .copy() makes `data` an independent frame rather than a column slice of `df`,
# so the `data['text'] = ...` assignments in later cells are unambiguous and
# cannot be flagged by pandas as writes to a copy of a slice.
data = df[['text','airline_sentiment']].copy()

In [None]:
# previewing the first 20 raw tweets
data["text"].head(20)

In [None]:
# printing a concise summary of the selected columns
data.info()

### __Data Cleaning__
1. Missing Values
2. Data Types
3. Duplicates

In [None]:
# counting the missing entries in every column
data.isna().sum()

In [None]:
# checking the descriptive statistics of the data (before any rows are dropped)
data.describe()

In [None]:
# ensuring that the "text" and "airline_sentiment" columns each hold values of a single Python type
def _count_value_types(column_name):
    """Return how many distinct Python types occur in the given column of `data`."""
    value_types = data[column_name].apply(type)
    return value_types.nunique()


num_text_types = _count_value_types('text')
num_sentiment_types = _count_value_types('airline_sentiment')
print(f"n of datatypes in 'text': {num_text_types}")
print(f"n of datatypes in 'airline_sentiment': {num_sentiment_types}")

In [None]:
# counting the rows whose "text" value already appeared in an earlier row
is_repeated_text = data['text'].duplicated()
duplicate_count = is_repeated_text.sum()
print(f"n of duplicate rows in 'text': {duplicate_count}")

In [None]:
# dropping the rows that are duplicated in the "text" column
# (rows are compared on "text" only; `data` is rebound to the result)
data = data.drop_duplicates(subset=['text'])

In [None]:
# checking the descriptive statistics again, now that the duplicated rows are dropped
data.describe()

### __Text Preprocessing__
1. Lowercasing
2. URLs Handling
3. User Mentions Handling
4. English Abbreviations & Slang Handling
5. English Contractions Handling
6. Emoji/Emoticon Handling
7. Punctuation & Special Characters Handling
8. Stopwords Handling
9. Spell Checking
10. Lemmatization
11. Tokenization

In [None]:
# converting all text to lowercase
# (the lookup dictionaries used in the following cells have lowercase keys)
data['text'] = data['text'].str.lower()
data["text"].head(20)

In [None]:
# removing URLs: anything from "http" up to the next whitespace character
url_pattern = re.compile(r'http\S+')
data['text'] = data['text'].apply(lambda tweet: url_pattern.sub('', tweet))
data["text"].head(20)

In [None]:
# removing user mentions: an "@" together with everything up to the next whitespace character
mention_pattern = re.compile(r'@\S+')
data['text'] = data['text'].apply(lambda tweet: mention_pattern.sub('', tweet))
data["text"].head(20)

In [None]:
# changing abbreviations and slang to their standard forms
abbreviation_dict = {
    "u": "you",
    "bked": "booked",
    "thx": "thanks",
    "plz": "please",
    "sfo": "san francisco airport",
    "lax": "los angeles airport",
    "nyc": "new york city",
    "bos": "boston",
    "las": "las vegas",
    "dal": "dallas",
    "dca": "washington, d.c.",
    "lg": "likely good"
}

# Splits a whitespace-delimited token into
# (leading punctuation, core, trailing punctuation).
_ABBREVIATION_TOKEN = re.compile(r"(\W*)(.*?)(\W*)", re.DOTALL)


def text_std(text):
    """Expand the abbreviations listed in ``abbreviation_dict``.

    The text is split on whitespace. Punctuation glued to either end of a
    token is set aside for the lookup and re-attached afterwards, because
    punctuation is only stripped in a later preprocessing step; without this,
    tokens such as "thx!" or "plz," would never be expanded.

    An abbreviation is left alone when the following tokens already spell out
    its expansion (e.g. "las vegas" stays "las vegas" instead of becoming
    "las vegas vegas").

    NOTE: a later cell rebinds the name ``text_std`` for contraction handling,
    so this cell must run before that one.

    Args:
        text: Lowercased tweet text.

    Returns:
        The text with known abbreviations expanded and tokens joined by single
        spaces.
    """
    parts = [_ABBREVIATION_TOKEN.fullmatch(word).groups() for word in text.split()]
    new_words = []
    for index, (lead, core, trail) in enumerate(parts):
        expansion = abbreviation_dict.get(core)
        if expansion is not None:
            expanded = expansion.split()
            upcoming = [part[1] for part in parts[index:index + len(expanded)]]
            if upcoming == expanded:
                # The tweet already contains the spelled-out form.
                expansion = None
        if expansion is None:
            new_words.append(lead + core + trail)
        else:
            new_words.append(lead + expansion + trail)
    return " ".join(new_words)

# applying the abbreviation expansion to every tweet and previewing the result
data['text'] = data['text'].apply(text_std)
data["text"].head(20)

In [None]:
# handling english contractions
english_contractions_dict = {
    "ain't": "am not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have",
    "could've": "could have", "couldn't": "could not", "couldn't've": "could not have",
    "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'll": "he will",
    "he's": "he is", "how'd": "how did", "how'll": "how will", "how's": "how is",
    "i'd": "i would", "i'll": "i will", "i'm": "i am", "i've": "i have", "isn't": "is not",
    "it'd": "it would", "it'll": "it will", "it's": "it is", "let's": "let us",
    "ma'am": "madam", "might've": "might have", "mightn't": "might not", "must've": "must have",
    "mustn't": "must not", "needn't": "need not", "shan't": "shall not", "she'd": "she would",
    "she'll": "she will", "she's": "she is", "should've": "should have", "shouldn't": "should not",
    "that'd": "that would", "that's": "that is", "there's": "there is", "they'd": "they would",
    "they'll": "they will", "they're": "they are", "they've": "they have", "wasn't": "was not",
    "we'd": "we would", "we're": "we are", "we've": "we have", "weren't": "were not",
    "what'll": "what will", "what're": "what are", "what's": "what is", "what've": "what have",
    "where's": "where is", "who's": "who is", "who've": "who have", "won't": "will not",
    "would've": "would have", "wouldn't": "would not", "you'd": "you would", "you'll": "you will",
    "you're": "you are", "you've": "you have"
}

# Splits a whitespace-delimited token into
# (leading punctuation, core, trailing punctuation).
_CONTRACTION_TOKEN = re.compile(r"(\W*)(.*?)(\W*)", re.DOTALL)


def text_std(text):
    """Expand the contractions listed in ``english_contractions_dict``.

    The text is split on whitespace. Punctuation glued to either end of a
    token is set aside for the lookup and re-attached afterwards, because
    punctuation is only stripped in a later preprocessing step; without this,
    tokens such as "won't." would never be expanded. The typographic
    apostrophe (U+2019) is treated like the ASCII apostrophe for the lookup.
    Tokens that are not known contractions are returned unchanged.

    NOTE: this rebinds the name ``text_std`` that the abbreviation cell also
    defines, so the cells must be run in notebook order.

    Args:
        text: Lowercased tweet text.

    Returns:
        The text with known contractions expanded and tokens joined by single
        spaces.
    """
    new_words = []
    for word in text.split():
        lead, core, trail = _CONTRACTION_TOKEN.fullmatch(word).groups()
        expansion = english_contractions_dict.get(core.replace("\u2019", "'"))
        if expansion is None:
            new_words.append(word)
        else:
            new_words.append(lead + expansion + trail)
    return " ".join(new_words)

# applying the contraction expansion to every tweet and previewing the result
data['text'] = data['text'].apply(text_std)
data["text"].head(20)

In [None]:
# dealing with emojis
def convert_emojis(text):
    """Replace emojis and emoticons in ``text`` with their textual labels.

    Every key of ``UNICODE_EMOJI`` (like 😊) and ``EMOTICONS_EMO`` (like :‑))
    found in ``text`` is swapped for its mapped label, with ':' and ','
    removed, '_' turned into a space, lowercased and padded with one space on
    each side.

    Only the label is lowercased; the surrounding text is returned as it was
    passed in.

    An emoticon whose first or last character is a letter or digit is only
    replaced when that edge is not glued to another letter or digit. This
    stops plain substrings of ordinary words or numbers (a letter-initial
    emoticon at the end of a word, a digit-final one inside a time such as
    "5:30") from being mistaken for emoticons.

    Args:
        text: The tweet text to process.

    Returns:
        The text with emojis and emoticons spelled out.
    """
    def describe(label):
        # Turn a raw label such as ":smiling_face:" into " smiling face ".
        cleaned = label.replace(":", "").replace(",", "").replace("_", " ")
        return " " + cleaned.lower() + " "

    for emoji_char, label in UNICODE_EMOJI.items():
        if emoji_char in text:
            text = text.replace(emoji_char, describe(label))

    for emoticon, label in EMOTICONS_EMO.items():
        # Cheap substring test first; most emoticons are absent from a tweet.
        if emoticon not in text:
            continue
        pattern = re.escape(emoticon)
        if emoticon[0].isalnum():
            pattern = r"(?<![^\W_])" + pattern
        if emoticon[-1].isalnum():
            pattern = pattern + r"(?![^\W_])"
        # A callable replacement keeps the label literal (no backslash escapes).
        text = re.sub(pattern, lambda _match, label_text=describe(label): label_text, text)
    return text

# converting the emojis and emoticons of every tweet and previewing the result
data['text'] = data['text'].apply(convert_emojis)
data["text"].head(20)

In [None]:
# removing punctuation and special characters: every character that is neither a word character nor whitespace
non_word_pattern = re.compile(r'[^\w\s]')
data['text'] = data['text'].apply(lambda tweet: non_word_pattern.sub('', tweet))
data["text"].head(20)

In [None]:
# removing stopwords
# NOTE(review): NLTK's English list is expected to contain negations such as
# "not" and "no" — confirm that dropping them is intended for sentiment analysis.
english_stopwords = stopwords.words("english")
# A set gives constant-time membership tests; the list above is kept under its
# original name for any other cell that reads it.
stopword_set = set(english_stopwords)
data['text'] = data['text'].apply(
    lambda tweet: " ".join(token for token in tweet.split() if token.lower() not in stopword_set)
)
data["text"].head(20)

In [None]:
# spell checking: each distinct word is corrected once and the result is reused for every tweet
spell = Speller(lang='en')

all_text = ' '.join(data['text'].astype(str))
unique_words = set(all_text.lower().split())

# building the lookup of every unique word -> its corrected spelling
word_corrections = {}
for vocab_word in unique_words:
    word_corrections[vocab_word] = spell(vocab_word)

def correct_sentence_with_map(sentence, corrections_map):
    """Return ``sentence`` with each word replaced by its entry in ``corrections_map``.

    Words are looked up by their lowercase form; a word without an entry is
    kept exactly as written. The result is joined with single spaces.
    """
    def fix(token):
        return corrections_map.get(token.lower(), token)

    return ' '.join(map(fix, sentence.split()))

# applying the precomputed corrections to every tweet and previewing the result
data['text'] = data['text'].astype(str).apply(lambda x: correct_sentence_with_map(x, word_corrections))
data["text"].head(20)

In [None]:
# lemmatizing the text
def _lemmatize_tweet(tweet):
    """Lemmatize every whitespace-separated word of a tweet and rejoin with spaces."""
    lemmas = []
    for token in tweet.split():
        lemmas.append(Word(token).lemmatize())
    return " ".join(lemmas)


data['text'] = data['text'].apply(_lemmatize_tweet)
data["text"].head(20)

In [None]:
# tokenizing the text: from here on each "text" entry is the output of word_tokenize
# rather than a plain string
data['text'] = data['text'].apply(word_tokenize)
data["text"].head(20)

### __Exploratory Visualization__
1. Top 20 most frequent words (bar chart)
2. Word cloud of word frequencies

In [None]:
# getting Top 20 most frequent words (bar chart)
def _alphabetic_tokens(token_lists):
    """Yield the lowercased, purely alphabetic string tokens of every list entry."""
    for tokens in token_lists:
        if not isinstance(tokens, list):
            continue
        for token in tokens:
            if not isinstance(token, str):
                continue
            lowered = token.lower()
            if lowered.isalpha():
                yield lowered


all_words = list(_alphabetic_tokens(data['text']))

# counting the words and keeping the 20 most common (word, count) pairs
word_counts = Counter(all_words)
top_20_words = word_counts.most_common(20)

# splitting the pairs into one tuple of words and one tuple of counts
words_for_plot, frequencies = zip(*top_20_words)

plt.figure(figsize=(14, 7))
barplot_options = {
    "x": list(words_for_plot),
    "y": list(frequencies),
    "hue": list(words_for_plot),
    "palette": "viridis",
    "legend": False,
}
sns.barplot(**barplot_options)
plt.xlabel("Words")
plt.ylabel("Frequency")
plt.title("Top 20 Most Frequent words")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

# listing the same counts as text below the chart
for top_word, top_count in top_20_words:
    print(f"{top_word}: {top_count}")

In [None]:
# displaying the word cloud built from the alphabetic words collected for the bar chart
all_words_str = ' '.join(all_words)

cloud_settings = {"width": 800, "height": 400, "background_color": 'black'}
wordcloud = WordCloud(**cloud_settings).generate(all_words_str)

# rendering the cloud as an image without axes
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

### __Saving Cleaned Dataset__

In [None]:
# saving the dataset to csv file after being cleaned and preprocessed
# NOTE(review): "text" holds the word_tokenize output at this point, so each entry
# is presumably written as its string representation — confirm that downstream
# readers parse it back into tokens.
data.to_csv("../../data/clean_Tweets.csv", index=False)