In [311]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from textblob import Word
from autocorrect import Speller
import matplotlib.pyplot as plt

In [312]:
# Load the raw tweets and keep only the two columns used in this notebook.
df = pd.read_csv('../../data/Tweets.csv')
# .copy() makes `data` an independent frame instead of a selection of `df`,
# so the later `data['text'] = ...` assignments never depend on pandas'
# view-vs-copy rules (and cannot trigger SettingWithCopyWarning).
data = df[['text', 'airline_sentiment']].copy()

In [313]:
# preview the first rows of the two selected columns
data.head()

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [314]:
# column dtypes and non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   text               14640 non-null  object
 1   airline_sentiment  14640 non-null  object
dtypes: object(2)
memory usage: 228.9+ KB


### __Data Cleaning__
1. Missing Values
2. Data Types
3. Duplicates

In [315]:
# counting the missing entries in each column
data.isna().sum()

text                 0
airline_sentiment    0
dtype: int64

In [316]:
# summary of the data; both columns are object dtype, so describe() reports
# count / unique / top / freq instead of numeric statistics
data.describe()

Unnamed: 0,text,airline_sentiment
count,14640,14640
unique,14427,3
top,@united thanks,negative
freq,6,9178


In [317]:
# ensuring that the "text" and "airline_sentiment" columns each hold values
# of a single Python type
num_text_types, num_sentiment_types = (
    data[column].apply(type).nunique()
    for column in ('text', 'airline_sentiment')
)
print(f"n of datatypes in 'text': {num_text_types}")
print(f"n of datatypes in 'airline_sentiment': {num_sentiment_types}")

n of datatypes in 'text': 1
n of datatypes in 'airline_sentiment': 1


In [318]:
# counting the rows whose "text" value repeats an earlier row
duplicate_count = data.duplicated(subset=['text']).sum()
print(f"n of duplicate rows in 'text': {duplicate_count}")

n of duplicate rows in 'text': 213


In [319]:
# dropping rows whose "text" repeats an earlier row; only the first occurrence
# (and its sentiment label) is kept
data = data.drop_duplicates(subset=['text'], keep='first')

In [320]:
# checking the description of the data after dropping the duplicated rows;
# for "text", count and unique should now be equal
data.describe()

Unnamed: 0,text,airline_sentiment
count,14427,14427
unique,14427,3
top,@AmericanAir we have 8 ppl so we need 2 know h...,negative
freq,1,9080


### __Text Preprocessing__
- Lowercasing
- URL handling
- User mention handling
- Hashtag handling
- Punctuation & special character handling
- English contraction handling
- Stopword handling
- Text standardization (abbreviation & slang handling)
- Emoji/emoticon handling via regex
- Tokenization
- Lemmatization
- Exploratory visualization
    - Top 20 most frequent stems (bar chart)
    - Word cloud of stem frequencies

In [321]:
# converting all text to lowercase; the abbreviation and contraction
# dictionaries used later have lowercase keys and are matched exactly
data['text'] = data['text'].str.lower()
data.head()

Unnamed: 0,text,airline_sentiment
0,@virginamerica what @dhepburn said.,neutral
1,@virginamerica plus you've added commercials t...,positive
2,@virginamerica i didn't today... must mean i n...,neutral
3,@virginamerica it's really aggressive to blast...,negative
4,@virginamerica and it's a really big bad thing...,negative


In [322]:
# removing URLs: "http" plus everything up to the next whitespace
data['text'] = data['text'].str.replace(r'http\S+', '', regex=True)
data.head()

Unnamed: 0,text,airline_sentiment
0,@virginamerica what @dhepburn said.,neutral
1,@virginamerica plus you've added commercials t...,positive
2,@virginamerica i didn't today... must mean i n...,neutral
3,@virginamerica it's really aggressive to blast...,negative
4,@virginamerica and it's a really big bad thing...,negative


In [323]:
# removing user mentions: "@" plus everything up to the next whitespace
data['text'] = data['text'].str.replace(r'@\S+', '', regex=True)
data.head()

Unnamed: 0,text,airline_sentiment
0,what said.,neutral
1,plus you've added commercials to the experien...,positive
2,i didn't today... must mean i need to take an...,neutral
3,"it's really aggressive to blast obnoxious ""en...",negative
4,and it's a really big bad thing about it,negative


In [324]:
# removing hashtags: "#" plus everything up to the next whitespace
# (the tag word itself is dropped, not just the "#")
data['text'] = data['text'].str.replace(r'#\S+', '', regex=True)
data.head()

Unnamed: 0,text,airline_sentiment
0,what said.,neutral
1,plus you've added commercials to the experien...,positive
2,i didn't today... must mean i need to take an...,neutral
3,"it's really aggressive to blast obnoxious ""en...",negative
4,and it's a really big bad thing about it,negative


In [325]:
# changing abbreviations and slang to their standard forms
# NOTE(review): the expansions are taken as given (e.g. "lg" -> "likely good");
# confirm they match how these tokens are actually used in the tweets.
# NOTE(review): "las" is expanded regardless of the next token, so an existing
# "las vegas" becomes "las vegas vegas".
abbreviation_dict = {
    "bked": "booked",
    "thx": "thanks",
    "plz": "please",
    "sfo": "san francisco airport",
    "lax": "los angeles airport",
    "nyc": "new york city",
    "bos": "boston",
    "las": "las vegas",
    "dal": "dallas",
    "dca": "washington, d.c.",
    "lg": "likely good"
}

# Splits one whitespace-free token into
# (leading punctuation, core, trailing punctuation).
_EDGE_PUNCT_RE = re.compile(r"^(\W*)(.*?)(\W*)$")


def text_std(text, mapping=None):
    """Expand known abbreviations/slang in `text`.

    The text is split on whitespace and each token is looked up in `mapping`.
    Punctuation attached to the edges of a token is set aside for the lookup
    and re-attached afterwards, so "thx!" becomes "thanks!". Punctuation is
    only stripped in a later cell, so an exact-token lookup would miss these.

    Parameters
    ----------
    text : str
        Lowercased tweet text.
    mapping : dict, optional
        Token -> replacement table. Defaults to `abbreviation_dict`.

    Returns
    -------
    str
        The tokens, expanded where possible, joined by single spaces
        (runs of whitespace are collapsed and the ends are stripped).
    """
    if mapping is None:
        mapping = abbreviation_dict
    expanded = []
    for token in text.split():
        # An exact match wins, which also preserves the original behaviour.
        if token in mapping:
            expanded.append(mapping[token])
            continue
        lead, core, trail = _EDGE_PUNCT_RE.match(token).groups()
        expanded.append(lead + mapping.get(core, core) + trail)
    return " ".join(expanded)

# expand abbreviations in every tweet, then preview the result
data['text'] = data['text'].map(text_std)
data.head()

Unnamed: 0,text,airline_sentiment
0,what said.,neutral
1,plus you've added commercials to the experienc...,positive
2,i didn't today... must mean i need to take ano...,neutral
3,"it's really aggressive to blast obnoxious ""ent...",negative
4,and it's a really big bad thing about it,negative


In [326]:
# handling english contractions
english_contractions_dict = {
    "ain't": "am not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have",
    "could've": "could have", "couldn't": "could not", "couldn't've": "could not have",
    "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'll": "he will",
    "he's": "he is", "how'd": "how did", "how'll": "how will", "how's": "how is",
    "i'd": "i would", "i'll": "i will", "i'm": "i am", "i've": "i have", "isn't": "is not",
    "it'd": "it would", "it'll": "it will", "it's": "it is", "let's": "let us",
    "ma'am": "madam", "might've": "might have", "mightn't": "might not", "must've": "must have",
    "mustn't": "must not", "needn't": "need not", "shan't": "shall not", "she'd": "she would",
    "she'll": "she will", "she's": "she is", "should've": "should have", "shouldn't": "should not",
    "that'd": "that would", "that's": "that is", "there's": "there is", "they'd": "they would",
    "they'll": "they will", "they're": "they are", "they've": "they have", "wasn't": "was not",
    "we'd": "we would", "we're": "we are", "we've": "we have", "weren't": "were not",
    "what'll": "what will", "what're": "what are", "what's": "what is", "what've": "what have",
    "where's": "where is", "who's": "who is", "who've": "who have", "won't": "will not",
    "would've": "would have", "wouldn't": "would not", "you'd": "you would", "you'll": "you will",
    "you're": "you are", "you've": "you have"
}

# Splits one whitespace-free token into
# (leading punctuation, core, trailing punctuation).
_EDGE_PUNCT_RE = re.compile(r"^(\W*)(.*?)(\W*)$")


# NOTE: this reuses the name `text_std` that the abbreviation cell earlier in
# the notebook also defines; once this cell has run, `text_std` refers to the
# contraction expander below.
def text_std(text, mapping=None):
    """Expand English contractions in `text`.

    The text is split on whitespace and each token is looked up in `mapping`.
    Punctuation attached to the edges of a token is set aside for the lookup
    and re-attached afterwards, so "don't." becomes "do not."; otherwise the
    later punctuation-removal cell would turn it into "dont". A typographic
    apostrophe (U+2019) inside a token is treated as "'" for the lookup.

    Parameters
    ----------
    text : str
        Lowercased tweet text.
    mapping : dict, optional
        Contraction -> expansion table. Defaults to
        `english_contractions_dict`.

    Returns
    -------
    str
        The tokens, expanded where possible, joined by single spaces
        (runs of whitespace are collapsed and the ends are stripped).
    """
    if mapping is None:
        mapping = english_contractions_dict
    expanded = []
    for token in text.split():
        # An exact match wins, which also preserves the original behaviour.
        if token in mapping:
            expanded.append(mapping[token])
            continue
        lead, core, trail = _EDGE_PUNCT_RE.match(token).groups()
        lookup_key = core.replace("\u2019", "'")
        expanded.append(lead + mapping.get(lookup_key, core) + trail)
    return " ".join(expanded)

# expand contractions in every tweet, then preview the result
data['text'] = data['text'].map(text_std)
data.head()

Unnamed: 0,text,airline_sentiment
0,what said.,neutral
1,plus you have added commercials to the experie...,positive
2,i did not today... must mean i need to take an...,neutral
3,"it is really aggressive to blast obnoxious ""en...",negative
4,and it is a really big bad thing about it,negative


In [327]:
# stripping every character that is neither a word character nor whitespace
data['text'] = data['text'].str.replace(r'[^\w\s]', '', regex=True)
data.head()

Unnamed: 0,text,airline_sentiment
0,what said,neutral
1,plus you have added commercials to the experie...,positive
2,i did not today must mean i need to take anoth...,neutral
3,it is really aggressive to blast obnoxious ent...,negative
4,and it is a really big bad thing about it,negative


In [328]:
# removing English stopwords
english_stopwords = stopwords.words("english")
# Set copy for fast membership tests; the list keeps its original name and type.
_english_stopword_set = frozenset(english_stopwords)


def _remove_stopwords(text):
    """Return `text` without the whitespace-separated tokens that are stopwords.

    Tokens are compared in lowercase and the survivors are joined by single
    spaces.
    """
    # NOTE(review): this list also drops negations; e.g. "i did not today"
    # becomes "today". Consider keeping "not"/"no" for sentiment analysis.
    return " ".join(
        token for token in text.split()
        if token.lower() not in _english_stopword_set
    )


data['text'] = data['text'].apply(_remove_stopwords)
data.head()

Unnamed: 0,text,airline_sentiment
0,said,neutral
1,plus added commercials experience tacky,positive
2,today must mean need take another trip,neutral
3,really aggressive blast obnoxious entertainmen...,negative
4,really big bad thing,negative


In [329]:
# tokenizing the text: each tweet string becomes a list of tokens
# NOTE(review): "text" holds lists after this cell, so re-running the cell
# would pass lists to word_tokenize; run it once per fresh kernel
data['text'] = data['text'].map(word_tokenize)
data.head()

Unnamed: 0,text,airline_sentiment
0,[said],neutral
1,"[plus, added, commercials, experience, tacky]",positive
2,"[today, must, mean, need, take, another, trip]",neutral
3,"[really, aggressive, blast, obnoxious, enterta...",negative
4,"[really, big, bad, thing]",negative
