In [107]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from textblob import Word
from autocorrect import Speller
import matplotlib.pyplot as plt

In [108]:
# Load the raw tweets and keep only the tweet text and its sentiment label.
df = pd.read_csv('../../data/Tweets.csv')
# .copy() makes `data` an independent frame rather than a selection of `df`,
# so the `data['text'] = ...` assignments in the preprocessing cells do not
# run into pandas' SettingWithCopyWarning.
data = df[['text','airline_sentiment']].copy()

In [109]:
# previewing the first rows of the selected columns
data.head()

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [110]:
# inspecting column dtypes, non-null counts and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   text               14640 non-null  object
 1   airline_sentiment  14640 non-null  object
dtypes: object(2)
memory usage: 228.9+ KB


### __Data Cleaning__
1. Missing Values
2. Data Types
3. Duplicates

In [111]:
# counting the missing values in each column
data.isnull().sum()

text                 0
airline_sentiment    0
dtype: int64

In [112]:
# summarising both columns: row count, number of distinct values,
# most frequent value and how often it occurs
data.describe()

Unnamed: 0,text,airline_sentiment
count,14640,14640
unique,14427,3
top,@united thanks,negative
freq,6,9178


In [113]:
# ensuring that the "text" and "airline_sentiment" columns each hold values of a single Python type
num_text_types, num_sentiment_types = (
    data[column].apply(type).nunique()
    for column in ('text', 'airline_sentiment')
)
print(f"n of datatypes in 'text': {num_text_types}")
print(f"n of datatypes in 'airline_sentiment': {num_sentiment_types}")

n of datatypes in 'text': 1
n of datatypes in 'airline_sentiment': 1


In [114]:
# counting the rows whose text repeats an earlier row in the "text" column
repeated_text = data['text'].duplicated()
duplicate_count = repeated_text.sum()
print(f"n of duplicate rows in 'text': {duplicate_count}")

n of duplicate rows in 'text': 213


In [115]:
# keeping only the first occurrence of each tweet in the "text" column
data = data.drop_duplicates(subset='text', keep='first')

In [116]:
# re-checking the summary after dropping the duplicated rows:
# "count" and "unique" for the text column should now be equal
data.describe()

Unnamed: 0,text,airline_sentiment
count,14427,14427
unique,14427,3
top,@AmericanAir we have 8 ppl so we need 2 know h...,negative
freq,1,9080


### __Text Preprocessing__
1. Lowercasing
2. Text standardization (abbreviations & slang handling)
3. Punctuation & Special Characters Removal
4. URLs Removal
5. User Mentions Handling
6. Hashtags Handling
7. Emoji/emoticon removal via regex
8. Tokenization
9. Stop Words Removal
10. Lemmatization

Exploratory visualization:
- Top 20 most frequent stems (bar chart)
- Word cloud of stem frequencies

In [117]:
# converting all text to lowercase so later matching (e.g. the abbreviation
# lookup) does not depend on the original casing
data['text'] = data['text'].str.lower()

data.head()

Unnamed: 0,text,airline_sentiment
0,@virginamerica what @dhepburn said.,neutral
1,@virginamerica plus you've added commercials t...,positive
2,@virginamerica i didn't today... must mean i n...,neutral
3,@virginamerica it's really aggressive to blast...,negative
4,@virginamerica and it's a really big bad thing...,negative


In [118]:
# removing URLs: drop every run of non-whitespace characters that starts with "http"
URL_PATTERN = re.compile(r'http\S+')
data['text'] = data['text'].str.replace(URL_PATTERN, '', regex=True)
data.head()

Unnamed: 0,text,airline_sentiment
0,@virginamerica what @dhepburn said.,neutral
1,@virginamerica plus you've added commercials t...,positive
2,@virginamerica i didn't today... must mean i n...,neutral
3,@virginamerica it's really aggressive to blast...,negative
4,@virginamerica and it's a really big bad thing...,negative


In [119]:
# removing user mentions: drop "@" together with the handle that follows it
MENTION_PATTERN = re.compile(r'@\S+')
data['text'] = data['text'].str.replace(MENTION_PATTERN, '', regex=True)
data.head()

Unnamed: 0,text,airline_sentiment
0,what said.,neutral
1,plus you've added commercials to the experien...,positive
2,i didn't today... must mean i need to take an...,neutral
3,"it's really aggressive to blast obnoxious ""en...",negative
4,and it's a really big bad thing about it,negative


In [120]:
# removing hashtags: drop "#" together with the tag text that follows it
HASHTAG_PATTERN = re.compile(r'#\S+')
data['text'] = data['text'].str.replace(HASHTAG_PATTERN, '', regex=True)
data.head()

Unnamed: 0,text,airline_sentiment
0,what said.,neutral
1,plus you've added commercials to the experien...,positive
2,i didn't today... must mean i need to take an...,neutral
3,"it's really aggressive to blast obnoxious ""en...",negative
4,and it's a really big bad thing about it,negative


In [121]:
# removing punctuation and special characters: keep only word characters and whitespace
NON_WORD_PATTERN = re.compile(r'[^\w\s]')
data['text'] = data['text'].str.replace(NON_WORD_PATTERN, '', regex=True)
data.head()

Unnamed: 0,text,airline_sentiment
0,what said,neutral
1,plus youve added commercials to the experienc...,positive
2,i didnt today must mean i need to take anothe...,neutral
3,its really aggressive to blast obnoxious ente...,negative
4,and its a really big bad thing about it,negative


In [122]:
# changing abbreviations and slang to their standard forms
# Keys are matched against lowercased, punctuation-free tokens; replacement
# values are kept lowercase and punctuation-free as well, so they do not
# reintroduce characters that the punctuation-removal step already stripped.
abbreviation_dict = {
    "bked": "booked",
    "thx": "thanks",
    "plz": "please",
    "sfo": "san francisco airport",
    "lax": "los angeles airport",
    "nyc": "new york city",
    "bos": "boston",
    "las": "las vegas",
    "dal": "dallas",
    "dca": "washington dc",
    "lg": "likely good"  # NOTE(review): unusual expansion; confirm the intended meaning
}

def text_std(text):
    """Expand known abbreviations and slang in ``text``.

    The text is split on whitespace; each token has every character that is
    neither a word character nor whitespace removed, and is then replaced by
    its entry in ``abbreviation_dict`` when its lowercase form is a key.
    Tokens that are empty after stripping (e.g. a standalone "--") are
    dropped so the result never contains doubled spaces.

    Parameters
    ----------
    text : str
        The text to standardize.

    Returns
    -------
    str
        The processed tokens joined by single spaces ("" for empty input).
    """
    new_words = []
    for word in text.split():
        word = re.sub(r'[^\w\s]', '', word)
        if not word:
            # token consisted only of punctuation; nothing left to keep
            continue
        new_words.append(abbreviation_dict.get(word.lower(), word))
    return " ".join(new_words)

# applying the abbreviation/slang expansion to every tweet
data['text'] = data['text'].apply(text_std)

In [123]:
# tokenizing the text: each tweet string is replaced by a list of its tokens
# NOTE(review): word_tokenize relies on NLTK tokenizer data being available;
# no nltk.download(...) call is visible in this part of the notebook — confirm.
# NOTE(review): after this cell "text" holds lists rather than strings, so
# re-running the cell on its own will presumably fail — verify before re-executing.
data['text'] = data['text'].apply(word_tokenize)
data.head()

Unnamed: 0,text,airline_sentiment
0,"[what, said]",neutral
1,"[plus, youve, added, commercials, to, the, exp...",positive
2,"[i, didnt, today, must, mean, i, need, to, tak...",neutral
3,"[its, really, aggressive, to, blast, obnoxious...",negative
4,"[and, its, a, really, big, bad, thing, about, it]",negative
