# Pre-processing

In [35]:
import re
import pandas as pd

# Relative path (resolved against the current working directory) of the raw
# airline-sentiment CSV that the notebook loads.
file_path = './Airline-Sentiment-2-w-AA.csv'

In [36]:
# Read the raw CSV into a DataFrame, decoding the file as ISO-8859-2.
all_data = pd.read_csv(filepath_or_buffer=file_path, encoding='iso-8859-2')
# Display the last two rows as this cell's output.
all_data.tail(2)

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,airline_sentiment,airline_sentiment:confidence,negativereason,negativereason:confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
14638,681679797,False,finalized,3,2/25/15 18:59,negative,1.0,Customer Service Issue,0.6659,American,,SraJackson,,0,"@AmericanAir you have my money, you change my ...",,2/22/15 11:59,5.69587e+17,New Jersey,Eastern Time (US & Canada)
14639,681679798,False,finalized,3,2/25/15 19:06,neutral,0.6771,,0.0,American,,daviddtwu,,0,@AmericanAir we have 8 ppl so we need 2 know h...,,2/22/15 11:58,5.69587e+17,"dallas, TX",


需要注意的是本地使用 `vscode` 打开 `Airline-Sentiment-2-w-AA.csv` 发现总行数为 14874 行，而 `pd.read_csv` 得到的表只有 14640 行。仔细查看文件本身内容发现其中应该还包含一些非数据的内容，`pd.read_csv` 应该是自动地过滤掉了，而所得的 14640 行数据也与论文中对于数据集的描述一致：
> Our data is available online. It has **14640 valid tweets** from 2/17/2015 to 2/24/2015 related to reviews of major U.S. airlines, containing sentiment label, negative reason label, tweets content and other meta information like location, user ID etc. The data fraction is roughly 15% positive, 65% negative, and 20% neutral.

## Tweet-level

In [37]:
def handle_emojis(tweet):
    """Replace ASCII emoticons in ``tweet`` with sentiment placeholder tokens.

    Positive emoticons (smile, laugh, wink) become ``' EMO-POS '`` and
    negative ones (sad, cry) become ``' EMO-NEG '``. The substitutions are
    applied one after another in the order listed below, each on the result
    of the previous one.

    Args:
        tweet: The tweet text (a ``str``).

    Returns:
        The text with every matched emoticon replaced by its placeholder.
    """
    # (pattern, placeholder) pairs; order matters because each substitution
    # operates on the output of the one before it.
    substitutions = (
        # Smile -- :), : ), :-), (:
        (r'(:\s?\)|:-\)|\(:)', ' EMO-POS '),
        # Laugh -- :D, :-D, xD, XD (":D", "xD" and "XD" need leading whitespace)
        (r'(\s:D|:-D|\sxD|\sXD)', ' EMO-POS '),
        # Wink -- ;-), ;)
        (r'(;-?\))', ' EMO-POS '),
        # Sad -- :(, : (, :-(
        (r'(:\(|\s:\s\(|:-\()', ' EMO-NEG '),
        # Cry -- :'(
        (r'(:\'\()', ' EMO-NEG '),
    )
    for pattern, placeholder in substitutions:
        tweet = re.sub(pattern, placeholder, tweet)
    return tweet

In [38]:
def preprocess_tweet(tweet):
    """Normalise a raw tweet into a cleaned, single-line string.

    Steps, in order: emoticons -> placeholders (via ``handle_emojis``),
    lower-casing, URL and @mention replacement, hashtag unwrapping, removal
    of the standalone word "rt", collapsing runs of dots, stripping
    characters outside an allowed ASCII set, trimming, and whitespace
    collapsing.

    NOTE(review): lower-casing happens after ``handle_emojis``, so the
    emoticon placeholders come out lower-cased (``emo-pos`` / ``emo-neg``),
    whereas ``URL`` and ``USER-MENTION`` are inserted afterwards and stay
    upper-case. Left as-is to keep the output unchanged; confirm whether
    this asymmetry is intended.

    Args:
        tweet: The raw tweet text (a ``str``).

    Returns:
        The processed tweet text.
    """
    # Replace emojis with either EMO-POS or EMO-NEG
    tweet = handle_emojis(tweet)
    # Convert to lower case
    tweet = tweet.lower()
    # Replace URLs with the word URL
    tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' URL ', tweet)
    # Replace @handle with the word USER-MENTION
    tweet = re.sub(r'@[\S]+', 'USER-MENTION', tweet)
    # Replace #hashtag with hashtag
    tweet = re.sub(r'#(\S+)', r' \1 ', tweet)
    # Remove RT (retweet); the text is already lower-case, hence "rt"
    tweet = re.sub(r'\brt\b', '', tweet)
    # Replace 2+ dots with space
    tweet = re.sub(r'\.{2,}', ' ', tweet)

    # Anything outside this whitelist (letters, digits, common punctuation,
    # space) is treated as noise; note that "_" is not whitelisted.
    messy_code_regex = r'[^a-zA-Z0-9\~\`\!\@\#\$\%\^\&\*\(\)\-\—\+\=\{\}\[\]\:\;\"\'\<\>\,\.\?\/\ ]+'
    # Replace messy code and _ with a single space
    tweet = re.sub(messy_code_regex, ' ', tweet)

    # Strip space, " and ' from tweet
    tweet = tweet.strip(' "\'')
    # Replace multiple spaces with a single space
    tweet = re.sub(r'\s+', ' ', tweet)

    return tweet

## Word-level

## Generate processed data file

In [39]:
# Keep only the tweet id, the sentiment label and the tweet text.
# .copy() makes the subset an independent frame, so assigning to one of its
# columns later does not trigger pandas' SettingWithCopyWarning.
all_data = all_data[['_unit_id', 'airline_sentiment', 'text']].copy()

In [41]:
# Run every tweet through preprocess_tweet. assign() returns a new frame with
# the 'text' column replaced instead of writing into a frame obtained by
# column selection (the pattern that raises SettingWithCopyWarning).
# NOTE: re-running this cell feeds already-processed text through
# preprocess_tweet again; restart and run all cells for a clean result.
all_data = all_data.assign(text=all_data['text'].apply(preprocess_tweet))

In [43]:
# Write the processed frame to CSV in the working directory. The row index is
# written too (pandas' default, spelled out here), as the first column.
all_data.to_csv(path_or_buf='processed_data.csv', index=True)