## 1. Data Cleaning

In [88]:
# initialisation
import pandas as pd

# Working directories, relative to the notebook and kept with a trailing
# separator so file names can be appended directly. Forward slashes are used
# because they are accepted on Windows as well as on Linux/macOS, whereas a
# hard-coded backslash only works on Windows.
data_wd = 'Datasets/'
dict_wd = 'Dictionary/'
out_wd = 'Output/'

In [89]:
# load the raw tweets file from the dataset folder
tweets_csv = data_wd + 'Tweets.csv'
raw_data = pd.read_csv(tweets_csv)

In [90]:
# retrieve the relevant columns
# Select the two columns first and copy afterwards: only the data that is
# actually kept gets duplicated, and `data` is independent of `raw_data`.
data = raw_data[['airline_sentiment', 'text']].copy()
print(data.shape)
data.head()

(14640, 2)


Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [91]:
# Drop rows whose tweet text duplicates another row, then renumber the rows.
# reset_index(drop=True) discards the old index directly, so no helper
# 'index' column has to be created and removed again in place.
data = data.drop_duplicates(subset='text').reset_index(drop=True)
print(data.shape)
data.head()

(14427, 2)


Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [92]:
# remove all the tags in the data
def removeTags(intext):
    """Remove every space-separated chunk of ``intext`` that contains '@'.

    The input is converted with ``str()`` and split on single spaces, so runs
    of spaces between the remaining chunks are preserved. Any chunk containing
    an '@' anywhere (mentions, but also e.g. e-mail addresses) is dropped.

    Returns the remaining chunks joined by single spaces.
    """
    kept = [chunk for chunk in str(intext).split(' ') if '@' not in chunk]
    return ' '.join(kept)

# strip the @-chunks from every tweet
data['text'] = data['text'].apply(removeTags)

In [93]:
# remove all the punctuation
import re
import string as sr

def removeSigns(intext):
    """Strip every ASCII punctuation character from ``intext``.

    Parameters
    ----------
    intext : str
        Text to clean.

    Returns
    -------
    str
        ``intext`` without any character of ``string.punctuation``; letters,
        digits and whitespace are left untouched.
    """
    # re.escape is required here: interpolated raw, the "\]" inside
    # string.punctuation is parsed as an escaped "]" within the character
    # class, so the backslash itself was never part of the set and survived.
    pattern = "[%s]+" % re.escape(sr.punctuation)
    return re.sub(pattern, "", intext)

In [94]:
# number of whitespace-separated words per tweet once punctuation is stripped
data['word_count'] = data['text'].apply(
    lambda tweet: len(removeSigns(tweet).split())
)

In [95]:
# preview the first rows now that the word_count column exists
data.head()

Unnamed: 0,airline_sentiment,text,word_count
0,neutral,What said.,2
1,positive,plus you've added commercials to the experienc...,8
2,neutral,I didn't today... Must mean I need to take ano...,11
3,negative,"it's really aggressive to blast obnoxious ""ent...",16
4,negative,and it's a really big bad thing about it,9


In [96]:
# Lower-case every tweet with the vectorised string accessor instead of a
# per-element Python lambda.
data['text'] = data['text'].str.lower()

In [97]:
# display the frame after the text column has been lower-cased
data

Unnamed: 0,airline_sentiment,text,word_count
0,neutral,what said.,2
1,positive,plus you've added commercials to the experienc...,8
2,neutral,i didn't today... must mean i need to take ano...,11
3,negative,"it's really aggressive to blast obnoxious ""ent...",16
4,negative,and it's a really big bad thing about it,9
...,...,...,...
14422,positive,thank you we got on a different flight to chic...,10
14423,negative,leaving over 20 minutes late flight. no warnin...,22
14424,neutral,please bring american airlines to #blackberry10,6
14425,negative,"you have my money, you change my flight, and d...",22


## 2. Natural Language Processing

In [98]:
# initialise the nltk package
import nltk
from nltk.corpus import stopwords
from nltk.corpus import brown
import numpy as np

In [99]:
# tokenise
import string as sr
def wordTokens(intext):
    """Return ``intext`` as a space-joined string of tokens without stop words.

    Steps: split ``str(intext)`` on whitespace, drop every word found in
    NLTK's English stop-word list, strip punctuation with ``removeSigns`` and
    tokenise the remainder with ``nltk.word_tokenize``.

    Stop words are filtered *before* punctuation is removed, so a word only
    counts as a stop word when its exact spelling, including any attached
    punctuation, is in the list.
    """
    stop_words = stopwords.words('english')
    kept = []
    for word in str(intext).split():
        if word not in stop_words:
            kept.append(word)
    cleaned = removeSigns(' '.join(kept))
    return ' '.join(nltk.word_tokenize(cleaned))

# store the stop-word-free token string for every tweet
data['tokens'] = data['text'].apply(wordTokens)

In [100]:
# show the punctuation characters that removeSigns builds its pattern from
sr.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [101]:
# tokenise with tags
import string as sr
def wordTokens(intext):
    """Tokenise ``intext`` and return the result of ``nltk.pos_tag``.

    NOTE: this redefines the ``wordTokens`` declared earlier in the notebook;
    once this cell has run, the name refers to this tagging version.

    Steps: split ``str(intext)`` on whitespace, drop words found in NLTK's
    English stop-word list, strip punctuation with ``removeSigns``, tokenise
    with ``nltk.word_tokenize`` and pass the tokens to ``nltk.pos_tag``.
    """
    stop_words = stopwords.words('english')
    content = ' '.join(
        word for word in str(intext).split() if word not in stop_words
    )
    tokens = nltk.word_tokenize(removeSigns(content))
    return nltk.pos_tag(tokens)

# save the result in 'word/tag' form
def wordTag(intext):
    """Serialise the output of ``wordTokens(intext)`` as 'word/tag' pairs.

    Each item returned by ``wordTokens`` is joined with '/', and the resulting
    pairs are joined with single spaces into one string.
    """
    return ' '.join('/'.join(pair) for pair in wordTokens(intext))

# store the 'word/tag' string for every tweet
data['word_tag'] = data['text'].apply(wordTag)

In [102]:
# get the target flag in stored data
def getFlag(intext, flag_list):
    """Return the words of ``intext`` whose tag is in ``flag_list``.

    ``intext`` is a whitespace-separated string of 'word/tag' pairs. For each
    pair the second '/'-separated field is tested against ``flag_list`` and,
    on a match, the first field is kept. The kept words are returned joined by
    single spaces (an empty string when nothing matches).

    A pair without any '/' raises ``IndexError``.
    """
    fields = (pair.split('/') for pair in intext.split())
    return ' '.join(parts[0] for parts in fields if parts[1] in flag_list)

# get the dataframe of the required data
def flagDataFrame(series, flag, new_name):
    """Build a one-column DataFrame of the words carrying the given tags.

    ``getFlag`` is applied to every element of ``series`` with
    ``flag_list=flag``; the resulting Series is renamed to ``new_name`` and
    wrapped in a DataFrame.
    """
    extracted = series.apply(getFlag, flag_list=flag)
    renamed = extracted.rename(new_name)
    return pd.DataFrame(renamed)

# get the word count for specific tag
def flagCount(intext, flag_list):
    """Count the 'word/tag' pairs in ``intext`` whose tag is in ``flag_list``.

    ``intext`` is split on whitespace; for each pair the second
    '/'-separated field is tested against ``flag_list``. Returns the number of
    matches (0 for an empty string).

    A pair without any '/' raises ``IndexError``.
    """
    return sum(
        1 for pair in intext.split() if pair.split('/')[1] in flag_list
    )

In [103]:
# example: words tagged 'NN' for the first few tweets
nouns_preview = flagDataFrame(data['word_tag'], ['NN'], 'nouns')
nouns_preview.head()

Unnamed: 0,nouns
0,
1,experience tacky
2,today trip
3,blast entertainment recourse
4,thing


In [104]:
# number of tokens tagged 'NN' in each tweet
data['no_nouns'] = data['word_tag'].apply(flagCount, flag_list=['NN'])

In [105]:
# persist the enriched frame to the output folder as UTF-8 CSV
word_flag_csv = out_wd + 'word_flag.csv'
data.to_csv(word_flag_csv, encoding='utf-8')