### Methodology for testing further removal methods

In [608]:
from nlpDictionaries import PosMapper, PosList, Contraction_Dictionary2, stop_words
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re, nltk
from nltk.corpus import wordnet
import numpy as np
import pandas as pd

### Dictionaries for cleaning methods

The first dictionary includes contractions and their associated expansion.

In [609]:
Contraction_Dictionary1 = {
    "ain/t": "is not", "aren/t": "are not", "can/t": "can not", "can/t/ve": "can not have", "cause": "because", "could/ve": "could have",
    "couldn/t": "could not", "couldn/t/ve": "could not have", "didn/t": "did not", "doesn/t": "does not", "don/t": "do not", "hadn/t": "had not",
    "hadn/t/ve": "had not have", "hasn/t": "has not", "haven/t": "have not", "he/d": "he would", "he/d/ve": "he would have", "he/ll": "he will",
    "he/ll/ve": "he he will have", "he/s": "he is", "how/d": "how did", "how/d/y": "how do you", "how/ll": "how will", "how/s": "how is",
    "I/d": "I would", "I/d/ve": "I would have", "I/ll": "I will", "I/ll/ve": "I will have", "I/m": "I am", "I/ve": "I have", "i/d": "i would",
    "i/d/ve": "i would have", "i/ll": "i will", "i/ll/ve": "i will have", "i/m": "i am", "i/ve": "i have", "isn/t": "is not", "it/d": "it would",
    "it/d/ve": "it would have", "it/ll": "it will", "it/ll/ve": "it will have", "it/s": "it is", "let/s": "let us", "ma/am": "madam", "mayn/t": "may not",
    "might/ve": "might have", "mightn/t": "might not", "mightn/t/ve": "might not have", "must/ve": "must have", "mustn/t": "must not", "mustn/t/ve": "must not have",
    "needn/t": "need not", "needn/t/ve": "need not have", "o/clock": "of the clock", "oughtn/t": "ought not", "oughtn/t/ve": "ought not have", "shan/t": "shall not",
    "sha/n/t": "shall not", "shan/t/ve": "shall not have", "she/d": "she would", "she/d/ve": "she would have", "she/ll": "she will", "she/ll/ve": "she will have",
    "she/s": "she is", "should/ve": "should have", "shouldn/t": "should not", "shouldn/t/ve": "should not have", "so/ve": "so have", "so/s": "so as",
    "that/d": "that would", "that/d/ve": "that would have", "that/s": "that is", "there/d": "there would", "there/d/ve": "there would have",
    "there/s": "there is", "they/d": "they would", "they/d/ve": "they would have", "they/ll": "they will", "they/ll/ve": "they will have", "they/re": "they are",
    "they/ve": "they have", "to/ve": "to have", "wasn/t": "was not", "we/d": "we would", "we/d/ve": "we would have", "we/ll": "we will", "we/ll/ve": "we will have", 
    "we/re": "we are", "we/ve": "we have", "weren/t": "were not", "what/ll": "what will", "what/ll/ve": "what will have","what/re": "what are", "what/s": "what is", 
    "what/ve": "what have", "when/s": "when is", "when/ve": "when have", "where/d": "where did", "where/s": "where is", "where/ve": "where have",
    "who/ll": "who will", "who/ll/ve": "who will have", "who/s": "who is", "who/ve": "who have", "why/s": "why is", "why/ve": "why have", "will/ve": "will have", 
    "won/t": "will not","won/t/ve": "will not have", "would/ve": "would have", "wouldn/t": "would not", "wouldn/t/ve": "would not have", "y/all": "you all",
    "y/all/d": "you all would", "y/all/d/ve": "you all would have", "y/all/re": "you all are", "y/all/ve": "you all have", "you/d": "you would",
    "you/d/ve": "you would have", "you/ll": "you will", "you/ll/ve": "you will have", "you/re": "you are", "you/ve": "you have"
}


This list contains an edited list of stopwords, with all negation words (e.g. 'no', 'never', 'not') excluded.

In [610]:
stop_words =['i','me','my','myself','we','our','ours','ourselves','you','your','yours','yourself',
            'yourselves','he','him','his','himself','she','her','hers','herself','it','its','itself',
            'they','them','their','theirs','themselves','what','which','who','whom','this','that',
            'these','those','am','is','are','was','were','be','been','being','have','has','had',
            'having','do','does','did','doing','a','an','the','and','but','if','or','because','as',
            'until','while','of','at','by','for','with','about','against','between','into','through',
            'during','before','after','above','below','to','from','up','down','in','out','on','off',
            'over','under','again','further','then','once','here','there','when','where','why','how',
            'all','any','both','each','few','more','most','other','some','such',
            'only','own','same','so','than','too','very','can','will','just','should',
            'now','uses','use','using','used','one','also']

This second list contains the nltk.wordnet labelling convertion for verbs, adjectives, nouns and adverbs. The purpose of this list is to only lemmatize words that are POS (part-of-speech) tagged with these labels.

In [611]:
PosList =["JJ","JJR","JJS","NN","NNS","NNP","NNPS","RB",
          "RBR","RBS","VB","VBD","VBG","VBN","VBP","VBZ"]

This second dictionary uses the POS tag label as a key to refer to the root/lemma of a word. The purpose of this is to identify words with these POS tags and lemmatize them to their root lemma. E.g. 'running' --> 'run'

In [612]:
PosMapper = {
"JJ": wordnet.ADJ,
"JJR": wordnet.ADJ,
"JJS": wordnet.ADJ,
"NN": wordnet.NOUN,
"NNS": wordnet.NOUN,
"NNP": wordnet.NOUN,
"NNPS": wordnet.NOUN,
"RB": wordnet.ADV,
"RBR": wordnet.ADV,
"RBS": wordnet.ADV,
"VB": wordnet.VERB,
"VBD": wordnet.VERB,
"VBG": wordnet.VERB,
"VBN": wordnet.VERB,
"VBP": wordnet.VERB,
"VBZ": wordnet.VERB}

In [613]:
#Innitialize the lemmatizer
lemmatizer = WordNetLemmatizer()

### Text normalization/standardization method

Method amended to selectively remove some punctuation. Other complicated puntuation is dealt with by separately defined methods.

In [614]:
# This method normalizes the text into a coherent format for matching
def standardize_text(df, text_field):
    df[text_field] = df[text_field].str.lower() # Convert to lowercase
    df[text_field] = df[text_field].str.replace('http','') # removing urls is useful to make vocabulary small as possible
    df[text_field] = df[text_field].str.replace('com', '') # same as above.
    df[text_field] = df[text_field].str.replace(r"@\S+", " ")
    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?@\`\"\_\n]", " ")
    df[text_field] = df[text_field].str.replace(r"@", "at") #  replacing at sign for a word
    df[text_field] = df[text_field].str.replace(".", " ")
    df[text_field] = df[text_field].str.replace(",", " ")
    df[text_field] = df[text_field].str.replace("-", " ")
    df[text_field] = df[text_field].str.replace("(", " ")
    df[text_field] = df[text_field].str.replace(")", " ")
    df[text_field] = df[text_field].str.replace('"', " ")
    df[text_field] = df[text_field].str.replace("?", "")
    df[text_field] = df[text_field].str.replace("!", "")
    return df

### Contractions Expansion Prep
In the data, contraction words such as wouldn't are noted as 'wouldn`t' ` which is a different character to the normal apostrophe. Therefore each instance is changed to a '/' in order to match contractions to the contraction dictionary equivalent.

In [615]:
# This method strips the ` and changes is to / in order to match contractions.
def contractionPrep(df, text_field):
    df[text_field] = df[text_field].str.lstrip(' ')
    df[text_field] = df[text_field].str.replace("`", '/')
    return df

### Contraction Expansion Method 

In [616]:
# This method expands all contractions to their original format
def expand_contractions(text, contraction_mapping=Contraction_Dictionary2):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

### Underscore Removal Method 

In [617]:
def underscoreCleaner(list_words):
    for tweet in list_words:
        for token in tweet:
            token = token.replace('_',' ').strip() # replace underscore and strip leading and trailing spaces for each string
    return list_words

### Tokenisation Method 

In [618]:
def tokenizer(x):
    listOfTokens = []
    for text in x:
        text = str(text)
        text = word_tokenize(text)
        listOfTokens.append(text)
    return listOfTokens

### Double letter removal Method
After an innitial inspection into word frequency, single letter words were very frequent and didnt seem to contribute much semantic meaning the tweets, so were therefore removed.

- Method has been amended to remove two-lettered words because they contribute no meaning to tweets. However it means we loose cases like 'xx'

In [619]:
def doubleLetterRemoval(list_object):
    listOfTokens = []
    for tweet in list_object:
        temp = []
        for word in tweet:
            if len(word) > 2:
                temp.append(word)
        listOfTokens.append(temp)
    return listOfTokens

### Number Removal Method
Similarly to single letters, numbers dont contribute much meaning to the polarity of a tweet and so therefore removed.

- Method has been amended to remove every instance of numbers and all leading and trailing spaces

In [620]:
def numberRemoval(list_object):
    pattern = '[0-9]'
    for tweet in list_object:
        for token in tweet:
            token = re.sub(pattern, '', token).strip()
    return list_object

### Fix lengthening of words

Method that fixes instances such as 'arhhh' --> 'ah'

In [621]:
def reduce_lengthening(list_object):
    for tweet in list_object:
        for token in tweet:
            pattern = re.compile(r"(.)\1{2,}")
            token = pattern.sub(r"\1\1", token)
    return list_object

### Spell Check Method

Method only returns words listed in the English Dictionary

In [622]:
def spellChecker(list_object):
    tokens = []
    import enchant # This is a python spell checker form the PyEnchant library 
    d = enchant.Dict("en_Uk")
    for token in list_object:
        if d.check(token) == True:
            tokens.append(token)
    return tokens

### Stopword Removal Method
Stopwords are the most frequent words in the corpus and only create noise for the classifier so were therefore removed.

In [623]:
def stopwordRemoval(list_object):
    listOfTokens = []
    for tweet in list_object:
        temp = []
        for word in tweet:
            if not word in stop_words:
                temp.append(word)
        listOfTokens.append(temp)
    return listOfTokens

### Spell Check Method

In [624]:
#Spell checker test for lists of list of tokens
def spellChecker(list_object):
    tokens = []
    import enchant # This is a python spell checker form the PyEnchant library 
    d = enchant.Dict("en_Uk")
    for tweet in list_object:
        temp = []
        for token in tweet:
            if d.check(token) == True:
                temp.append(token)
        tokens.append(temp)
    return tokens

### Lemmatization Method
Calls the Pos list and dictionary to return certain words into their root lemma format.

In [625]:
def lemma(list_object):
    tags = []
    for words in list_object:
        posTupples = nltk.pos_tag(words)
        text = [lemmatizer.lemmatize(k[0], pos=PosMapper.get(k[1])) if k[1] in PosList else k[0] for k in posTupples]
        tags.append(text)
    return tags

### Read in dataset and clean it.
In order to compare the raw dataset to the cleaned version, two datasets are created. 

In [626]:
# Read in data to clean
data = pd.read_csv('train.csv')

#Read in raw dataset to test later
data2 = pd.read_csv('train.csv')

In [627]:
# Drop 'selected text' column
data = data.drop(columns='selected_text')
data2 = data2.drop(columns='selected_text')

### Standardize datasets
Each dataset is standardized (puntuation removed, converted to lower case, url-like stuff removed etc.)

In [628]:
# Standardize Text
data = standardize_text(data,'text')
data2 = standardize_text(data2,'text')
data2.head(5)

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,i`d have responded if i were going,neutral
1,549e992a42,sooo sad i will miss you here in san diego,negative
2,088c60f138,my boss is bullying me,negative
3,9642c003ef,what interview leave me alone,negative
4,358bd9e861,sons of why couldn`t they put them on t...,negative


### Expand contractions
Only the dataset that is being cleaned calls these methods.

In [629]:
# Get data ready for Contraction Expansion
data = contractionPrep(data,'text')

In [630]:
# Expand Contractions
cleanedData = [expand_contractions(str(tweet)) for tweet in data['text']]
data['text'] = cleanedData
data.head(5)

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,i/d have responded if i were going,neutral
1,549e992a42,sooo sad i will miss you here in san diego,negative
2,088c60f138,my boss is bullying me,negative
3,9642c003ef,what interview leave me alone,negative
4,358bd9e861,sons of why couldn/t they put them on th...,negative


After inspecting the results there were still some square brackets remaining as part of some words so these needed to be removed.

In [631]:
#strip remaining / from data
data['text'] = data['text'].str.replace('/', '')

In [632]:
token_list = data['text'].tolist()
print(token_list[:5])

['id have responded  if i were going', 'sooo sad i will miss you here in san diego', 'my boss is bullying me   ', 'what interview leave me alone', 'sons of       why couldnt they put them on the releases we already bought']


In [633]:
#Convert list of tokens to string
list_corpus = []
for tokens in token_list:
    tokens = tokens.replace("[","").replace("]","").replace("'","").replace(",","")
    list_corpus.append(tokens)
print(list_corpus[:5])

['id have responded  if i were going', 'sooo sad i will miss you here in san diego', 'my boss is bullying me   ', 'what interview leave me alone', 'sons of       why couldnt they put them on the releases we already bought']


In [634]:
listOfTokens = tokenizer(list_corpus) #change to data['text'] for original word count
print(listOfTokens[:5]) #Print what text data looks like in first tweet

[['id', 'have', 'responded', 'if', 'i', 'were', 'going'], ['sooo', 'sad', 'i', 'will', 'miss', 'you', 'here', 'in', 'san', 'diego'], ['my', 'boss', 'is', 'bullying', 'me'], ['what', 'interview', 'leave', 'me', 'alone'], ['sons', 'of', 'why', 'couldnt', 'they', 'put', 'them', 'on', 'the', 'releases', 'we', 'already', 'bought']]


In [635]:
#Create Vocabulary
all_words = [token for tokens in listOfTokens for token in tokens]
vocab1 = sorted(list(set(all_words)))
print("%s tokens total, with a vocabulary size of %s" % (len(all_words), len(vocab1)))
#sentence_lengths = [len(tokens) for tokens in listOfTokens]

356359 tokens total, with a vocabulary size of 26590


### Tokenize Data
This is applied to both datasets.

In [636]:
# Tokenize Data
data['text'] = listOfTokens

#Tokenize raw dataset without cleaning methods
data2['text'] = listOfTokens

In [637]:
data2.head(2)

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"[id, have, responded, if, i, were, going]",neutral
1,549e992a42,"[sooo, sad, i, will, miss, you, here, in, san,...",negative


In [638]:
data.head(2)

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"[id, have, responded, if, i, were, going]",neutral
1,549e992a42,"[sooo, sad, i, will, miss, you, here, in, san,...",negative


### Call rest of cleaning methods on the dataset that is being cleaned.

In [639]:
#Underscore Removal:
tweetData = data['text'].tolist()
usRemoved = underscoreCleaner(tweetData)
data['text'] = usRemoved

#Double letter removal:
tweetData = data['text'].tolist()
slRemoved = doubleLetterRemoval(tweetData)
data['text'] = slRemoved

#number removal
tweetData = data['text'].tolist()
nRemoved = numberRemoval(tweetData)
data['text'] = nRemoved

#stopword removal
tweetData = data['text'].tolist()
noiseRemoved = stopwordRemoval(tweetData)
data['text'] = noiseRemoved


#spell checker
tweetData = data['text'].tolist()
checked = spellChecker(tweetData)
data['text'] = checked
print(data.head(5))

       textID                                    text sentiment
0  cb774db0d1                      [responded, going]   neutral
1  549e992a42                             [sad, miss]  negative
2  088c60f138                        [boss, bullying]  negative
3  9642c003ef               [interview, leave, alone]  negative
4  358bd9e861  [sons, put, releases, already, bought]  negative


### Lemmatize the cleaned Dataset 

In [640]:
#Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

#Lemmatize Data
tweetData = data['text'].tolist()
lemmatizedData = lemma(tweetData)
data['text'] = lemmatizedData

### See what the data looks like:

In [641]:
data.head(5) # cleanDataset

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"[respond, go]",neutral
1,549e992a42,"[sad, miss]",negative
2,088c60f138,"[boss, bully]",negative
3,9642c003ef,"[interview, leave, alone]",negative
4,358bd9e861,"[son, put, release, already, buy]",negative


In [642]:
data2.head(5) # rawDataset

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"[id, have, responded, if, i, were, going]",neutral
1,549e992a42,"[sooo, sad, i, will, miss, you, here, in, san,...",negative
2,088c60f138,"[my, boss, is, bullying, me]",negative
3,9642c003ef,"[what, interview, leave, me, alone]",negative
4,358bd9e861,"[sons, of, why, couldnt, they, put, them, on, ...",negative


### Export dataframes to csv files: 

In [643]:
#Export to CSV
#data.to_csv(r'C:\Users\Alex\Desktop\fdmNLPproject\cleanedData2.csv', index=False)
#data2.to_csv(r'C:\Users\Alex\Desktop\fdmNLPproject\rawData3.csv', index=False)

In [644]:
#Inspect vocabularies of newly cleaned dataset:
listOfTokens = data['text'].tolist()
print(token_list[:5])

['id have responded  if i were going', 'sooo sad i will miss you here in san diego', 'my boss is bullying me   ', 'what interview leave me alone', 'sons of       why couldnt they put them on the releases we already bought']


In [646]:
#Create Vocabulary
all_words = [token for tokens in listOfTokens for token in tokens]
vocab1 = sorted(list(set(all_words)))
print("After checking whether words are in UK dictionary: %s tokens total, with a vocabulary size of %s" % (len(all_words), len(vocab1)))
#sentence_lengths = [len(tokens) for tokens in listOfTokens]

After checking whether words are in UK dictionary: 155737 tokens total, with a vocabulary size of 9651


Size of corpus before further word removal: 198338 tokens total, with a vocabulary size of 23262
Vocab size down by 252 token types

Original size corpus: 356359 tokens total, with a vocabulary size of 26590