### Further word Removal Method tests

In [95]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize

In [96]:
# Load the cleaned tweets. The 'text' column holds each token list serialized as a string.
# NOTE(review): the 'Unnamed: 0' column in the output looks like a saved index — confirm whether index_col=0 is wanted.
cleanData = pd.read_csv(r'cleanedData.csv')
cleanData.head(5)

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"['would', 'respond', 'go']",neutral
1,549e992a42,"['sooo', 'sad', 'miss', 'san', 'diego']",negative
2,088c60f138,"['boss', 'bully']",negative
3,9642c003ef,"['interview', 'leave', 'alone']",negative
4,358bd9e861,"['son', 'could', 'not', 'put', 'release', 'alr...",negative


In [97]:
# Pull the 'text' column out as a plain Python list (each entry is still a stringified token list)
token_list = cleanData['text'].tolist()
print(token_list[:5])

["['would', 'respond', 'go']", "['sooo', 'sad', 'miss', 'san', 'diego']", "['boss', 'bully']", "['interview', 'leave', 'alone']", "['son', 'could', 'not', 'put', 'release', 'already', 'buy']"]


In [98]:
# Flatten each stringified token list (e.g. "['a', 'b']") into plain space-separated text
_list_markup = str.maketrans("", "", "[]',")  # brackets, quotes and commas to delete
list_corpus = [tokens.translate(_list_markup) for tokens in token_list]
print(list_corpus[:5])

['would respond go', 'sooo sad miss san diego', 'boss bully', 'interview leave alone', 'son could not put release already buy']


In [99]:
from nltk.tokenize import word_tokenize

listOfTokens = []  # filled in place by tokenizer(): one token list per document


def tokenizer(x):
    """Tokenize every item of ``x`` and append the result to the global ``listOfTokens``.

    Each item is cast to ``str`` before tokenizing. Nothing is returned; the
    results accumulate in ``listOfTokens``.
    """
    for document in x:
        # word_tokenize splits into words; sent_tokenize is the sentence-level alternative
        listOfTokens.append(word_tokenize(str(document)))


tokenizer(list_corpus)  # change to data['text'] for original word count

# Show the token lists of the first five tweets
print(listOfTokens[:5])

[['would', 'respond', 'go'], ['sooo', 'sad', 'miss', 'san', 'diego'], ['boss', 'bully'], ['interview', 'leave', 'alone'], ['son', 'could', 'not', 'put', 'release', 'already', 'buy']]


In [100]:
# Build the vocabulary: flatten every token list, then keep the sorted unique tokens
all_words = []
for tokens in listOfTokens:
    all_words.extend(tokens)
vocab1 = sorted(set(all_words))
print("%s tokens total, with a vocabulary size of %s" % (len(all_words), len(vocab1)))
#sentence_lengths = [len(tokens) for tokens in listOfTokens]

198338 tokens total, with a vocabulary size of 23262


In [101]:
#Inspect vocabulary
# Take the first 1859 entries of the sorted vocabulary for inspection.
# NOTE(review): 1859 is a hard-coded cut-off; presumably it marks where the
# tokens starting with digits/underscores end in this dataset — confirm.
rubbish = vocab1[:1859]

In [102]:
print(rubbish)

['000th', '00am', '00pm', '02mxjj', '05ixbj', '06am', '07am', '07jzs', '07k6e', '07k6x', '07kbq', '07kjr', '07xfs', '07xoh', '07xoi', '08kaifj', '097dfj', '0_o', '0a7v3j', '0f', '0ghz', '0guyoj', '0k', '0l2tsj', '0mqko', '0n', '0rpm', '0ut', '0xhu5j', '0zywwj', '1000th', '1000x', '100greatestgames', '100msg', '100th', '100x', '103f', '104m6wj', '10am', '10days', '10jsepj', '10k', '10m', '10mins', '10mm', '10p', '10pm', '10th', '10ty', '10uemq', '10yr', '1155hours', '117th', '11am', '11e', '11pm', '11th', '11w', '121908inlove', '12am', '12f0y', '12hr', '12k', '12lbs', '12p', '12r3c3', '12s', '12seconds', '12st', '12th', '12yr', '137cty', '1386am', '13f5m0', '13gigs', '13nfk1', '13pdrmj', '13pqtw', '13th', '13tolife', '147heu', '14m', '14mph', '14s', '14th', '14yr', '1500ft', '1500rpm', '153bpm', '154upj', '15am', '15c', '15fo4x', '15gb', '15h30', '15mins', '15minutes', '15th', '15yyid', '1600th', '160hp', '16k', '16lbs', '16th', '16urvv', '16w6zv', '170th', '17again', '17cy61', '17jiy8'

- 1) First observation is that we could replace all _ (underscores) with a space.

- 2) Need a better method for removing numbers

- 3) Leave instances of 'ahhhh' in because they are useful for positive and negative tweets

- 4) Things we can't account for: 
    - Spelling mistakes
    - personalised expression

### Solution:
 - First: replace underscores with a space so that words joined by an underscore are separated
 - Return the token data with leading and trailing whitespace removed

In [103]:
def underscoreCleaner(list_words):
    """Return a new list in which every token has its underscores turned into spaces.

    Leading and trailing whitespace is stripped from each token afterwards.
    Tokens are not split, so the result has the same length as ``list_words``
    (a token such as ``'0_o'`` becomes the single token ``'0 o'``).
    """
    return [word.replace('_', ' ').strip() for word in list_words]

In [104]:
# Apply the underscore clean-up to the inspected vocabulary slice
cleanTokens = underscoreCleaner(rubbish)
print(cleanTokens)

['000th', '00am', '00pm', '02mxjj', '05ixbj', '06am', '07am', '07jzs', '07k6e', '07k6x', '07kbq', '07kjr', '07xfs', '07xoh', '07xoi', '08kaifj', '097dfj', '0 o', '0a7v3j', '0f', '0ghz', '0guyoj', '0k', '0l2tsj', '0mqko', '0n', '0rpm', '0ut', '0xhu5j', '0zywwj', '1000th', '1000x', '100greatestgames', '100msg', '100th', '100x', '103f', '104m6wj', '10am', '10days', '10jsepj', '10k', '10m', '10mins', '10mm', '10p', '10pm', '10th', '10ty', '10uemq', '10yr', '1155hours', '117th', '11am', '11e', '11pm', '11th', '11w', '121908inlove', '12am', '12f0y', '12hr', '12k', '12lbs', '12p', '12r3c3', '12s', '12seconds', '12st', '12th', '12yr', '137cty', '1386am', '13f5m0', '13gigs', '13nfk1', '13pdrmj', '13pqtw', '13th', '13tolife', '147heu', '14m', '14mph', '14s', '14th', '14yr', '1500ft', '1500rpm', '153bpm', '154upj', '15am', '15c', '15fo4x', '15gb', '15h30', '15mins', '15minutes', '15th', '15yyid', '1600th', '160hp', '16k', '16lbs', '16th', '16urvv', '16w6zv', '170th', '17again', '17cy61', '17jiy8'

In [105]:
print("Words in newly cleaned Tokens: ", len(cleanTokens))

Words in newly cleaned Tokens:  1859


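The token count is unchanged (1859): each underscore is replaced by a space inside its token, so a token such as '0 o' now contains two words but is not split into separate tokens.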

- Now we need to remove empty strings ('') from the data
- We can reapply the singleLetter removal method
- We can reapply the numberRemoval method
- However, there are still tokens made up of a number and random letters. A separate method is required to remove those.

In [106]:
def numberRemoval(list_object):
    """Delete every ASCII digit from each token and strip surrounding whitespace.

    Returns a new list of the same length as ``list_object``; a token made up
    only of digits becomes the empty string.
    """
    digit = re.compile('[0-9]')
    stripped = []
    for item in list_object:
        stripped.append(digit.sub('', item).strip())
    return stripped

In [107]:
# Strip digits from every token; the count is unchanged because tokens are edited, not dropped.
# Note: this rebinds cleanTokens, so re-running the cell operates on already-cleaned tokens.
cleanTokens = numberRemoval(cleanTokens)
print("Number of Tokens after all Digit removal", len(cleanTokens))

Number of Tokens after all Digit removal 1859


In [108]:
print(cleanTokens)

['th', 'am', 'pm', 'mxjj', 'ixbj', 'am', 'am', 'jzs', 'ke', 'kx', 'kbq', 'kjr', 'xfs', 'xoh', 'xoi', 'kaifj', 'dfj', 'o', 'avj', 'f', 'ghz', 'guyoj', 'k', 'ltsj', 'mqko', 'n', 'rpm', 'ut', 'xhuj', 'zywwj', 'th', 'x', 'greatestgames', 'msg', 'th', 'x', 'f', 'mwj', 'am', 'days', 'jsepj', 'k', 'm', 'mins', 'mm', 'p', 'pm', 'th', 'ty', 'uemq', 'yr', 'hours', 'th', 'am', 'e', 'pm', 'th', 'w', 'inlove', 'am', 'fy', 'hr', 'k', 'lbs', 'p', 'rc', 's', 'seconds', 'st', 'th', 'yr', 'cty', 'am', 'fm', 'gigs', 'nfk', 'pdrmj', 'pqtw', 'th', 'tolife', 'heu', 'm', 'mph', 's', 'th', 'yr', 'ft', 'rpm', 'bpm', 'upj', 'am', 'c', 'fox', 'gb', 'h', 'mins', 'minutes', 'th', 'yyid', 'th', 'hp', 'k', 'lbs', 'th', 'urvv', 'wzv', 'th', 'again', 'cy', 'jiy', 'th', 'zwj', 'x', 's', 'ml', 'aeg', 'hrs', 'kwzh', 'mos', 's', 'th', 's', 's', 'zwd', 'th', 'am', 'biad', 'c', 'direction', 'doeej', 'e', 'fav', 'gb', 'gurq', 'hm', 'hmn', 'hour', 'hr', 'k', 'kg', 'km', 'lol', 'million', 'ntp', 'ofmy', 'pm', 's', 'st', 'tb', 

### Observations

- Do we want to keep 'xx' patterns, because they are used in positive tweets?
- We can apply the single-character removal method:

In [109]:
def singleLetterRemoval(list_object, min_length=2):
    """Keep only the tokens that are at least ``min_length`` characters long.

    With the default ``min_length=2`` this drops empty strings and
    single-character tokens. Pass a larger value (e.g. ``min_length=3``) to
    also drop two-letter tokens instead of duplicating the function.

    Args:
        list_object: iterable of string tokens.
        min_length: smallest token length to keep (default 2).

    Returns:
        A new list with the surviving tokens in their original order.
    """
    return [token for token in list_object if len(token) >= min_length]

In [110]:
# Drop empty and one-character tokens left behind by the digit removal
cleanTokens = singleLetterRemoval(cleanTokens)
print("Number of tokens after Single Letter Removal: ", len(cleanTokens))

Number of tokens after Single Letter Removal:  1643


In [111]:
print(cleanTokens)

['th', 'am', 'pm', 'mxjj', 'ixbj', 'am', 'am', 'jzs', 'ke', 'kx', 'kbq', 'kjr', 'xfs', 'xoh', 'xoi', 'kaifj', 'dfj', 'avj', 'ghz', 'guyoj', 'ltsj', 'mqko', 'rpm', 'ut', 'xhuj', 'zywwj', 'th', 'greatestgames', 'msg', 'th', 'mwj', 'am', 'days', 'jsepj', 'mins', 'mm', 'pm', 'th', 'ty', 'uemq', 'yr', 'hours', 'th', 'am', 'pm', 'th', 'inlove', 'am', 'fy', 'hr', 'lbs', 'rc', 'seconds', 'st', 'th', 'yr', 'cty', 'am', 'fm', 'gigs', 'nfk', 'pdrmj', 'pqtw', 'th', 'tolife', 'heu', 'mph', 'th', 'yr', 'ft', 'rpm', 'bpm', 'upj', 'am', 'fox', 'gb', 'mins', 'minutes', 'th', 'yyid', 'th', 'hp', 'lbs', 'th', 'urvv', 'wzv', 'th', 'again', 'cy', 'jiy', 'th', 'zwj', 'ml', 'aeg', 'hrs', 'kwzh', 'mos', 'th', 'zwd', 'th', 'am', 'biad', 'direction', 'doeej', 'fav', 'gb', 'gurq', 'hm', 'hmn', 'hour', 'hr', 'kg', 'km', 'lol', 'million', 'ntp', 'ofmy', 'pm', 'st', 'tb', 'thing', 'ujzq', 'vn', 'xiz', 'xri', 'zpf', 'zu', 'zyg', 'the', 'km', 'lbs', 'th', 'am', 'dd', 'gf', 'min', 'mins', 'pm', 'somethin', 'th', 'xs',

### Observation:
- There are only two single-letter strings left in the corpus
- A possible next step to reduce the number of words is to remove two-letter strings

In [112]:
def singleLetterRemoval2(list_object):
    """Return the tokens of ``list_object`` that are longer than two characters.

    Despite the name, this removes empty, one-letter and two-letter tokens.
    The order of the remaining tokens is preserved.
    """
    return [token for token in list_object if len(token) > 2]

In [122]:
# Drop every token of length <= 2 (the printed label still says "single Letter",
# but singleLetterRemoval2 also removes two-letter tokens).
cleanTokens = singleLetterRemoval2(cleanTokens)
print("Number of tokens after single Letter Removal: ", len(cleanTokens))

Number of tokens after single Letter Removal:  360


In [114]:
print(cleanTokens)

['mxjj', 'ixbj', 'jzs', 'kbq', 'kjr', 'xfs', 'xoh', 'xoi', 'kaifj', 'dfj', 'avj', 'ghz', 'guyoj', 'ltsj', 'mqko', 'rpm', 'xhuj', 'zywwj', 'greatestgames', 'msg', 'mwj', 'days', 'jsepj', 'mins', 'uemq', 'hours', 'inlove', 'lbs', 'seconds', 'cty', 'gigs', 'nfk', 'pdrmj', 'pqtw', 'tolife', 'heu', 'mph', 'rpm', 'bpm', 'upj', 'fox', 'mins', 'minutes', 'yyid', 'lbs', 'urvv', 'wzv', 'again', 'jiy', 'zwj', 'aeg', 'hrs', 'kwzh', 'mos', 'zwd', 'biad', 'direction', 'doeej', 'fav', 'gurq', 'hmn', 'hour', 'lol', 'million', 'ntp', 'ofmy', 'thing', 'ujzq', 'xiz', 'xri', 'zpf', 'zyg', 'the', 'lbs', 'min', 'mins', 'somethin', 'yrs', 'life', 'month', 'hrs', 'hrs', 'mer', 'mbps', 'kms', 'ers', 'aam', 'agcth', 'cal', 'day', 'dayme', 'dives', 'dmtn', 'field', 'for', 'gather', 'getha', 'gether', 'ghz', 'goin', 'hear', 'hity', 'hours', 'hpbg', 'hrs', 'kids', 'maro', 'moro', 'morrow', 'moz', 'mrw', 'ndary', 'night', 'nit', 'nite', 'njv', 'nlc', 'nte', 'nyt', 'sljbw', 'stop', 'swv', 'toa', 'twitter', 'uhs', 'u

### Observation:
- Removing two-letter strings has a much more significant impact on reducing the number of tokens.
- This is reasonable because few two-letter words contribute meaning.

### Fix Word lengthening

- Next option is to Fix Word Lengthening:


- Then use a spell checker for words in order to remove random patterns of letters.

In [115]:
def reduce_lengthening(list_object):
    """Collapse runs of three or more identical characters down to two.

    For example ``'sooo'`` becomes ``'soo'``. Tokens without such a run are
    returned unchanged, so the result always has the same length as the input.

    Args:
        list_object: iterable of string tokens.

    Returns:
        A new list of tokens with elongated character runs shortened.
    """
    # Compile once up front instead of on every loop iteration.
    repeated = re.compile(r"(.)\1{2,}")
    return [repeated.sub(r"\1\1", token) for token in list_object]

In [116]:
# Shorten elongated character runs in every token (tokens are edited in place, none are dropped)
cleanTokens = reduce_lengthening(cleanTokens)
print(len(cleanTokens)) # Don't expect the number of tokens to reduce or increase

1310


In [117]:
print(cleanTokens)

['mxjj', 'ixbj', 'jzs', 'kbq', 'kjr', 'xfs', 'xoh', 'xoi', 'kaifj', 'dfj', 'avj', 'ghz', 'guyoj', 'ltsj', 'mqko', 'rpm', 'xhuj', 'zywwj', 'greatestgames', 'msg', 'mwj', 'days', 'jsepj', 'mins', 'uemq', 'hours', 'inlove', 'lbs', 'seconds', 'cty', 'gigs', 'nfk', 'pdrmj', 'pqtw', 'tolife', 'heu', 'mph', 'rpm', 'bpm', 'upj', 'fox', 'mins', 'minutes', 'yyid', 'lbs', 'urvv', 'wzv', 'again', 'jiy', 'zwj', 'aeg', 'hrs', 'kwzh', 'mos', 'zwd', 'biad', 'direction', 'doeej', 'fav', 'gurq', 'hmn', 'hour', 'lol', 'million', 'ntp', 'ofmy', 'thing', 'ujzq', 'xiz', 'xri', 'zpf', 'zyg', 'the', 'lbs', 'min', 'mins', 'somethin', 'yrs', 'life', 'month', 'hrs', 'hrs', 'mer', 'mbps', 'kms', 'ers', 'aam', 'agcth', 'cal', 'day', 'dayme', 'dives', 'dmtn', 'field', 'for', 'gather', 'getha', 'gether', 'ghz', 'goin', 'hear', 'hity', 'hours', 'hpbg', 'hrs', 'kids', 'maro', 'moro', 'morrow', 'moz', 'mrw', 'ndary', 'night', 'nit', 'nite', 'njv', 'nlc', 'nte', 'nyt', 'sljbw', 'stop', 'swv', 'toa', 'twitter', 'uhs', 'u

Now remove the misspelt words (tokens that the spell checker does not recognise are dropped, not corrected):

In [118]:
def spellChecker(list_object):
    """Keep only the tokens that the PyEnchant dictionary recognises.

    Tokens are filtered, not corrected: anything the dictionary does not
    accept is dropped. Empty strings are skipped up front because
    PyEnchant's ``Dict.check`` raises ``ValueError`` for an empty word.

    Args:
        list_object: iterable of string tokens.

    Returns:
        A new list containing the accepted tokens in their original order.
    """
    import enchant  # Python spell checker from the PyEnchant library
    # NOTE(review): "en_Uk" is not a standard locale tag (British English is
    # "en_GB"); confirm which dictionary Enchant actually resolves this to.
    d = enchant.Dict("en_Uk")
    return [token for token in list_object if token and d.check(token)]

In [119]:
# Keep only the tokens accepted by the spell checker's dictionary
cleanTokens = spellChecker(cleanTokens)
print(len(cleanTokens))

363


In [120]:
print(cleanTokens)

['rpm', 'days', 'hours', 'lbs', 'seconds', 'gigs', 'mph', 'rpm', 'bpm', 'fox', 'minutes', 'lbs', 'again', 'hrs', 'mos', 'direction', 'hour', 'million', 'thing', 'the', 'lbs', 'min', 'yrs', 'life', 'month', 'hrs', 'hrs', 'cal', 'day', 'dives', 'field', 'for', 'gather', 'hear', 'hours', 'hrs', 'kids', 'morrow', 'night', 'nit', 'stop', 'twitter', 'day', 'rock', 'sec', 'tools', 'hours', 'hrs', 'lbs', 'lin', 'ware', 'secs', 'days', 'hours', 'days', 'ever', 'get', 'give', 'got', 'hours', 'jus', 'jab', 'jam', 'jaw', 'more', 'real', 'ward', 'weeks', 'who', 'wry', 'days', 'had', 'hours', 'jib', 'sec', 'hrs', 'aim', 'hrs', 'mpg', 'day', 'hrs', 'weeks', 'mph', 'followers', 'thanks', 'girls', 'diamond', 'hell', 'addict', 'ahoy', 'aid', 'aloud', 'always', 'angel', 'angel', 'another', 'apple', 'assassin', 'attack', 'august', 'aureole', 'autism', 'avenue', 'awake', 'babe', 'baby', 'bach', 'bee', 'beery', 'believer', 'beyond', 'bill', 'bishop', 'blogs', 'blue', 'bob', 'bones', 'boo', 'bop', 'brad', 'b

### Observation:
- This is seriously effective for reducing the corpus size.
- However, we lose exclamations, whereby the length of an exclamation might indicate that something is negative or positive.
- We also have the problem of losing words that mark tweets as negative/positive. These words may be places, names etc.
- Solution to this might be to POS tag all words, and then spell check all words except for proper nouns, or nouns.

In [131]:
#Spell checker test for lists of list of tokens
def spellChecker(list_object):
    """Filter each tweet's tokens down to those the PyEnchant dictionary recognises.

    NOTE: this redefines ``spellChecker``, replacing the flat-list version
    defined earlier in the notebook; after this cell runs, the function
    expects a list of token lists.

    Tokens are filtered, not corrected. Empty strings are skipped up front
    because PyEnchant's ``Dict.check`` raises ``ValueError`` for an empty word.

    Args:
        list_object: iterable of tweets, each an iterable of string tokens.

    Returns:
        A list with one (possibly empty) list of accepted tokens per tweet,
        in the original order.
    """
    import enchant  # Python spell checker from the PyEnchant library
    # NOTE(review): "en_Uk" is not a standard locale tag (British English is
    # "en_GB"); confirm which dictionary Enchant actually resolves this to.
    d = enchant.Dict("en_Uk")
    return [
        [token for token in tweet if token and d.check(token)]
        for tweet in list_object
    ]

In [132]:
test = [['id', 'have', 'responded', 'if', 'i', 'were', 'going'], ['sooo', 'sad', 'i', 'will', 'miss', 'you', 'here', 'in', 'san', 'diego'], ['my', 'boss', 'is', 'bullying', 'me'], ['what', 'interview', 'leave', 'me', 'alone'], ['sons', 'of', 'why', 'couldnt', 'they', 'put', 'them', 'on', 'the', 'releases', 'we', 'already', 'bought']]

In [133]:
tested = spellChecker(test)

In [134]:
print(tested)

[['id', 'have', 'responded', 'if', 'i', 'were', 'going'], ['sad', 'i', 'will', 'miss', 'you', 'here', 'in'], ['my', 'boss', 'is', 'bullying', 'me'], ['what', 'interview', 'leave', 'me', 'alone'], ['sons', 'of', 'why', 'they', 'put', 'them', 'on', 'the', 'releases', 'we', 'already', 'bought']]
