In [34]:
import codecs
import os.path
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Fetch the NLTK data packages this notebook relies on. 'stopwords' backs the
# stopwords.words("english") call in the Dictionary class; 'punkt' is
# presumably for word_tokenize (imported above, unused in the visible cells).
# The last call's return value is what the cell displays as its output.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NielPC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NielPC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [90]:
class Dictionary:
    """Turn a text file with one tweet per line into word lists and a vocabulary set."""

    # File name of a tweet dataset. No method reads it implicitly; callers
    # pass it to readFile themselves (e.g. dic.readFile(dic.fileName)).
    fileName = "user_tweets.txt"
    # English stop words from the NLTK corpus, shared by every instance.
    stop_words = set(stopwords.words("english"))
    # Porter stemmer, shared by every instance.
    porterStem = PorterStemmer()

    def getNumericWords(self, text):
        """Return all runs of word characters in ``text``; digits are kept."""
        # Raw string: '\w' in a plain literal is an invalid escape sequence.
        return re.findall(r'\w+', text)

    def getWords(self, text):
        """Return the alphabetic words of ``text``; digits are dropped.

        Every non-alphabetic character is treated as a separator.
        """
        # ref: https://stackoverflow.com/questions/7633274/extracting-words-from-a-string-removing-punctuation-and-returning-a-list-with-s
        # (use c.isalnum() instead of c.isalpha() to keep digits)
        return ''.join((c if c.isalpha() else ' ') for c in text).split()

    def removeUrl(self, text):
        """Drop every whitespace-separated token of ``text`` that contains "http".

        The surviving tokens are each prefixed with a single space and
        concatenated, so a non-empty result starts with a space and an
        input without surviving tokens yields ''.
        """
        # A distinct loop name keeps the `text` parameter intact, and join
        # avoids rebuilding the string on every token.
        return ''.join(' ' + token for token in text.split() if "http" not in token)

    def readFile(self, fileName, isStemming=False):
        """Read a dataset holding one tweet per line and tokenize it.

        Each line is lower-cased, stripped of URL tokens, split into
        alphabetic words and filtered against ``stop_words``.

        Args:
            fileName: path of the UTF-8 encoded text file to read.
            isStemming: when True, every kept word is Porter-stemmed;
                consider this for small datasets to reduce dictionary size.

        Returns:
            A list with one entry per line of the file; each entry is the
            list of words kept for that line.
        """
        resultArr = []
        with codecs.open(fileName, "r", "utf-8") as handle:
            for line in handle.readlines():
                words = self.getWords(self.removeUrl(line.lower()))
                # Class attributes are not visible as bare names inside a
                # method; they must be reached through self.
                kept = [word for word in words if word not in self.stop_words]
                if isStemming:
                    kept = [self.porterStem.stem(word) for word in kept]
                resultArr.append(kept)
        return resultArr

    def createSet(self, tweetWordsArr):
        """Return the set of distinct words found across all word lists.

        Args:
            tweetWordsArr: iterable of word collections, such as the value
                returned by readFile.
        """
        result = set()
        for words in tweetWordsArr:
            result.update(words)
        return result
    
    

In [91]:
dic = Dictionary()
# The dataset name lives on the class (Dictionary.fileName); there is no
# module-level `fileName`, so a bare reference fails on a fresh kernel.
resultArr = dic.readFile(dic.fileName, True)  # True -> stem every word
resultSets = dic.createSet(resultArr)

print(len(resultSets))
print(resultSets)

1623
{'investig', 'energi', 'mean', 'onsen', 'aircond', 'malaysia', 'survey', 'afraid', 'perfectli', 'ceas', 'blink', 'captain', 'sail', 'bigger', 'state', 'slow', 'prob', 'complaint', 'occur', 'yt', 'routin', 'mouth', 'ceo', 'dosent', 'harbourfront', 'form', 'paper', 'hell', 'pic', 'piec', 'wth', 'pretti', 'class', 'defect', 'unresolv', 'nvr', 'cross', 'hr', 'desmond', 'tanah', 'tgif', 'fire', 'sad', 'lack', 'envi', 'life', 'frequent', 'monthli', 'got', 'alr', 'slightli', 'abit', 'fi', 'solv', 'friday', 'buona', 'qn', 'reason', 'design', 'bu', 'solat', 'ehhh', 'best', 'peopl', 'gjcsingapor', 'inform', 'noth', 'justifi', 'knowledg', 'master', 'hahaha', 'altern', 'leav', 'sigh', 'mass', 'volum', 'asid', 'trust', 'either', 'marymount', 'unlik', 'lol', 'commenc', 'area', 'boulevard', 'hour', 'isnt', 'shud', 'ga', 'miss', 'declar', 'mani', 'light', 'combo', 'damn', 'rest', 'regard', 'must', 'siol', 'august', 'night', 'win', 'kio', 'improvemen', 'social', 'premium', 'explain', 'note', 'deli