In [4]:
import codecs
import os.path
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Fetch the NLTK data packages this notebook relies on. The recorded output
# shows the call only reports "already up-to-date" when a package is present.
# 'stopwords' backs the stopwords.words("english") lookup in the Dictionary class.
nltk.download('stopwords')
# 'punkt' is presumably the tokenizer data for word_tokenize (imported above but
# not called in the visible cells) -- TODO confirm it is still needed.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\himur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\himur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
class Dictionary:
    """Builds a word dictionary (vocabulary) from tweet text.

    Every tweet goes through the same pipeline: lower-case, drop URL tokens,
    keep purely alphabetic words, remove English stop words and, optionally,
    reduce each word to its Porter stem.
    """

    # Default dataset name; no method reads it, callers pass the path explicitly.
    fileName = "user_tweets.txt"
    stop_words = set(stopwords.words("english"))
    porterStem = PorterStemmer()
    # Empty class-level default only. createVector() binds a fresh dict on the
    # instance instead of filling this shared one, so separate Dictionary
    # objects never leak vocabularies into each other.
    dictMap = {}

    def getNumericWords(self, text):
        """Return all runs of word characters (letters, digits, underscore) in text."""
        # Raw string: a plain '\w' is an invalid escape sequence in a str literal.
        return re.findall(r'\w+', text)

    def getWords(self, text):
        """Return the alphabetic words of text; any non-letter acts as a separator."""
        # ref: https://stackoverflow.com/questions/7633274/extracting-words-from-a-string-removing-punctuation-and-returning-a-list-with-s
        # use c.isalnum() instead of c.isalpha() to keep digits as well
        return ''.join(c if c.isalpha() else ' ' for c in text).split()

    def removeUrl(self, text):
        """Drop every whitespace-separated token that contains "http".

        Returns the surviving tokens, each preceded by a single space (so a
        non-empty result starts with a space, exactly as before).
        """
        return ''.join(' ' + token for token in text.split() if "http" not in token)

    def _processLine(self, line, isStemming=False):
        """Turn one raw line of text into its list of cleaned (optionally stemmed) words."""
        # Lower-case first so that upper-case "HTTP" links are removed as well.
        words = self.getWords(self.removeUrl(line.lower()))
        keptWords = [word for word in words if word not in self.stop_words]
        if isStemming:
            return [self.porterStem.stem(word) for word in keptWords]
        return keptWords

    def readFile(self, fileName, isStemming=False):
        """Read a dataset holding one tweet per line.

        - Stop words are filtered out of every tweet.
        - If the dataset is small consider stemming; it reduces dictionary size.

        Args:
            fileName: path of the UTF-8 text file (undecodable bytes are ignored).
            isStemming: when True, every kept word is Porter-stemmed.

        Returns:
            A list with one entry per line; each entry is that line's word list.
        """
        with codecs.open(fileName, "r", encoding="utf-8", errors='ignore') as file:
            return [self._processLine(line, isStemming) for line in file]

    def readSentence(self, sentence, isStemming=False):
        """Clean a single sentence the same way readFile() cleans a line.

        Returns:
            A one-element list holding the sentence's word list, i.e. the same
            shape readFile() produces for a one-line file.
        """
        return [self._processLine(sentence, isStemming)]

    def createSet(self, tweetWordsArr):
        """Return the set of distinct words found across all word lists."""
        result = set()
        for words in tweetWordsArr:
            result.update(words)
        return result

    def createVector(self, tweetWordsSet):
        """Map every word to an integer index, built once per instance.

        Input should be the word set produced by createSet(). All words have to
        be lower-case, except for these two reserved keys:
        "NONE" - first item (index 0), "UNK" - last item (highest index).

        The mapping is created on the first call only; later calls return the
        stored mapping and ignore their argument.
        """
        if len(self.dictMap) == 0:
            if isinstance(tweetWordsSet, (set, frozenset)):
                # Set iteration order of strings differs between interpreter
                # runs; sorting makes the indices reproducible.
                words = sorted(tweetWordsSet)
            else:
                # Ordered inputs keep the order the caller supplied.
                words = list(tweetWordsSet)
            vocabulary = ["NONE"] + words + ["UNK"]
            # Rebind on the instance rather than mutating the class-level dict.
            self.dictMap = {word: idx for idx, word in enumerate(vocabulary)}
        return self.dictMap
             

In [12]:
# fileName = "user_tweets.txt"
# dic = Dictionary()
# resultArr = dic.readFile(fileName, True)
# resultSets = dic.createSet(resultArr)
# dicMap = dic.createVector(resultSets)

# print("Dictionary count: {}".format(len(resultSets)))
# print(dicMap)

# dic.readSentence("Please be informed that there will be no train between woodlands and amk", True)

[['pleas', 'inform', 'train', 'woodland', 'amk']]