In [1]:
import bs4 as bs
import nltk
from nltk.tokenize import sent_tokenize
import re
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# eng_stopwords = stopwords.words('english')


In [2]:
def doc_cleaner(doc):
    '''
    Clean and preprocess a document.
    
    1. Use regex to remove all special characters (only keep letters)
    2. Make strings to lower case and tokenize / word split reviews
    3. Remove English stopwords
    '''
    doc = re.sub("[^a-zA-Z]", " ",doc)
    doc = doc.lower().split()
    eng_stopwords = stopwords.words("english")
    for stopword in ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']:
        eng_stopwords.append(stopword)
    doc = [w for w in doc if not w in eng_stopwords]
    ps = PorterStemmer()
    ps_stems = []
    for word in doc:
        ps_stems.append(ps.stem(word))
    doc = ' '.join(doc)
    
    return(doc)

In [3]:
## Clean words in each document
import glob    
txt_files = glob.glob("nipstxt/nips12/*.txt")
for file in sorted(txt_files):
    doc =''
    with open(file, 'rt',encoding = "ISO-8859-1") as f:
        body = False
        for line in f:
            line.strip()
            if line == 'Abstract':
                body = True
            if line == 'Reference':
                body = False
            if body:
                line.strip()
                if line[-1] == '-':
                    line = line.strip('-')
                    doc += line
                else: 
                    line += ' '
                    doc += line
        f.close()
    doc = doc_cleaner(doc)
    with open('nipstxt/nips12/'+'clean_'+file[-8:-4]+'.txt', 'a') as f2:
        f2.write(doc)  
        f2.close()

In [4]:
# nltk.download()

In [9]:
# Get a vocabulary set that contains uniqe words
txt_files_clean = glob.glob("nipstxt/nips12/clean_*.txt")
vocabulary = set()
for file in sorted(txt_files_clean):
    with open(file, 'rt',encoding = "ISO-8859-1") as f:
        for line in f:
            doc_words = line.split()
            for word in doc_words:
                vocabulary.add(word)
id = 0
wordToID = {}
for key in vocabulary:
    wordToID[key] = id
    id += 1
with open('wordToID', 'a') as f2:
    for key, value in wordToID.items():
        f2.write(str(key)+' '+str(value)+'\n')
f2.close()

In [10]:
print (len(vocabulary))

13944


In [11]:
## Change document with word ID 
for file in sorted(txt_files_clean):
    with open(file, 'rt',encoding = "ISO-8859-1") as f:
        doc_wordID = ''
        for line in f:
            doc_words = line.split()
            for word in doc_words:
                doc_wordID += str(wordToID[word])
                doc_wordID += ' '
        with open('nipstxt/nips12/'+'doc_wordID'+file[-8:-4]+'.txt', 'a') as f2:
            f2.write(doc_wordID)  
            f2.close()

{'min': 0,
 'embraces': 1,
 'labeled': 2,
 'representing': 3,
 'finally': 4,
 'models': 5,
 'executed': 6,
 'optimizes': 7,
 'replay': 8,
 'rewards': 9,
 'obtain': 10,
 'trivially': 11,
 'correctly': 12,
 'parameter': 13,
 'database': 14,
 'current': 15,
 'oet': 16,
 'right': 17,
 'demonstrate': 18,
 'value': 19,
 'equal': 20,
 'solves': 21,
 'either': 22,
 'functions': 23,
 'part': 24,
 'top': 25,
 'times': 26,
 'divergence': 27,
 'history': 28,
 'realvalued': 29,
 'computed': 30,
 'knowledge': 31,
 'upper': 32,
 'normalize': 33,
 'grows': 34,
 'state': 35,
 'illustrate': 36,
 'approximated': 37,
 'coordinates': 38,
 'floor': 39,
 'legitimacy': 40,
 'abstract': 41,
 'vicinity': 42,
 'observe': 43,
 'approximately': 44,
 'finds': 45,
 'domains': 46,
 'argmaxq': 47,
 'trajectories': 48,
 'oo': 49,
 'allowed': 50,
 'recall': 51,
 'little': 52,
 'already': 53,
 'change': 54,
 'robotic': 55,
 'lllllllllnillllllllllllllllllllll': 56,
 'reach': 57,
 'employed': 58,
 'gray': 59,
 'rotational'