In [1]:
import bs4 as bs
import nltk
from nltk.tokenize import sent_tokenize
import re
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import glob    

In [2]:
def _english_stopwords():
    '''
    Return the set of words that doc_cleaner drops.

    The set holds NLTK's English stopwords plus every single letter a-z. It is
    built on the first call and stored on the function object, so the stopword
    corpus is not read again for every document that gets cleaned.
    '''
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english")) | frozenset("abcdefghijklmnopqrstuvwxyz")
        _english_stopwords._cache = cached
    return cached


def doc_cleaner(doc):
    '''
    Clean and preprocess a document.

    1. Replace every character that is not an ASCII letter with a space
    2. Lower-case the text and split it on whitespace
    3. Drop English stopwords and single-letter tokens
    4. Reduce each remaining word to its Porter stem

    Parameters
    ----------
    doc : str
        Raw text to clean.

    Returns
    -------
    list of str
        The stemmed words, in their original order.
    '''
    words = re.sub("[^a-zA-Z]", " ", doc).lower().split()
    # Set membership keeps the filter linear in the number of words.
    eng_stopwords = _english_stopwords()
    ps = PorterStemmer()
    return [ps.stem(word) for word in words if word not in eng_stopwords]

In [4]:
## Read the body of every paper into one long string, keeping the sentences intact
# NOTE(review): cells further down write wordembed_*.txt files into this same
# directory, so those files also match this glob when the notebook is re-run.
def extract_body(lines):
    '''
    Join the lines found from an 'Abstract' line up to a 'References' line.

    Each line is stripped of surrounding whitespace. Collection starts at a
    line equal to 'Abstract' (that line is included) and stops at a line equal
    to 'References' (that line is excluded). A line ending in '-' has its
    trailing hyphens removed and is joined directly to the next line; any
    other line, including an empty one, is followed by a single space.

    Parameters
    ----------
    lines : iterable of str
        The lines of one document, e.g. an open text file.

    Returns
    -------
    str
        The concatenated body text.
    '''
    pieces = []
    body = False
    for line in lines:
        line = line.strip()
        if line == 'Abstract':
            body = True
        if line == 'References':
            body = False
        if not body:
            continue
        # endswith() is safe on an empty line, where line[-1] would raise.
        if line.endswith('-'):
            # Only the trailing hyphen marks a word split across two lines.
            pieces.append(line.rstrip('-'))
        else:
            pieces.append(line + ' ')
    return ''.join(pieces)


txt_files = glob.glob("nips12we25/*.txt")
corpus = []
for file in sorted(txt_files):
    with open(file, 'rt', encoding="ISO-8859-1") as f:
        # The extra space keeps consecutive documents from running together.
        corpus.append(extract_body(f) + ' ')
corpus = ''.join(corpus)

In [5]:
# Split the joined text on '.' and turn every piece into a list of cleaned words.
# This rebinds `corpus` from a string to a list of lists, so the cell cannot be
# run a second time without first re-running the cell that builds the string.
corpus = [doc_cleaner(sentence) for sentence in corpus.split('.')]
    


In [7]:
from gensim.models import word2vec

In [8]:
# Word2Vec hyperparameters
num_features = 25     # dimensionality of each word vector
min_word_count = 0    # words with a total frequency below this are ignored
num_workers = 4       # number of threads to run in parallel
context = 10          # context window size


# Train the model on the cleaned sentences (this will take some time).
# NOTE(review): `size` is the keyword used by gensim releases before 4.0;
# later releases name it `vector_size` - confirm against the installed version.
print("Training word2vec model... ")
model = word2vec.Word2Vec(
    corpus,
    size=num_features,
    window=context,
    min_count=min_word_count,
    workers=num_workers,
)


# Save the model for later use; it can be reloaded with Word2Vec.load().
model_name = "25dim_0minwords_10context"
model.save(model_name)

Training word2vec model... 


In [9]:
# List every word the trained model knows and report how many there are.
vocab_tmp = list(model.wv.vocab.keys())
print('Vocab length:', len(vocab_tmp))

Vocab length: 9712


In [13]:
from gensim.models import KeyedVectors
from gensim.test.utils import get_tmpfile

# Save just the keyed vectors of the trained model to a temporary file so that
# later cells can reload them with KeyedVectors.load(fname).
word_vectors = model.wv
fname = get_tmpfile("vectors.kv")
word_vectors.save(fname)


In [10]:
# Also export the vectors in the plain-text word2vec format.
text_vectors_path = 'word_vectors_25.txt'
model.wv.save_word2vec_format(text_vectors_path, binary=False)

In [11]:
import numpy as np

In [15]:
# Write one embedding matrix per cleaned document: one row per word, in
# reading order, saved as space-delimited text.
word_vectors = KeyedVectors.load(fname, mmap='r')
txt_files_clean = glob.glob("nipstxt/nips12/clean_*.txt")
for file in sorted(txt_files_clean):
    # Gather the rows in a list and build the matrix once; stacking inside the
    # loop would copy the whole matrix again for every single word.
    doc_vectors = []
    with open(file, 'rt', encoding="ISO-8859-1") as f:
        for line in f:
            # A word that is missing from the vocabulary raises KeyError here.
            doc_vectors.extend(word_vectors[word] for word in line.split())
    # The reshape keeps the result two-dimensional (zero rows) when the
    # document contains no words, so savetxt then writes an empty file.
    doc_word_embed = np.array(doc_vectors, dtype=np.float64).reshape(-1, word_vectors.vector_size)
    # file[-8:-4] is the four characters just before the '.txt' extension.
    np.savetxt('nips12we25/'+'wordembed_'+file[-8:-4]+'.txt', doc_word_embed, delimiter=' ')

In [16]:
# Write one embedding matrix per shortened document: one row per word, in
# reading order, saved as space-delimited text.
word_vectors = KeyedVectors.load(fname, mmap='r')
txt_files_clean = glob.glob("nipstxt/nips12/short_*.txt")
for file in sorted(txt_files_clean):
    # Gather the rows in a list and build the matrix once; stacking inside the
    # loop would copy the whole matrix again for every single word.
    doc_vectors = []
    with open(file, 'rt', encoding="ISO-8859-1") as f:
        for line in f:
            # A word that is missing from the vocabulary raises KeyError here.
            doc_vectors.extend(word_vectors[word] for word in line.split())
    # The reshape keeps the result two-dimensional (zero rows) when the
    # document contains no words, so savetxt then writes an empty file.
    doc_word_embed = np.array(doc_vectors, dtype=np.float64).reshape(-1, word_vectors.vector_size)
    # file[-8:-4] is the four characters just before the '.txt' extension.
    np.savetxt('nips12we25/'+'short_wordembed_'+file[-8:-4]+'.txt', doc_word_embed, delimiter=' ')

In [3]:
# Show every text file in the nips12 folder, one path per line.
txt_files = glob.glob("nipstxt/nips12/*.txt")
for path in txt_files:
    print(path)

nipstxt/nips12/doc_wordID0370.txt
nipstxt/nips12/clean_0949.txt
nipstxt/nips12/short_12_0977.txt
nipstxt/nips12/short_12_0963.txt
nipstxt/nips12/doc_wordID0827.txt
nipstxt/nips12/doc_wordID0199.txt
nipstxt/nips12/doc_short_wID_12_0747.txt
nipstxt/nips12/0761.txt
nipstxt/nips12/0775.txt
nipstxt/nips12/clean_0356.txt
nipstxt/nips12/clean_0342.txt
nipstxt/nips12/doc_short_wID_12_0223.txt
nipstxt/nips12/doc_short_wID_12_0237.txt
nipstxt/nips12/short_12_0803.txt
nipstxt/nips12/short_12_0624.txt
nipstxt/nips12/doc_wordID0589.txt
nipstxt/nips12/short_12_0631.txt
nipstxt/nips12/0589.txt
nipstxt/nips12/short_12_0157.txt
nipstxt/nips12/short_12_0143.txt
nipstxt/nips12/doc_short_wID_12_0949.txt
nipstxt/nips12/doc_wordID0775.txt
nipstxt/nips12/doc_wordID0761.txt
nipstxt/nips12/short_12_0209.txt
nipstxt/nips12/0199.txt
nipstxt/nips12/0827.txt
nipstxt/nips12/short_12_0547.txt
nipstxt/nips12/clean_0237.txt
nipstxt/nips12/clean_0223.txt
nipstxt/nips12/clean_0747.txt
nipstxt/nips12/0370.txt
nipstxt/nip