In [36]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pickle
import random
from scipy import sparse
import itertools
from scipy.io import savemat, loadmat
import os
# Maximum / minimum document frequency:
# no max_df / min_df thresholds are passed to CountVectorizer below, so every term it tokenizes is kept.

# Load the corpus: every line of the text file is one document
print('reading text file...')
data_file = '../ndc.txt'
with open(data_file, 'r') as f:
    docs = list(f)

# Build a binary document-term matrix: sign() clips every positive count
# to 1, so column sums give document frequencies rather than raw counts
print('counting document frequency of words...')
cvectorizer = CountVectorizer(stop_words=None)
cvz = cvectorizer.fit_transform(docs)
cvz = cvz.sign()

# Per-term document frequencies and the term <-> column-index lookups
print('building the vocabulary...')
sum_counts = cvz.sum(axis=0)
v_size = sum_counts.shape[1]
# Flatten the single-row column-sum result into a plain integer vector
sum_counts_np = np.asarray(sum_counts).reshape(v_size).astype(int)
word2id = {w: cvectorizer.vocabulary_.get(w) for w in cvectorizer.vocabulary_}
id2word = {col: w for w, col in word2id.items()}
# The vectorizer is not needed once its vocabulary has been copied out
del cvectorizer
print('  initial vocabulary size: {}'.format(v_size))

# Order the vocabulary by ascending document frequency
idx_sort = np.argsort(sum_counts_np)
vocab_aux = [id2word[term_idx] for term_idx in idx_sort]

# Stopword filtering is switched off (the line below is commented out),
# so both messages report the unfiltered vocabulary size
# vocab_aux = [w for w in vocab_aux if w not in stops]
print('  vocabulary size after removing stopwords from list: {}'.format(len(vocab_aux)))
print('  vocabulary after removing stopwords: {}'.format(len(vocab_aux)))

# Re-number the terms by their position in the sorted vocabulary
vocab = vocab_aux
del vocab_aux
word2id = {w: j for j, w in enumerate(vocab)}
id2word = dict(enumerate(vocab))

# Split in train/test/valid
# Sizes: 85% train, 10% test, and whatever remains goes to validation.
print('tokenizing documents and splitting into train/test/valid...')
num_docs = cvz.shape[0]
trSize = int(np.floor(0.85*num_docs))
tsSize = int(np.floor(0.10*num_docs))
vaSize = int(num_docs - trSize - tsSize)
del cvz
# Shuffle with a fixed seed so that re-running the notebook reproduces the
# same split. A private RandomState is used so numpy's global random state
# is left untouched for any other code that relies on it.
SPLIT_SEED = 0
idx_permute = np.random.RandomState(SPLIT_SEED).permutation(num_docs).astype(int)

# Remove words not in train_data
# Collect the vocabulary terms that occur in at least one training document,
# then filter the existing `vocab` list against that set so the surviving
# terms keep the order `vocab` already had. Turning the set itself into a list
# would give an order that depends on string hashing, which can differ between
# interpreter runs, making the word ids non-reproducible.
train_terms = set(w for idx_d in range(trSize) for w in docs[idx_permute[idx_d]].split() if w in word2id)
vocab = [w for w in vocab if w in train_terms]
del train_terms
word2id = dict([(w, j) for j, w in enumerate(vocab)])
id2word = dict([(j, w) for j, w in enumerate(vocab)])
print('  vocabulary after removing words not in train: {}'.format(len(vocab)))

# Encode each split as lists of word ids; tokens missing from word2id are dropped.
# NOTE(review): documents are tokenized here with str.split(), while the
# vocabulary was produced by CountVectorizer's own tokenizer. Any token the two
# split differently (e.g. one carrying a trailing '.0') is silently discarded by
# the `in word2id` filter -- confirm the input text is already normalized.
docs_tr = [[word2id[w] for w in docs[idx_permute[idx_d]].split() if w in word2id] for idx_d in range(trSize)]
docs_ts = [[word2id[w] for w in docs[idx_permute[idx_d+trSize]].split() if w in word2id] for idx_d in range(tsSize)]
docs_va = [[word2id[w] for w in docs[idx_permute[idx_d+trSize+tsSize]].split() if w in word2id] for idx_d in range(vaSize)]
# The raw text is no longer needed once every split has been encoded
del docs

# Sanity check: each encoded split should contain exactly the planned number of documents
for template, split_docs, expected in (
    ('  number of documents (train): {} [this should be equal to {}]', docs_tr, trSize),
    ('  number of documents (test): {} [this should be equal to {}]', docs_ts, tsSize),
    ('  number of documents (valid): {} [this should be equal to {}]', docs_va, vaSize),
):
    print(template.format(len(split_docs), expected))

# Remove empty documents
print('removing empty documents...')

def remove_empty(in_docs):
    """Return a new list holding the entries of `in_docs` that are not equal to an empty list."""
    kept = []
    for doc in in_docs:
        if doc != []:
            kept.append(doc)
    return kept

# Drop documents that have no tokens left, in every split
docs_tr, docs_ts, docs_va = [remove_empty(split) for split in (docs_tr, docs_ts, docs_va)]

# Keep only test documents that contain more than one token
docs_ts = [doc for doc in docs_ts if not len(doc) <= 1]

# Cut every test document at its midpoint: the first half gets the first
# len(doc)//2 tokens, the second half gets the rest (the extra token when the
# length is odd)
print('splitting test documents in 2 halves...')
docs_ts_h1 = [doc[:len(doc) // 2] for doc in docs_ts]
docs_ts_h2 = [doc[len(doc) // 2:] for doc in docs_ts]

# Getting lists of words and doc_indices
print('creating lists of words...')

def create_list_words(in_docs):
    """Concatenate the items of every document in `in_docs` into one flat list, in order."""
    flat = []
    for doc in in_docs:
        flat.extend(doc)
    return flat

# Flatten every split into a single token list
words_tr, words_ts, words_ts_h1, words_ts_h2, words_va = map(
    create_list_words, (docs_tr, docs_ts, docs_ts_h1, docs_ts_h2, docs_va)
)

# Report the total number of tokens per split
for label, flat_words in (
    ('  len(words_tr): ', words_tr),
    ('  len(words_ts): ', words_ts),
    ('  len(words_ts_h1): ', words_ts_h1),
    ('  len(words_ts_h2): ', words_ts_h2),
    ('  len(words_va): ', words_va),
):
    print(label, len(flat_words))

# Get doc indices
print('getting doc indices...')

def create_doc_indices(in_docs):
    """Return a flat list that repeats each document's position once per item in that document."""
    indices = []
    for doc_pos, doc in enumerate(in_docs):
        indices.extend([doc_pos] * len(doc))
    return indices

# One document index per token, for every split
doc_indices_tr, doc_indices_ts, doc_indices_ts_h1, doc_indices_ts_h2, doc_indices_va = map(
    create_doc_indices, (docs_tr, docs_ts, docs_ts_h1, docs_ts_h2, docs_va)
)

# Sanity check: the number of distinct indices should match the number of documents
for template, indices, split_docs in (
    ('  len(np.unique(doc_indices_tr)): {} [this should be {}]', doc_indices_tr, docs_tr),
    ('  len(np.unique(doc_indices_ts)): {} [this should be {}]', doc_indices_ts, docs_ts),
    ('  len(np.unique(doc_indices_ts_h1)): {} [this should be {}]', doc_indices_ts_h1, docs_ts_h1),
    ('  len(np.unique(doc_indices_ts_h2)): {} [this should be {}]', doc_indices_ts_h2, docs_ts_h2),
    ('  len(np.unique(doc_indices_va)): {} [this should be {}]', doc_indices_va, docs_va),
):
    print(template.format(len(np.unique(indices)), len(split_docs)))

# Document count of each split, taken after the filtering steps above
n_docs_tr, n_docs_ts, n_docs_ts_h1, n_docs_ts_h2, n_docs_va = (
    len(split) for split in (docs_tr, docs_ts, docs_ts_h1, docs_ts_h2, docs_va)
)

reading text file...
counting document frequency of words...
building the vocabulary...
  initial vocabulary size: 4203
  vocabulary size after removing stopwords from list: 4203
  vocabulary after removing stopwords: 4203
tokenizing documents and splitting into train/test/valid...
  vocabulary after removing words not in train: 4148
  number of documents (train): 42677 [this should be equal to 42677]
  number of documents (test): 5020 [this should be equal to 5020]
  number of documents (valid): 2512 [this should be equal to 2512]
removing empty documents...
splitting test documents in 2 halves...
creating lists of words...
  len(words_tr):  3029679
  len(words_ts):  360517
  len(words_ts_h1):  179030
  len(words_ts_h2):  181487
  len(words_va):  175087
getting doc indices...
  len(np.unique(doc_indices_tr)): 42677 [this should be 42677]
  len(np.unique(doc_indices_ts)): 5004 [this should be 5004]
  len(np.unique(doc_indices_ts_h1)): 5004 [this should be 5004]
  len(np.unique(doc_indi

In [34]:
# Load the corpus: every line of the text file is one document
print('reading text file...')
data_file = '../ndc.txt'
with open(data_file, 'r') as f:
    docs = list(f)

# Build a binary document-term matrix: sign() clips every positive count
# to 1, so column sums give document frequencies rather than raw counts
print('counting document frequency of words...')
cvectorizer = CountVectorizer(stop_words=None)
cvz = cvectorizer.fit_transform(docs)
cvz = cvz.sign()

# Per-term document frequencies and the term <-> column-index lookups
print('building the vocabulary...')
sum_counts = cvz.sum(axis=0)
v_size = sum_counts.shape[1]
# Flatten the single-row column-sum result into a plain integer vector
sum_counts_np = np.asarray(sum_counts).reshape(v_size).astype(int)
word2id = {w: cvectorizer.vocabulary_.get(w) for w in cvectorizer.vocabulary_}
id2word = {col: w for w, col in word2id.items()}
print('  initial vocabulary size: {}'.format(v_size))

reading text file...
counting document frequency of words...
building the vocabulary...
  initial vocabulary size: 4203


{'63323017302': 2860,
 '63323038810': 2896,
 '88222033': 3951,
 '456068801': 1313,
 '61787006204': 2776,
 '24159601': 603,
 '2735501': 731,
 '74407532': 3499,
 '58177000104': 2440,
 '4006850': 1060,
 '54872625': 2257,
 '9337502': 4129,
 '31867412': 838,
 '74434113': 3510,
 '45050130': 1295,
 '641040025': 2991,
 '51079001920': 1609,
 '34120081': 960,
 '781188313': 3755,
 '49343041': 1455,
 '173099156': 398,
 '63323022110': 2866,
 '338001702': 874,
 '173047001': 352,
 '61958040101': 2780,
 '87665201': 3929,
 '11980002515': 85,
 '338004902': 888,
 '2831501': 748,
 '51079059820': 1740,
 '51079069020': 1762,
 '469061711': 1352,
 '56017275': 2381,
 '338055002': 914,
 '54829725': 2187,
 '56016975': 2379,
 '56017075': 2380,
 '469065773': 1354,
 '4026001': 1076,
 '4025901': 1075,
 '17314931102': 404,
 '904107061': 3998,
 '88120806': 3944,
 '4003822': 1057,
 '74792201': 3591,
 '338268975': 943,
 '338004304': 883,
 '4029809': 1084,
 '338008504': 896,
 '74131230': 3404,
 '8084199': 3821,
 '1771400

In [24]:
# Order the vocabulary by ascending document frequency
idx_sort = np.argsort(sum_counts_np)
vocab_aux = [id2word[term_idx] for term_idx in idx_sort]

# Stopword filtering is switched off (the line below is commented out),
# so both messages report the unfiltered vocabulary size
# vocab_aux = [w for w in vocab_aux if w not in stops]
print('  vocabulary size after removing stopwords from list: {}'.format(len(vocab_aux)))
print('  vocabulary after removing stopwords: {}'.format(len(vocab_aux)))

# Re-number the terms by their position in the sorted vocabulary
vocab = vocab_aux
word2id = {w: j for j, w in enumerate(vocab)}
id2word = dict(enumerate(vocab))

  vocabulary size after removing stopwords from list: 4203
  vocabulary after removing stopwords: 4203


In [26]:
# Encode the train / test / valid slices of the shuffled corpus as word-id lists.
# Each split is described by (offset into idx_permute, number of documents);
# tokens that are not in word2id are dropped.
docs_tr, docs_ts, docs_va = (
    [[word2id[w] for w in docs[idx_permute[start + k]].split() if w in word2id] for k in range(size)]
    for start, size in ((0, trSize), (trSize, tsSize), (trSize + tsSize, vaSize))
)

{'36000000124': 0,
 '8117901': 1,
 '51079012420': 2,
 '182070389': 3,
 '182070201': 4,
 '83005932': 5,
 '83006132': 6,
 '4082009': 7,
 '17478071012': 8,
 '17478029111': 9,
 '17478021612': 10,
 '17478020510': 11,
 '61570041451': 12,
 '51079009120': 13,
 '61570030031': 14,
 '61570018601': 15,
 '8413901': 16,
 '74416901': 17,
 '51079029072': 18,
 '51079029220': 19,
 '51079032301': 20,
 '61147800903': 21,
 '74405201': 22,
 '85057102': 23,
 '60977001701': 24,
 '60574310101': 25,
 '85117902': 26,
 '17478006412': 27,
 '7440106': 28,
 '81027455': 29,
 '62141101': 30,
 '68094021762': 31,
 '63323018610': 32,
 '186430100': 33,
 '54873525': 34,
 '50242013460': 35,
 '186190601': 36,
 '50383063550': 37,
 '781328379': 38,
 '50419035703': 39,
 '3218710': 40,
 '68180020203': 41,
 '63304065325': 42,
 '81027255': 43,
 '63304055201': 44,
 '74444005': 45,
 '68258912401': 46,
 '50458057810': 47,
 '3256016': 48,
 '50458058601': 49,
 '63010001030': 50,
 '63004773101': 51,
 '327001105': 52,
 '185027101': 53,
 

In [30]:
# Display the raw text of the document at shuffled position 1.
# This cell defines neither `docs` nor `idx_permute`; both must already exist in the kernel.
docs[idx_permute[1]]

'51991045757.0 51079025520.0 406055262.0 54162055007.0 904516561.0 574705050.0 456066270.0 409490234.0 55390000401.0 51079000220.0 182864389.0 456066270.0 338004902.0 338004902.0 409672924.0 51079025520.0 51079025520.0 68462014645.0 51079025520.0 51079025520.0 641037625.0 338355248.0 338500241.0 338355248.0 517570425.0 63739002401.0 51079000220.0 51079025520.0 58177032304.0 406055262.0 51079000220.0 904516561.0 51079090620.0 904224461.0 51079025520.0 51079025520.0 51079025520.0 409176230.0 406055262.0 56017275.0 121043130.0 904777261.0 121043130.0 536338101.0 51079097220.0 10019016312.0 338100702.0 172375810.0 63323026201.0 58177029311.0 338001702.0 64764015105.0 182116189.0 51079090620.0 56017275.0 338100702.0 406051262.0 409176230.0 51079038620.0 904404073.0 310075590.0 93521193.0 456066270.0 6473900.0\n'