In [1]:
import glob, re, nltk
from collections import Counter
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from nltk import word_tokenize
from math import log
from math import sqrt
from nltk.corpus import stopwords
import numpy as np
nltk.download("punkt")
nltk.download("wordnet")
nltk.download('stopwords')
# haven't done spell checking

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
word_doc = defaultdict(Counter)
doc_word = defaultdict(Counter)
#read stop words
stop_words = set()
f = open('stops.txt', 'r')
for line in f:
    stop_words.add(line.strip())
f.close()
for w in stopwords.words('english'):
    stop_words.add(w)
print(len(stop_words), stop_words)

621 {'known', 'clearly', 'like', 'specify', 'maybe', 'okay', '}', 'where', 'anywhere', 'knows', 'regards', "shan't", 'for', 'x', 'about', 'gives', 'all', 'n', '.', 'next', 'somewhere', 'little', 'noone', 'would', 'soon', 'un', 'looks', 'becoming', 'somehow', 'below', 'thorough', 'none', 'wasn', 'wherever', 'inner', 'unlikely', "we're", 'whether', 'on', 'better', 'awfully', 'use', 'nothing', 'example', "there's", 'am', 'sub', '=', 'thats', 'wish', 'ok', 'regarding', 'thoroughly', '|', 'whither', 'hasn', 'indicates', 'and', 'specified', 'nowhere', 'aside', 'associated', 'still', 'que', '~', 'without', 'com', 'now', 'or', 'appear', 'him', 'taken', 'ask', 'et', 'necessary', 'value', 'those', '$', 'r', 'hi', 'everybody', 'sensible', 'could', 'only', 'themselves', 'to', 'contain', 'placed', 'each', 'thank', 'anyway', 'beforehand', 'theirs', 'toward', 'their', 'come', 'this', 'we', 'amongst', 'furthermore', 'hers', 'see', '_', 'got', 'least', 'been', 'changes', 'otherwise', 'therein', 'same',

In [3]:
def df(term):
    return len(word_doc[term])

def idf(term):
    return log(len(doc_word) / df(term), 10)

def prefixReplace(match):
    prefix, stem = match.group(1), match.group(2)
    temp = prefix + stem
    if not stem in stop_words:
        temp += ' ' + stem
    return temp

def hyphenReplace(match):
    temp = match.group()
    li = temp.split('-')
    temp = temp.replace('-', '')
    for item in li:
        if not item in stop_words:
            temp += ' ' + item
    return temp

def writeToFile(index_table):
    # write to output
    output_file = open('single_term_idx_table.txt', 'w')
    for key in sorted(index_table.keys()):
        output_file.write(' '.join(str(x) for x in key)+' '+str(index_table[key])+'\n')
    output_file.close()

def createIndexTable(input_path, doc_type):
    vocab = set()
    # format input_path to ./input_path/
    if input_path[0] == '/':
        input_path = '.' + input_path
    if input_path[-1] != '/':
        input_path += '/'
    wnl = WordNetLemmatizer()
    stemmer = PorterStemmer()
    files = glob.glob(input_path+'*') # grab all the files under path
    print(len(files))
    get_num = re.compile(r'(\d+)')
    p_prefix = re.compile(r'\b(a|an|ante|anti|auto|circum|co|com|con|contra|contro|de|dis|en|em|ex|extra|fore|hetero|homo|homeo|hyper|il|im|in|ir|inter|intra|intro|macro|micro|mid|mis|mono|non|omni|over|post|pre|pro|re|semi|sub|super|sym|syn|trans|tri|un|under|uni)-([a-z])+\b', re.I)
    p_hyphen = re.compile(r'\b(\w+-)+\w+\b')
    
    for file in files:
        f = open(file, 'r')
        docno = (doc_type,int(get_num.search(file).group(1)))
        for line in f:
            line = line.strip()
            line = line.lower()
            # expand stem with hyphen prefix
            line = p_prefix.sub(prefixReplace, line)
            # expand hyphenated word
            line = p_hyphen.sub(hyphenReplace, line)
            line = line.replace(':', ' ')
            line = line.replace('\'s', '')
            line = line.replace(',', ' ')
            line = line.replace('.', ' ')
            li = word_tokenize(line)
            li = [wnl.lemmatize(x) for x in li]
            li = [stemmer.stem(x) for x in li]
            for word in li:
                if not word in stop_words:
                    # build single term idx
                    word_doc[word][docno] += 1
                    doc_word[docno][word] += 1
                    vocab.add(word)
    return vocab

In [4]:
vocab_train_description = createIndexTable('../data/descriptions_train', 'train')
vocab_test_description = createIndexTable('../data/descriptions_test', 'test')
vocab_list = sorted(vocab_train_description.intersection(vocab_test_description))
print('num of vocab:', len(vocab_list))
train_d = []
test_d = []
for i in range(10000):
    tup = ('train', i)
    train_d.append([doc_word[tup][w] * idf(w) for w in vocab_list])
for i in range(2000):
    tup = ('test', i)
    test_d.append([doc_word[tup][w] * idf(w) for w in vocab_list])
np.save('bagOfWord_intersection_train_description.npy', train_d)
np.save('bagOfWord_intersection_test_description.npy', test_d)

print(len(train_d), len(train_d[0]))
print(len(test_d), len(test_d[0]))

print('finish description bag of word with idf')

10000
2000
num of vocab: 2776
10000 2776
2000 2776
finish description bag of word with idf


In [5]:
train_d_no_idf = []
test_d_no_idf = []
for i in range(10000):
    tup = ('train', i)
    train_d_no_idf.append([doc_word[tup][w] for w in vocab_list])
for i in range(2000):
    tup = ('test', i)
    test_d_no_idf.append([doc_word[tup][w] for w in vocab_list])
np.save('bagOfWord_intersection_train_description_no_idf.npy', train_d_no_idf)
np.save('bagOfWord_intersection_test_description_no_idf.npy', test_d_no_idf)

print(len(train_d_no_idf), len(train_d_no_idf[0]))
print(len(test_d_no_idf), len(test_d_no_idf[0]))

print('finish description bag of word without idf')

10000 2776
2000 2776
finish description bag of word without idf


In [6]:
word_doc.clear()
doc_word.clear()

In [7]:
vocab_train_tag = createIndexTable('../data/tags_train', 'train')
vocab_test_tag = createIndexTable('../data/tags_test', 'test')
vocab_list = sorted(vocab_train_tag.intersection(vocab_test_tag))
print('num of vocab:', len(vocab_list))
train_t = []
test_t = []
for i in range(10000):
    tup = ('train', i)
    train_t.append([doc_word[tup][w] * idf(w) for w in vocab_list])
for i in range(2000):
    tup = ('test', i)
    test_t.append([doc_word[tup][w] * idf(w) for w in vocab_list])

train_t = np.array(train_t)
test_t = np.array(test_t)
np.save('bagOfWord_intersection_train_tag.npy', train_t)
np.save('bagOfWord_intersection_test_tag.npy', test_t)

print(len(train_t), len(train_t[0]))
print(len(test_t), len(test_t[0]))

print('finish tag bag of word with idf')

10000
2000
num of vocab: 102
10000 102
2000 102
finish tag bag of word with idf


In [8]:
train_t_no_idf = []
test_t_no_idf = []
for i in range(10000):
    tup = ('train', i)
    train_t_no_idf.append([doc_word[tup][w] for w in vocab_list])
for i in range(2000):
    tup = ('test', i)
    test_t_no_idf.append([doc_word[tup][w] for w in vocab_list])

train_t_no_idf = np.array(train_t_no_idf)
test_t_no_idf = np.array(test_t_no_idf)
np.save('bagOfWord_intersection_train_tag_no_idf.npy', train_t_no_idf)
np.save('bagOfWord_intersection_test_tag_no_idf.npy', test_t_no_idf)

print(len(train_t_no_idf), len(train_t_no_idf[0]))
print(len(test_t_no_idf), len(test_t_no_idf[0]))

print('finish tag bag of word without idf')

10000 102
2000 102
finish tag bag of word without idf


In [9]:
word_doc.clear()
doc_word.clear()

In [10]:
def empty_vec(data):
    for i in range(len(data)):
        if len(set(data[i])) == 1:
            print(i, end=' ')

print('\ntrain description empty vec')
empty_vec(train_d)
print('\ntest description empty vec')
empty_vec(test_d)
print('\ntrain tag vec')
empty_vec(train_t)
print('\ntest tag vec')
empty_vec(test_t)
print()


train description empty vec

test description empty vec

train tag vec
12 46 656 677 1312 1337 1364 1374 1410 1547 1615 1807 1831 2191 2253 2286 2375 2407 2772 2865 2911 3068 3327 3345 3388 3434 3437 3491 3554 3602 3668 3778 3861 3913 4089 4152 4166 4386 4417 4484 4633 4726 5054 5165 5191 5363 5513 5722 5805 5824 6094 6522 6678 6715 6776 6832 6966 7192 7318 7410 7849 7888 7973 8031 8080 8122 8329 8371 8447 8606 8649 8713 8732 8835 9103 9142 9305 9491 9566 9620 9713 9809 9911 9933 
test tag vec
452 607 763 784 860 1027 1030 1033 1059 1208 1340 1421 1437 1475 1574 1648 


In [11]:
word_doc.clear()
doc_word.clear()
createIndexTable('../data/descriptions_train', 'train')
createIndexTable('../data/descriptions_test', 'test')
word_freq = Counter()
for w in word_doc.keys():
    word_freq[w] += sum(word_doc[w].values())
print(len(word_freq))
print(word_freq.most_common()[-5000:])

10000
2000
7077
[('gun', 7), ('shaggi', 7), ('jam', 7), ('mango', 7), ('stylish', 7), ('bone', 7), ('religi', 7), ('vine', 7), ('backseat', 7), ('showcas', 7), ('join', 7), ('mural', 7), ('raincoat', 7), ('assembl', 7), ('forehead', 7), ('bidet', 7), ('wok', 7), ('nearli', 7), ('inflat', 7), ('goali', 7), ('marsh', 7), ('portabl', 7), ('backward', 7), ('sailor', 7), ('current', 7), ('lite', 7), ('broadway', 7), ('secur', 7), ('safe', 7), ('refridger', 7), ('pilot', 7), ('tortilla', 7), ('overturn', 7), ('vertic', 7), ('stew', 7), ('circu', 7), ('morn', 7), ('fuzzi', 7), ('beani', 7), ('rundown', 7), ('fourth', 7), ('exot', 7), ('rooftop', 7), ('favorit', 7), ('wakeboard', 7), ('cloudless', 7), ('quaint', 7), ('forc', 7), ('weigh', 7), ('syrup', 7), ('runner', 7), ('congreg', 7), ('conveyor', 7), ('motel', 7), ('hairdryer', 7), ('wrestler', 7), ('hiker', 7), ('real', 7), ('focus', 7), ('parakeet', 7), ('dial', 7), ('shutter', 7), ('puffi', 7), ('pedal', 7), ('bad', 7), ('offici', 7), ('

In [12]:
freq_list = list(word_freq.values())
for i in range(1, 11):
    print('word appear '+str(i)+' times:', freq_list.count(i))

word appear 1 times: 2713
word appear 2 times: 909
word appear 3 times: 457
word appear 4 times: 396
word appear 5 times: 233
word appear 6 times: 187
word appear 7 times: 126
word appear 8 times: 126
word appear 9 times: 102
word appear 10 times: 78
