In [40]:
#Importing required libraries
import os
import random
import numpy as np
import pickle as pkl
import scipy.sparse as sp
from math import log
from sklearn import svm
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine

import sys
sys.path.append('./')


In [41]:
#Initializing variables
dataset = 'data'

word_embeds_dim = 300
word2vec_map = {}

#shulffing
doc_names = []
doc_train = []
doc_test = []

In [42]:
#Splitting data into train and test data files
with open('./cleaned_data2/' + dataset + '/' + dataset + '.txt', 'r', encoding="utf8") as fp:
    lines = fp.readlines()
    for l in lines:
        doc_names.append(l.strip())
        col = l.split(",")
        if col[3].find('test') != -1:
            doc_test.append(l.strip())
        elif col[3].find('train') != -1:
            doc_train.append(l.strip())


In [43]:
doc_names[0]

'\ufeff0,Safe and effective ,1,train'

In [44]:
doc_train[0]

'\ufeff0,Safe and effective ,1,train'

In [45]:
doc_cont_list = []
with open('./cleaned_data2/' + dataset + '/' + dataset + '_clean.txt', 'r') as fp:
    lines = fp.readlines()
    for l in lines:
        doc_cont_list.append(l.strip())

In [46]:
#shuffling train data
train_ids = []
for train_name in doc_train:
    id = doc_names.index(train_name)
    train_ids.append(id)
random.shuffle(train_ids)

In [47]:
#getting all train ids in separate file
train_ids_st = '\n'.join(str(i) for i in train_ids)
with open('./cleaned_data2/' + dataset + '/graph/' + dataset + '.train.index', 'w') as fp:
    fp.write(train_ids_st)


In [48]:
#shuffling test data
test_ids = []
for test_name in doc_test:
    id = doc_names.index(test_name)
    test_ids.append(id)

random.shuffle(test_ids)

#getting all train ids in separate file
test_ids_str = '\n'.join(str(i) for i in test_ids)
with open('./cleaned_data2/' + dataset + '/graph/' + dataset + '.test.index', 'w') as fp:
    fp.write(test_ids_str)


In [49]:
#merging train and test ids
total_ids = train_ids + test_ids
print("dataset size {}".format(len(total_ids)))

print("adding up...")
shuffled_doc_names = []
shuffled_doc_words = []
for id_ in total_ids:
    shuffled_doc_names.append(doc_names[int(id_)])
    shuffled_doc_words.append(doc_cont_list[int(id_)])
shuffled_doc_names_str = '\n'.join(shuffled_doc_names)
shuffled_doc_words_str = '\n'.join(shuffled_doc_words)


dataset size 23898
adding up...


In [50]:
shuffled_doc_words[0]

'grand central oyster bar opening 100 capacity monday chef sandy ingber joins daily briefing promote vaccination'

In [51]:
with open('./cleaned_data2/' + dataset + '/' + dataset + '_shuffle.txt', 'w', encoding="utf8") as fp:
    fp.write(shuffled_doc_names_str)

with open('./cleaned_data2/' + dataset + '/corpus/' + dataset + '_shuffle.txt', 'w', encoding="utf8") as fp:
    fp.write(shuffled_doc_words_str)


In [52]:
# building  vocab

print("building vocab...")
word_freq = {}
wordset = set()
for doc_words in shuffled_doc_words:
    words = doc_words.split()
    for w in words:
        wordset.add(w)
        if w in word_freq:
            word_freq[w] += 1
        else:
            word_freq[w] = 1

vocab = list(wordset)
vocab_size = len(vocab)

building vocab...


In [53]:
#document having all words
word_doc_ls = {}

for i in range(len(shuffled_doc_words)):
    doc_words = shuffled_doc_words[i]
    words = doc_words.split()
    appearance = set()
    for w in words:
        if w in appearance:
            continue
        if w in word_doc_ls:
            doc_list = word_doc_ls[w]
            doc_list.append(i)
            word_doc_ls[w] = doc_list
        else:
            word_doc_ls[w] = [i]
        appearance.add(w)

In [54]:
print(appearance)

{'cross', 'mean', 'tough', 'vaccine', 'canada', 'likely', 'next', 'play', 'border', 'impossible', 'without', 'gonna', 'night'}


In [55]:
#vocab with frequency
worddoc_frequency = {}
for w, doc_list in word_doc_ls.items():
    worddoc_frequency[w] = len(doc_list)

word_ids = {}
for i in range(vocab_size):
    word_ids[vocab[i]] = i

vocab_str = '\n'.join(vocab)

with open('./cleaned_data2/' + dataset + '/corpus/' + dataset + '_vocab.txt', 'w') as fp:
    fp.write(vocab_str)


In [56]:
# labels list
labels_set = set()
for doc_met in shuffled_doc_names:
    col = doc_met.split(',')
    labels_set.add(col[2])
labels_list = list(labels_set)

labels_list_str = '\n'.join(labels_list)
with open('./cleaned_data2/' + dataset + '/corpus/' + dataset + '_labels.txt', 'w') as fp:
    fp.write(labels_list_str)


In [57]:
labels_list_str

'0\n1\n2'

In [58]:
#train
train_size = len(train_ids)
valid_size = int(0.1 * train_size)
actual_train_size = train_size - valid_size 


actual_train_doc_names = shuffled_doc_names[:actual_train_size]
actual_train_doc_names_str = '\n'.join(actual_train_doc_names)

with open('./cleaned_data2/' + dataset + '/graph/' + dataset + '.real_train.name', 'w',encoding='utf-8') as fp:
    fp.write(actual_train_doc_names_str)


In [59]:
#separating features data
rows_x = []
cols_x = []
data_x = []
for i in range(actual_train_size):
    doc_vector = np.array([0.0 for k in range(word_embeds_dim)])
    doc_words = shuffled_doc_words[i]
    words = doc_words.split()
    doc_length = len(words)
    for w in words:
        if w in word2vec_map:
            word_vector = word2vec_map[w]
            doc_vector = doc_vector + np.array(word_vector)

    for j in range(word_embeds_dim):
        rows_x.append(i)
        cols_x.append(j)
        data_x.append(doc_vector[j] / doc_length)




In [60]:
#separating data into x and y for train data
X = sp.csr_matrix((data_x, (rows_x, cols_x)), shape=(actual_train_size, word_embeds_dim))
y = []
for i in range(actual_train_size):
    doc_met = shuffled_doc_names[i]
    cols = doc_met.split(',')
    label = cols[2]
    one_hot = [0 for lab in range(len(labels_list))]
    label_ind = labels_list.index(label)
    one_hot[label_ind] = 1
    y.append(one_hot)
y = np.array(y)

In [61]:
# feature vectors of test document, no initial features
test_size = len(test_ids)

rows_tx = []
cols_tx = []
data_tx = []
for i in range(test_size):
    doc_vector = np.array([0.0 for k in range(word_embeds_dim)])
    doc_words = shuffled_doc_words[i + train_size]
    words = doc_words.split()
    doc_length = len(words)
    for w in words:
        if w in word2vec_map:
            word_vector = word2vec_map[w]
            doc_vector = doc_vector + np.array(word_vector)

    for j in range(word_embeds_dim):
        rows_tx.append(i)
        cols_tx.append(j)
       
        data_tx.append(doc_vector[j] / doc_length)  




In [62]:
#separating data into x and y for test data
tx = sp.csr_matrix((data_tx, (rows_tx, cols_tx)),shape=(test_size, word_embeds_dim))
ty = []
for i in range(test_size):
    doc_met = shuffled_doc_names[i + train_size]
    cols = doc_met.split(',')
    label = cols[2]
    one_hot = [0 for lab in range(len(labels_list))]
    label_ind = labels_list.index(label)
    one_hot[label_ind] = 1
    ty.append(one_hot)
ty = np.array(ty)

In [63]:
#all x data  and all y data
word_vecs = np.random.uniform(-0.01, 0.01,(vocab_size, word_embeds_dim))

for i in range(len(vocab)):
    w = vocab[i]
    if w in word2vec_map:
        vector = word2vec_map[w]
        word_vecs[i] = vector

row_all_x = []
col_all_x = []
data_all_x = []

for i in range(train_size):
    doc_vector = np.array([0.0 for k in range(word_embeds_dim)])
    doc_words = shuffled_doc_words[i]
    words = doc_words.split()
    doc_length = len(words)
    for w in words:
        if w in word2vec_map:
            word_vec = word2vec_map[w]
            doc_vector = doc_vector + np.array(word_vec)

    for j in range(word_embeds_dim):
        row_all_x.append(int(i))
        col_all_x.append(j)
       
        data_all_x.append(doc_vector[j] / doc_length) 
for i in range(vocab_size):
    for j in range(word_embeds_dim):
        row_all_x.append(int(i + train_size))
        col_all_x.append(j)
        data_all_x.append(word_vecs.item((i, j)))


row_all_x = np.array(row_all_x)
col_all_x = np.array(col_all_x)
data_all_x = np.array(data_all_x)

all_x = sp.csr_matrix((data_all_x, (row_all_x, col_all_x)), shape=(train_size + vocab_size, word_embeds_dim))

all_y = []
for i in range(train_size):
    doc_met = shuffled_doc_names[i]
    cols = doc_met.split(',')
    label = cols[2]
    one_hot = [0 for l in range(len(labels_list))]
    label_ind = labels_list.index(label)
    one_hot[label_ind] = 1
    all_y.append(one_hot)

for i in range(vocab_size):
    one_hot = [0 for l in range(len(labels_list))]
    all_y.append(one_hot)

all_y = np.array(all_y)

print(X.shape, y.shape, tx.shape, ty.shape, all_x.shape, all_y.shape)



(17189, 300) (17189, 3) (4800, 300) (4800, 3) (25298, 300) (25298, 3)


In [64]:
# words co-occurence with the context
win_size = 20
win = []

for doc_words in shuffled_doc_words:
    words = doc_words.split()
    length = len(words)
    if length <= win_size:
        win.append(words)
    else:
        for j in range(length - win_size + 1):
            wind = words[j: j + win_size]
            win.append(wind)
        

In [65]:
win[0]

['grand',
 'central',
 'oyster',
 'bar',
 'opening',
 '100',
 'capacity',
 'monday',
 'chef',
 'sandy',
 'ingber',
 'joins',
 'daily',
 'briefing',
 'promote',
 'vaccination']

In [66]:
#word window frequency
word_win_freq = {}
for win_ in win:
    appearance = set()
    for i in range(len(win_)):
        if win_[i] in appearance:
            continue
        if win_[i] in word_win_freq:
            word_win_freq[win_[i]] += 1
        else:
            word_win_freq[win_[i]] = 1
        appearance.add(win_[i])

        

In [67]:
#pairs count
word_pairs_count = {}
for win_ in win:
    for i in range(1, len(win_)):
        for j in range(0, i):
            word_i = win_[i]
            word_i_id = word_ids[word_i]
            word_j = win_[j]
            word_j_id = word_ids[word_j]
            if word_i_id == word_j_id:
                continue
            word_pairs_str = str(word_i_id) + ',' + str(word_j_id)
            if word_pairs_str in word_pairs_count:
                word_pairs_count[word_pairs_str] += 1
            else:
                word_pairs_count[word_pairs_str] = 1
            # two orders
            word_pairs_str = str(word_j_id) + ',' + str(word_i_id)
            if word_pairs_str in word_pairs_count:
                word_pairs_count[word_pairs_str] += 1
            else:
                word_pairs_count[word_pairs_str] = 1

In [68]:
row = []
col = []
weight = []

# pmi as weights

num_window = len(win)

for key in word_pairs_count:
    temp = key.split(',')
    i = int(temp[0])
    j = int(temp[1])
    count = word_pairs_count[key]
    word_freq_i = word_win_freq[vocab[i]]
    word_freq_j = word_win_freq[vocab[j]]
    pmi = log((1.0 * count / num_window) / (1.0 * word_freq_i * word_freq_j/(num_window * num_window)))
    if pmi <= 0:
        continue
    row.append(train_size + i)
    col.append(train_size + j)
    weight.append(pmi)


# docs-word frequency
docs_word_freq = {}

for id in range(len(shuffled_doc_words)):
    doc_words = shuffled_doc_words[id]
    words = doc_words.split()
    for w in words:
        word_id = word_ids[w]
        doc_word_str = str(id) + ',' + str(word_id)
        if doc_word_str in docs_word_freq:
            docs_word_freq[doc_word_str] += 1
        else:
            docs_word_freq[doc_word_str] = 1

for i in range(len(shuffled_doc_words)):
    doc_words = shuffled_doc_words[i]
    words = doc_words.split()
    doc_word_set = set()
    for w in words:
        if w in doc_word_set:
            continue
        j = word_ids[w]
        key = str(i) + ',' + str(j)
        freq = docs_word_freq[key]
        if i < train_size:
            row.append(i)
        else:
            row.append(i + vocab_size)
        col.append(train_size + j)
        idf = log(1.0 * len(shuffled_doc_words) /
                  worddoc_frequency[vocab[j]])
        weight.append(freq * idf)
        doc_word_set.add(w)

node_size = train_size + vocab_size + test_size
adj = sp.csr_matrix(
    (weight, (row, col)), shape=(node_size, nod_size))


In [69]:
# dump objects
with open('./cleaned_data2/' + dataset + '/graph/ind.' + dataset + '.x', 'wb') as fp:
    pkl.dump(X, fp)

with open('./cleaned_data2/' + dataset + '/graph/ind.' + dataset + '.y', 'wb') as fp:
    pkl.dump(y, fp)

with open('./cleaned_data2/' + dataset + '/graph/ind.' + dataset + '.tx', 'wb') as fp:
    pkl.dump(tx, fp)

with open('./cleaned_data2/' + dataset + '/graph/ind.' + dataset + '.ty', 'wb') as fp:
    pkl.dump(ty, fp)

with open('./cleaned_data2/' + dataset + '/graph/ind.' + dataset + '.allx', 'wb') as fp:
    pkl.dump(all_x, fp)

with open('./cleaned_data2/' + dataset + '/graph/ind.' + dataset + '.ally', 'wb') as fp:
    pkl.dump(all_y, fp)

with open('./cleaned_data2/' + dataset + '/graph/ind.' + dataset + '.adj', 'wb') as fp:
    pkl.dump(adj, fp)
