# Download Dataset


In [1]:
# ALL_OUTPUT =   a list of input sentences
# ALL_INPUT =  a list of output labels

# Hyper Parameters

In [2]:
WORD_EMBEDDING = 0     # 0=word2vec, 1=fasttext, 2=glove
DIM = 25               # number of streams
D2D_THRESHOLD = 15
POOLING = "avg"        # "max","min","avg"

ALL_USED = True
USED_SIZE = 450
TRAIN_PORTION = 0.75

HIDDEN_DIM = 20
DROP_OUT = 0.5
LR = 0.001
WEIGHT_DECAY =  0
EPOCH = 2000
EARLY_STOPPING = 100

VAL_PORTION = 0.1
REMOVE_LESS_FREQUENT = 5
NUM_TEST = 5

# Libraries

In [3]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import time

import math
from math import log
import scipy.sparse as sp

import nltk
from nltk.corpus import stopwords

from gensim.models import Word2Vec
from gensim.models import FastText
# from glove import Corpus, Glove
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix


import torch
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim




In [4]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
ALL_OUTPUT =  list(df['sentiment'][-500:])
ALL_INPUT =  list(df['review'][-500:])

for i in range(len(ALL_INPUT)):
    temp = ALL_INPUT[i].split(' ')
    temp = temp[-500:]
    ALL_INPUT[i] = str(' '.join(temp))

In [5]:
count = 0
for i in ALL_OUTPUT:
    if i == 0:
        count+=1
count

0

# Preprocess

## Train Test Split

In [6]:
if ALL_USED:
    sent_used, label_used = ALL_INPUT, ALL_OUTPUT
else:
    sent_used,_, label_used, _ = train_test_split(ALL_INPUT ,ALL_OUTPUT, train_size = USED_SIZE, stratify = ALL_OUTPUT, random_state = 0 )

not_all = False
try:
    train_sent, test_sent, train_labels, test_labels = train_test_split(sent_used ,label_used, stratify = label_used, train_size = TRAIN_PORTION, random_state = 0 ) 
except:
    train_sent, test_sent, train_labels, test_labels = train_test_split(sent_used ,label_used, train_size = TRAIN_PORTION, random_state = 0 ) 

unique_train = np.unique(train_labels)
unique_test = np.unique(test_labels)
for label in unique_test:
    if label not in unique_train:
        not_all = True        
        break

if not_all:
    labels_to_add = [label for label in unique_test if label not in unique_train]
    label_add_set = set(labels_to_add)
    i = 0
    while len(label_add_set)>0:
        label = test_labels[i]
        if label in label_add_set:
            train_sent.append(test_sent[i])
            train_labels.append(test_labels[i])
            test_sent = test_sent[:i]+test_sent[i+1:]
            test_labels = test_labels[:i]+test_labels[i+1:]
            label_add_set.remove(label)
        else:
            i += 1

original_sentences = train_sent+test_sent
train_size = len(train_sent)
test_size = len(test_sent)

In [7]:
test_size

125

## Label Encoding

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

unique_labels=np.unique(train_labels + test_labels)

num_class = len(unique_labels)
lEnc = LabelEncoder()
lEnc.fit(unique_labels)

print(unique_labels)
print(lEnc.transform(unique_labels))

train_labels = lEnc.transform(train_labels)
test_labels = lEnc.transform(test_labels)
labels = train_labels.tolist()+test_labels.tolist()
labels = torch.LongTensor(labels).to(device)

['negative' 'positive']
[0 1]


## Remove Stopwords and less frequent words, tokenize sentences

In [9]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_str(string):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

original_word_freq = {}  # to remove rare words
for sentence in original_sentences:
    temp = clean_str(sentence)
    word_list = temp.split()
    for word in word_list:
        if word in original_word_freq:
            original_word_freq[word] += 1
        else:
            original_word_freq[word] = 1   

tokenize_sentences = []
word_list_dict = {}
for sentence in original_sentences:
    temp = clean_str(sentence)
    word_list_temp = temp.split()
    doc_words = []
    for word in word_list_temp:
        if word not in stop_words and original_word_freq[word] >= REMOVE_LESS_FREQUENT:
            doc_words.append(word)
            word_list_dict[word] = 1
    tokenize_sentences.append(doc_words)
word_list = list(word_list_dict.keys())
vocab_length = len(word_list)

del original_sentences

#word to id dict
word_id_map = {}
for i in range(vocab_length):
    word_id_map[word_list[i]] = i           

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## W2V

In [10]:
if WORD_EMBEDDING == 0:
    wv_cbow_model = Word2Vec(sentences=tokenize_sentences, vector_size=DIM, window=5, min_count=0, workers=4, sg=0, epochs=EPOCH)
    wv_cbow_model.init_sims(replace=True)
    word_emb_dict = {}
    for word in word_list:
        word_emb_dict[word] = wv_cbow_model.wv[word].tolist()
elif WORD_EMBEDDING == 1:
    ft_sg_model = FastText(sentences=tokenize_sentences, size=DIM, window=5, min_count=0, workers=4, sg=0, iter = 200)
    word_emb_dict = {}
    for word in word_list:
        word_emb_dict[word] = ft_sg_model[word].tolist()

  wv_cbow_model.init_sims(replace=True)


In [11]:
word_emb_dict

{'let': [-0.21384340524673462,
  0.025784242898225784,
  0.007046420127153397,
  -0.00467605609446764,
  0.07469264417886734,
  0.022782932966947556,
  0.10988757014274597,
  0.137385293841362,
  0.13459065556526184,
  0.21538712084293365,
  -0.04570802301168442,
  0.1277996301651001,
  -0.57844078540802,
  -0.07510299980640411,
  -0.19304654002189636,
  -0.18756157159805298,
  -0.20895113050937653,
  0.2935832142829895,
  0.08398018032312393,
  0.2476210594177246,
  0.23298968374729156,
  0.20984822511672974,
  -0.022613320499658585,
  -0.35121628642082214,
  -0.021739674732089043],
 'start': [0.18364958465099335,
  -0.24802599847316742,
  -0.23474670946598053,
  -0.03324851766228676,
  0.08811414241790771,
  0.19036507606506348,
  0.11004719138145447,
  -0.28720155358314514,
  -0.025848228484392166,
  -0.16305167973041534,
  0.5338538885116577,
  -0.20437388122081757,
  -0.14590030908584595,
  0.0008711718837730587,
  0.2512868642807007,
  -0.3746936619281769,
  0.08134956657886505,


In [12]:
dist = np.linalg.norm(np.array(word_emb_dict['let']) - np.array(word_emb_dict['like']))
dist

1.506630859999742

## Doc2vec

In [13]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokenize_sentences)]
documents
model = Doc2Vec(documents, vector_size=DIM, window=5, min_count=1, workers=4, epochs=EPOCH)
model.init_sims(replace=True)
doc2vec_emb = []
for i in range(len(documents)):
    doc2vec_emb.append(model.dv[i])
doc2vec_npy = np.array(doc2vec_emb)

  model.init_sims(replace=True)


In [14]:
doc2vec_npy[1]

array([-0.1563479 , -0.31875283, -0.17487234,  0.16266213, -0.19836186,
        0.3536016 ,  0.1999544 ,  0.12994367, -0.18906046, -0.23036128,
        0.266438  ,  0.20641378, -0.1669882 , -0.03160469,  0.26619923,
       -0.12125276,  0.37587994, -0.00697309, -0.06829749,  0.10742239,
        0.02820021,  0.10289482,  0.22118646,  0.06633034, -0.23783523],
      dtype=float32)

In [None]:
from transformers import BertModel, BertTokenizer
import torch
model_name = 'bert-base-uncased'

model = BertModel.from_pretrained(model_name, output_hidden_states = True)                          
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
import string
from sklearn.decomposition import PCA
table = str.maketrans('', '', string.punctuation)
sentences = []
for i in range(len(tokenize_sentences)):
  tokenize_sentences[i] = [w.translate(table) for w in tokenize_sentences[i]]
  sentences.append(' '.join(tokenize_sentences[i][-500:]))
pca = PCA(n_components=DIM)

In [None]:
embedd = []
for sent in sentences:
    model = BertModel.from_pretrained(model_name, output_hidden_states = True)
    tokenizer = BertTokenizer.from_pretrained(model_name)
    encoded_dict = tokenizer.encode_plus(sent, 
                                       add_special_tokens=True, 
                                       max_length=512, 
                                       pad_to_max_length=True, 
                                       return_attention_mask=True,
                                       return_tensors ='pt',
                                       truncation=True)
    input_ids= encoded_dict['input_ids']
    attention_masks=encoded_dict['attention_mask']
    output = model(input_ids, attention_mask=attention_masks)
    
    last_hidden = output.last_hidden_state
    cls_ = last_hidden[ : ,0, : ]
    cls_ = torch.flatten(cls_)
    embedd+= [cls_]
    
    del cls_, output,attention_masks, input_ids, last_hidden, encoded_dict, model, tokenizer
    

# Graph

In [15]:


node_size = train_size + vocab_length + test_size
adj_tensor = []

In [16]:
node_size

2681

## d2w: tfidf

In [17]:
tfidf_row = []
tfidf_col = []
tfidf_weight = []

#get each word appears in which document
word_doc_list = {}
for word in word_list:
    word_doc_list[word]=[]

for i in range(len(tokenize_sentences)):
    doc_words = tokenize_sentences[i]
    unique_words = set(doc_words)
    for word in unique_words:
        exsit_list = word_doc_list[word]
        exsit_list.append(i)
        word_doc_list[word] = exsit_list

#document frequency
word_doc_freq = {}
for word, doc_list in word_doc_list.items():
    word_doc_freq[word] = len(doc_list)

# term frequency
doc_word_freq = {}

for doc_id in range(len(tokenize_sentences)):
    words = tokenize_sentences[doc_id]
    for word in words:
        word_id = word_id_map[word]
        doc_word_str = str(doc_id) + ',' + str(word_id)
        if doc_word_str in doc_word_freq:
            doc_word_freq[doc_word_str] += 1
        else:
            doc_word_freq[doc_word_str] = 1

for i in range(len(tokenize_sentences)):
    words = tokenize_sentences[i]
    doc_word_set = set()
    for word in words:
        if word in doc_word_set:
            continue
        j = word_id_map[word]
        key = str(i) + ',' + str(j)
        freq = doc_word_freq[key]
        if i < train_size:
            row_tmp = i
        else:
            row_tmp = i + vocab_length
        col_tmp = train_size + j
        
        idf = log(1.0 * len(tokenize_sentences) / word_doc_freq[word_list[j]])
        weight_tmp = freq * idf
        doc_word_set.add(word)

        tfidf_row.append(row_tmp)
        tfidf_col.append(col_tmp)
        tfidf_weight.append(weight_tmp)

        tfidf_row.append(col_tmp)
        tfidf_col.append(row_tmp)
        tfidf_weight.append(weight_tmp)

## Diagonal

In [18]:
for i in range(node_size):
    tfidf_row.append(i)
    tfidf_col.append(i)
    tfidf_weight.append(1)

## w2w and d2d

In [19]:
def ordered_word_pair(a, b):
  if a > b:
    return (b, a)
  else:
    return (a, b)

co_dict = {}
for sent in tokenize_sentences:
    for i,word1 in enumerate(sent):
        for word2 in sent[i:]:
            co_dict[ordered_word_pair(word_id_map[word1],word_id_map[word2])] = 1

In [20]:
co_occur_threshold = D2D_THRESHOLD

doc_vec_bow = []
for sent in tokenize_sentences:
    temp = np.zeros((vocab_length))
    for word in sent:
        temp[word_id_map[word]] = 1
    doc_vec_bow.append(temp)

co_doc_dict = {}
for i in range(len(doc_vec_bow)-1):
    for j in range(i+1,len(doc_vec_bow)):
        if np.dot(doc_vec_bow[i],doc_vec_bow[j]) >= co_occur_threshold:
            co_doc_dict[(i,j)] = 1

In [21]:
def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix."""
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape).to(device)

In [22]:
a=[1,2]
a.append([3,4])
a

[1, 2, [3, 4]]

In [23]:
adj_list = []

col = tfidf_col[:]
row = tfidf_row[:]
weight = tfidf_weight[:]
for pair in co_dict:
    ind1, ind2 = pair

    word1 = word_list[ind1]
    word2 = word_list[ind2]
    
    tmp = np.linalg.norm(np.array(word_emb_dict[word1]) - np.array(word_emb_dict[word2]))

    row.append(ind2+train_size)
    col.append(ind1+train_size)
    weight.append(tmp)

    row.append(ind1+train_size)
    col.append(ind2+train_size)
    weight.append(tmp)

for pair in co_doc_dict:
    ind1, ind2 = pair
    
    tmp = np.linalg.norm(np.array(doc2vec_npy[ind1]) - np.array(doc2vec_npy[ind2]))

    if ind1>train_size:
        ind1 += vocab_length
    if ind2>train_size:    
        ind2 += vocab_length

    row.append(ind2)
    col.append(ind1)
    weight.append(tmp)

    row.append(ind1)
    col.append(ind2)
    weight.append(tmp)    

    
adj_tmp = sp.csr_matrix((weight, (row, col)), shape=(node_size, node_size))
adj_tmp = adj_tmp + adj_tmp.T.multiply(adj_tmp.T > adj_tmp) - adj_tmp.multiply(adj_tmp.T > adj_tmp)
adj_tmp = normalize_adj(adj_tmp) 
adj_tmp = sparse_mx_to_torch_sparse_tensor(adj_tmp)



In [24]:
adj = adj_tmp

# Model - MULTIGCN

## input features - glove and doc2vec

In [25]:
features = np.arange(node_size)
features = torch.FloatTensor(features).to(device)

In [26]:
print(features.shape)

torch.Size([2681])


## GCN layer

In [27]:
class GraphConvolution(Module):
    """
    Simple GCN layer, similar to https://arxiv.org/abs/1609.02907
    """

    def __init__(self, in_features, out_features,  drop_out = 0, activation=None, bias=True):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = Parameter(torch.zeros(1, out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters(in_features, out_features)
        self.dropout = torch.nn.Dropout(drop_out)
        self.activation =  activation

    def reset_parameters(self,in_features, out_features):
        stdv = np.sqrt(6.0/(in_features+out_features))
        # stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        # if self.bias is not None:
        #     torch.nn.init.zeros_(self.bias)
            # self.bias.data.uniform_(-stdv, stdv)


    def forward(self, input, adj, feature_less=False):
        if feature_less:
            support = self.weight
        else:
            input = self.dropout(input)
            support = torch.mm(input, self.weight)
        output = torch.spmm(adj, support)
        if self.bias is not None:
            output = output + self.bias
#         print('before')
#         print(output[0])
        if self.activation is not None:
            output = self.activation(output)
#         print('after')
#         print(output[0])
        return output

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + str(self.in_features) + ' -> ' \
               + str(self.out_features) + ')'

## Main Model

In [28]:
class MULTIGCN(nn.Module):
    def __init__(self, nfeat, nhid, nclass, dropout):
        super(MULTIGCN, self).__init__()

        # different weights
        self.intras1 = nn.ModuleList([GraphConvolution(nfeat, nhid, dropout, activation = nn.ReLU()) for i in range(DIM)])
        self.intras2 = nn.ModuleList([GraphConvolution(nhid*DIM, nclass, dropout, activation = nn.ReLU()) for i in range(DIM)])


    def forward(self, x, adj, feature_less=False):
        x = torch.stack([self.intras1[i](x,adj[i],feature_less) for i in range(DIM)]) 
        x = x.permute(1,0,2) 
        x = x.reshape(x.size()[0],-1) 
        x = torch.stack([self.intras2[i](x,adj[i]) for i in range(DIM)]) 

        if POOLING == 'avg':
            return torch.mean(x,0)
        if POOLING == 'max':
            return torch.max(x,0)[0]
        if POOLING == 'min':
            return torch.min(x,0)[0] 

## Training

In [29]:
real_train_size = int((1-VAL_PORTION)*train_size)
val_size = train_size-real_train_size

idx_train = range(real_train_size)
idx_val = range(real_train_size,train_size)
idx_test = range(train_size + vocab_length,node_size)

In [30]:
m = nn.Softmax(dim=0)
input = torch.randn(2, 3)
print(input)
output = m(input)
print(output)

tensor([[ 1.4331,  0.2615,  1.8684],
        [-1.6489, -1.1439, -0.5661]])
tensor([[0.9561, 0.8030, 0.9194],
        [0.0439, 0.1970, 0.0806]])


In [44]:
# Model and optimizer

def cal_accuracy(predictions,labels):
    pred = torch.argmax(predictions,-1).cpu().tolist()
    lab = labels.cpu().tolist()
    cor = 0
    for i in range(len(pred)):
        if pred[i] == lab[i]:
            cor += 1
    return cor/len(pred)


final_acc_list = []
for _ in range(1):
    model = MULTIGCN(nfeat=node_size, nhid=HIDDEN_DIM, nclass=num_class, dropout=DROP_OUT).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.005, weight_decay=WEIGHT_DECAY)
    criterion = nn.CrossEntropyLoss()
    val_loss = []
    for epoch in range(100):

        t = time.time()
        model.train()
        optimizer.zero_grad()
        output = model(features, adj)
        loss_train = criterion(output[idx_train], labels[idx_train])
        acc_train = cal_accuracy(output[idx_train], labels[idx_train])
        loss_train.backward()
        optimizer.step()


        model.eval()
        output = model(features, adj)
        loss_val = criterion(output[idx_val], labels[idx_val])
        val_loss.append(loss_val.item())
        acc_val = cal_accuracy(output[idx_val], labels[idx_val])
        print(  'Epoch: {:04d}'.format(epoch+1),
                'loss_train: {:.4f}'.format(loss_train.item()),
                'acc_train: {:.4f}'.format(acc_train),
                'loss_val: {:.4f}'.format(loss_val.item()),
                'acc_val: {:.4f}'.format(acc_val),
                'time: {:.4f}s'.format(time.time() - t))
        
        if epoch > 1000 and np.min(val_loss[-EARLY_STOPPING:]) > np.min(val_loss[:-EARLY_STOPPING]) :
            print("Early Stopping...")
            break

    model.eval()
    output = model(features, adj)

    loss_test = criterion(output[idx_test], labels[-test_size:])
    acc_test = cal_accuracy(output[idx_test], labels[-test_size:])
    print("Test set results:",
            "loss= {:.4f}".format(loss_test.item()),
            "accuracy= {:.4f}".format((acc_test+acc_train)/2))

    final_acc_list.append(acc_test)

Epoch: 0001 loss_train: 0.6931 acc_train: 0.4926 loss_val: 0.6936 acc_val: 0.4737 time: 0.2756s
Epoch: 0002 loss_train: 0.6925 acc_train: 0.5015 loss_val: 0.6942 acc_val: 0.5000 time: 0.2753s
Epoch: 0003 loss_train: 0.6919 acc_train: 0.4896 loss_val: 0.6947 acc_val: 0.5000 time: 0.2813s
Epoch: 0004 loss_train: 0.6914 acc_train: 0.4955 loss_val: 0.6953 acc_val: 0.5263 time: 0.2697s
Epoch: 0005 loss_train: 0.6906 acc_train: 0.5223 loss_val: 0.6958 acc_val: 0.5000 time: 0.2706s
Epoch: 0006 loss_train: 0.6902 acc_train: 0.5341 loss_val: 0.6963 acc_val: 0.4474 time: 0.3362s
Epoch: 0007 loss_train: 0.6894 acc_train: 0.5519 loss_val: 0.6968 acc_val: 0.4474 time: 0.3377s
Epoch: 0008 loss_train: 0.6888 acc_train: 0.5519 loss_val: 0.6972 acc_val: 0.4211 time: 0.3004s
Epoch: 0009 loss_train: 0.6883 acc_train: 0.5519 loss_val: 0.6975 acc_val: 0.5263 time: 0.2836s
Epoch: 0010 loss_train: 0.6877 acc_train: 0.5549 loss_val: 0.6978 acc_val: 0.5000 time: 0.2753s
Epoch: 0011 loss_train: 0.6864 acc_train

In [32]:
input_prob = torch.randn(4, requires_grad=True)
target_prob = torch.empty(4).random_(3)

In [33]:
x = nn.Sigmoid()
x(input_prob)

tensor([0.2916, 0.3992, 0.6578, 0.6469], grad_fn=<SigmoidBackward0>)

In [34]:
target_prob

tensor([1., 2., 2., 2.])

In [35]:
l(torch.LongTensor([1,0,1]))

NameError: name 'l' is not defined

In [None]:
torch.randn(3, 5).softmax(dim=1)


In [None]:
softmax([0,1])