In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import copy
import math

from random import random
import sys
import pickle
import argparse

device = torch.device(0 if torch.cuda.is_available() else "cpu")

In [None]:
# read from train/test data files and return the tuple as (label, original_sent, candsent, trendid)
def readInData(filename):

    data = []
    trends = set([])
    
    (trendid, trendname, origsent, candsent, judge, origsenttag, candsenttag) = (None, None, None, None, None, None, None)
    
    for line in open(filename):
        line = line.strip()
        #read in training or dev data with labels
        if len(line.split('\t')) == 7:
            (trendid, trendname, origsent, candsent, judge, origsenttag, candsenttag) = line.split('\t')
        #read in test data without labels
        elif len(line.split('\t')) == 6:
            (trendid, trendname, origsent, candsent, origsenttag, candsenttag) = line.split('\t')
        else:
            continue
        
        #if origsent == candsent:
        #    continue
        
        trends.add(trendid)
        
        if judge == None:
            data.append((judge, origsent, candsent, trendid))
            continue

        # ignoring the training/test data that has middle label 
        if judge[0] == '(':  # labelled by Amazon Mechanical Turk in format like "(2,3)"
            nYes = eval(judge)[0]
            data.append((nYes/5, origsent, candsent, trendid))
        elif judge[0].isdigit():   # labelled by expert in format like "2"
            nYes = int(judge[0])
            data.append((nYes/5, origsent, candsent, trendid))   
                
    return data, trends


In [None]:
def generate_dict(embedding_path, d_model):
    d = {}
    embedding_list = []
    with open(embedding_path, 'r', encoding='utf-8') as f:
        line = f.readline()
        idx = 1
        while line:
            try:
                k = line.split()
                a = [float(w) for w in k[1:]]
                if (len(a)==d_model):
                    d[k[0].lower()] = idx
                    idx += 1
                    embedding_list.append(a)
            except:
                pass
            line = f.readline()
    tmp = []
    for i in range(d_model):
        tmp.append(0)
    embedding_list = [tmp] + embedding_list
    embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_list), padding_idx=0)

    print('Reading embedding finished.')
        
    return d, embedding

In [None]:
def padding(x, max_len=10000):
#     max_len = 0
#     for xx in x:
#         if max_len < len(xx):
#             max_len = len(xx)
    for i in range(len(x)):
        xx = x[i]
        kk = len(xx)
        x[i] = xx + ([0] * (max_len - kk)) 
    return x

In [5]:
def get_index(d, sentence):
    s=sentence.strip().split()
    for i in range(len(s)):
        s[i]=s[i].lower()
        if s[i] in d.keys():
            s[i]=d[s[i]]
        else:
            s[i]=0
    return s

In [6]:
def preprocessing(embedding_path, input_path, testing=False, d_model=200, max_len=None):
    d, embedding = generate_dict(embedding_path, d_model)
    x0 = []
    x1 = []
    y = []
    trends, _ = readInData(input_path)

    for trend in trends:
        if testing:
            x0.append(get_index(d, trend[1]))
            x1.append(get_index(d, trend[2]))
            y.append(-1)
        else:
            x0.append(get_index(d, trend[1]))
            x1.append(get_index(d, trend[2]))
            y.append(trend[0])
    
    if max_len==None:
        max_len = 0
        for xx in x0 + x1:
            if max_len < len(xx):
                max_len = len(xx)
    print("max length is: ", max_len)
    embedding=embedding.to(device)
    x0 = embedding(torch.tensor(padding(x0, max_len=max_len)).to(device))    
    x1 = embedding(torch.tensor(padding(x1, max_len=max_len)).to(device))    

    return x0.cpu(), x1.cpu(), torch.tensor(y, dtype=torch.float), embedding.cpu()

In [11]:
MODEL_SAVE_PATH = '../tmp/attention_model'

# Data & embedding configerations
d_model = 50
PRE_TRAINED_EMBEDDING_PATH = '../embedding/glove.twitter.27B.'+str(d_model)+'d.txt'
DATA_PATH = '../data/train.data'
OUTPUT_PATH = '../data/train_data'

In [12]:
x0, x1, Y, emb = preprocessing(PRE_TRAINED_EMBEDDING_PATH, DATA_PATH, testing=False, d_model=d_model, max_len=18)

Reading embedding finished.
max length is:  18


In [13]:
print(x0.size())
print(x1.size())
print(Y.size())
Y=Y.long()
print(Y)

torch.Size([972, 18, 200])
torch.Size([972, 18, 200])
torch.Size([972])
tensor([3, 2, 2, 1, 1, 4, 1, 2, 3, 3, 2, 4, 1, 1, 3, 3, 4, 2, 1, 1, 1, 3, 2, 2,
        2, 3, 4, 4, 4, 2, 3, 2, 3, 4, 2, 3, 3, 1, 1, 3, 1, 1, 1, 3, 1, 3, 2, 4,
        3, 2, 5, 2, 5, 2, 3, 3, 1, 5, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 3, 2, 1, 1,
        1, 3, 1, 2, 3, 3, 1, 3, 2, 4, 3, 1, 1, 1, 2, 1, 3, 2, 1, 4, 1, 3, 1, 2,
        2, 2, 2, 4, 3, 5, 4, 1, 2, 1, 1, 4, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 2, 1, 1, 1, 1, 1, 4, 2, 5, 1, 1, 4, 2, 1, 3, 1, 1, 1, 1, 2,
        5, 3, 2, 2, 4, 3, 5, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 4, 3, 4,
        3, 2, 1, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 3, 3, 1, 3, 3, 3, 1, 1, 2, 1, 2, 3, 1, 1, 1, 2, 1,
        2, 2, 1, 1, 2, 4, 4, 4, 1, 1, 4, 1, 4, 4, 1, 4, 3, 5, 5, 3, 1, 1, 5, 3,
        3, 2, 2, 4, 4, 1, 5, 3, 5, 1, 3, 1, 1, 4

In [14]:
f=open(OUTPUT_PATH+"_"+str(d_model)+"d_reg.pkl", "wb")
pickle.dump(x0, f)
pickle.dump(x1, f)
pickle.dump(Y, f)
f.close()