In [1]:
import numpy as np
from gensim.models import KeyedVectors
import re
import xml.etree.ElementTree as ET

In [2]:
# Number of token rows per embedded sentence: longer texts are truncated and
# shorter ones zero-padded to this length by Text2Vec.embed_sentence.
sentence_length = 100
# Width of each word vector; passed to Text2Vec as vector_dim.
vector_size = 300
# Directory prefixes are concatenated directly with the file names
# (path + fileName), so they must keep their trailing slash.
train_path = 'v3.2/train/'
train_fileName = 'SemEval2016-Task3-CQA-QL-train-part1-subtaskA.xml'
test_path = 'v3.2/test/'
test_fileName = 'SemEval2016-Task3-CQA-QL-test-subtaskA.xml'
# Pretrained word2vec file, loaded in binary format by Text2Vec.__init__.
word2vec_matrix = 'GoogleNews-vectors-negative300.bin'
# Output targets handed to np.savez by Text2Vec.build_matrix.
cQA_train_embedding_name = 'cQA_train_embedding'
cQA_test_embedding_name = 'cQA_test_embedding_id'

In [3]:
class rawDataExtractor:
    '''
    Parses a cQA XML file (e.g. 'SemEval2016-Task3-CQA-QL-train-part1-subtaskA.xml')
    and converts it into nested Python dictionaries.

    Attributes:
        tree: the ElementTree returned by ET.parse for the given file.
        root: root element of `tree`; each of its children is treated as one thread.
    '''

    # Integer codes for the RELC_RELEVANCE2RELQ attribute. Every label that is
    # not listed here (including a missing label) is encoded as _OTHER_LABEL_INT.
    _LABEL_TO_INT = {'Good': 2, 'Bad': 0}
    _OTHER_LABEL_INT = 1

    def __init__(self, path, fileName):
        '''
        Parse the XML file located at `path + fileName`.

        `path` is concatenated as-is, so it must already end with a path separator.
        '''
        self.tree = ET.parse(path + fileName)
        self.root = self.tree.getroot()

    @staticmethod
    def _userID_to_int(userID):
        '''Convert a user id such as 'U123' into the integer 123.'''
        return int(userID.replace('U', ''))

    def extractInformation_QA(self, test = False, testSize = 5):
        '''
        This function returns a python dictionary which has the following structure:
        infoDic = {
            'Q1_R1': {                      # value of the THREAD_SEQUENCE attribute
                'qTime': string             # RELQ_DATE attribute, stored as-is
                'qSubject': string or None
                'qUserID': string
                'qUserID_INT': int
                'qBody': string or None
                'comments': {
                    'cID': {                # value of the RELC_ID attribute
                        'cTime': string     # RELC_DATE attribute, stored as-is
                        'cUserID': string
                        'cUserID_INT': int
                        'cBody': string or None
                        'cLabel': string
                        'cLabel_INT': int   # 2 for 'Good', 0 for 'Bad', 1 otherwise
                    }
                    ...
                }
            }
            ...
        }

        Args:
            test: when True, only the first `testSize` threads are extracted
                (for quick experiments). When False the whole file is read and
                `testSize` is ignored.
            testSize: maximum number of threads returned when `test` is True.

        Returns:
            The dictionary described above, or None (after printing a message)
            if two threads share the same THREAD_SEQUENCE key.
        '''
        infoDic = {}
        for count, child in enumerate(self.root, start=1):
            # For testing ONLY: stop once testSize threads have been collected.
            if test and count > testSize:
                return infoDic

            # Get the question key
            currentQuestionKey = child.attrib.get('THREAD_SEQUENCE')
            if currentQuestionKey in infoDic:
                # The key is interpolated so the message names the offending thread.
                print('%s key has already existed. Info extraction failed...' % currentQuestionKey)
                return None

            # Get the question dictionary
            infoDic[currentQuestionKey] = self.extractSingleInformation(child)
        return infoDic

    # Use this function when loading 'SemEval2016-Task3-CQA-QL-train-part1-subtaskA.xml' ONLY!
    def extractSingleInformation(self, child):
        '''
        Convert one thread element into a dictionary.

        The first sub-element is read as the question when it carries a RELQ_ID
        attribute; every other sub-element is read as a comment and stored under
        its RELC_ID. Text fields are the `.text` of the corresponding XML node,
        which is None for empty nodes.

        Args:
            child: thread element whose sub-elements are the question followed
                by its comments.

        Returns:
            dict with the question fields and a 'comments' sub-dictionary (see
            extractInformation_QA for the full layout).
        '''
        singleInfoDic = {'comments': {}}
        for index, element in enumerate(child):
            attributes = element.attrib
            if index == 0 and attributes.get('RELQ_ID'):
                # Question
                singleInfoDic['qTime'] = attributes.get('RELQ_DATE')
                singleInfoDic['qUserID'] = attributes.get('RELQ_USERID')
                singleInfoDic['qUserID_INT'] = self._userID_to_int(singleInfoDic['qUserID'])
                # Sub-element 0 is read as the subject, sub-element 1 as the body.
                singleInfoDic['qSubject'] = element[0].text
                singleInfoDic['qBody'] = element[1].text
            else:
                # Comment
                label = attributes.get('RELC_RELEVANCE2RELQ')
                cUserID = attributes.get('RELC_USERID')
                singleInfoDic['comments'][attributes.get('RELC_ID')] = {
                    'cTime': attributes.get('RELC_DATE'),
                    'cUserID': cUserID,
                    'cUserID_INT': self._userID_to_int(cUserID),
                    'cBody': element[0].text,
                    'cLabel': label,
                    'cLabel_INT': self._LABEL_TO_INT.get(label, self._OTHER_LABEL_INT),
                }
        return singleInfoDic

    # Use this function when loading 'SemEval2016-Task3-CQA-QL-train-part1.xml' ONLY!
    def extractInformation_QQ(self):
        '''
        This function is intended to return a python dictionary which has the
        following structure:
        infoDic = {
            'Q1': {
                'qTargetSubject': string
                'qTargetBody': string
                'availableQs': {
                    'qTime': unixtime
                    'qSubject': string
                    'qUserID': string
                    'qUserID_INT': int
                    'qBody': string
                    'qLabel': string
                    'qLabel_INT': int
                }
                ...
            }
            ...
        }

        NOTE: not implemented yet - it currently always returns an empty dictionary.
        '''
        return {}

In [11]:
class Text2Vec:
    '''
    Embeds question/comment text as fixed-size matrices of word vectors and
    writes the resulting arrays to disk.

    Attributes:
        model: word-vector lookup supporting `word in model` and `model[word]`.
        vector_dim: width of a single word vector.
        sentence_length: number of token rows in every embedded sentence.
        pattern: regex matching every non-word character.
        ops: spelling variants tried, in order, when looking a token up.
    '''
    def __init__(self, word2vec_dir, vector_dim, sentence_length):
        '''
        Args:
            word2vec_dir: path of the binary word2vec file to load.
            vector_dim: width of the vectors stored in that file.
            sentence_length: number of tokens kept (or padded to) per sentence.
        '''
        self.model = KeyedVectors.load_word2vec_format(word2vec_dir, binary=True)
        self.vector_dim = vector_dim
        self.sentence_length = sentence_length
        self.pattern = re.compile(r"[^\w]")
        replace_op = lambda x: self.pattern.sub('', x)
        # Lookup fallbacks: the raw token first, then case variants, then the
        # same variants with all non-word characters (punctuation) removed.
        self.ops = [lambda x: x, lambda x: x.lower(), lambda x: x.capitalize(), lambda x: x.upper(), \
                   lambda x: replace_op(x), lambda x: replace_op(x).lower(), \
                    lambda x: replace_op(x).capitalize(), lambda x: replace_op(x).upper()]

    @staticmethod
    def _join_text(*parts):
        '''Join the non-empty text parts with a single space (None/'' are skipped).'''
        return ' '.join(part for part in parts if part)

    def embed_sentence(self, sentence):
        '''
        Embed a whitespace-tokenised sentence.

        Only the first `sentence_length` tokens are used. A token found in the
        model (under any spelling produced by `self.ops`) contributes its
        vector; any other token gets a fresh random vector drawn uniformly from
        [-0.25, 0.25) on every call, so results are only reproducible if the
        NumPy global random state is seeded beforehand. Remaining rows are zero.

        Args:
            sentence: text to embed (str).

        Returns:
            Array of shape (1, sentence_length, vector_dim).
        '''
        words = sentence.strip().split()
        vectors = []
        for w in words[:self.sentence_length]:
            for op in self.ops:
                new_w = op(w)
                # `in self.model` instead of `in self.model.vocab`: the `vocab`
                # attribute is gone in gensim 4, while membership testing on
                # KeyedVectors works in both old and new releases.
                if new_w in self.model:
                    vectors.append(self.model[new_w].reshape((1, -1)))
                    break
            else:
                # for/else: no spelling variant was found in the model.
                vectors.append(np.random.uniform(low=-0.25, high=0.25, size=(1, self.vector_dim)))
        if len(vectors) < self.sentence_length:
            # Zero-pad short sentences up to sentence_length rows.
            vectors.append(np.zeros((self.sentence_length - len(vectors), self.vector_dim)))
        return np.concatenate(vectors, axis=0).reshape(1, self.sentence_length, self.vector_dim)

    def build_matrix(self, raw_dict, save_dir):
        '''
        Embed every (question, comment) pair of `raw_dict` and save the arrays.

        Args:
            raw_dict: mapping of thread key -> thread dictionary with the keys
                'qSubject', 'qBody', 'qUserID_INT' and 'comments'; each comment
                needs 'cBody', 'cLabel_INT' and 'cUserID_INT'.
            save_dir: file name handed to np.savez.

        The arrays are saved positionally, i.e. in this order:
            question embeddings (N, sentence_length, vector_dim),
            comment embeddings  (N, sentence_length, vector_dim),
            same-author flag    (N, 1): 1.0 if the comment author is the asker,
            label               (N, 1): cLabel_INT / 2,
            comment ids         (N, 1).

        Raises:
            ValueError: if `raw_dict` yields no (question, comment) pair.
        '''
        q_vectors = []
        a_vectors = []
        labels = []
        aug_data = []
        cid_list = []
        for thread in (raw_dict or {}).values():
            # Subject and body are each optional. The previous expression
            # `subject + ' ' + body if body else ''` dropped the subject
            # whenever the body was empty, because the conditional applies to
            # the whole concatenation.
            q_vector = self.embed_sentence(self._join_text(thread['qSubject'], thread['qBody']))
            q_id = thread['qUserID_INT']
            for cid, comment in thread['comments'].items():
                # The question embedding is repeated once per comment.
                q_vectors.append(q_vector)
                a_vectors.append(self.embed_sentence(comment['cBody'] if comment['cBody'] else ''))
                labels.append(np.array([[comment['cLabel_INT'] / 2]]))
                aug_data.append(np.array([[0.0 if q_id != comment['cUserID_INT'] else 1.0]]))
                cid_list.append([[cid]])
        if not labels:
            raise ValueError('raw_dict contains no question/comment pairs to embed')
        q_vec = np.concatenate(q_vectors, axis=0)
        a_vec = np.concatenate(a_vectors, axis=0)
        label_vec = np.concatenate(labels, axis=0)
        aug_vec = np.concatenate(aug_data, axis=0)
        cid_vec = np.concatenate(cid_list, axis=0)
        np.savez(save_dir, q_vec, a_vec, aug_vec, label_vec, cid_vec)

In [12]:
# Load the pretrained word vectors once; this single Text2Vec instance is
# reused by the cells that embed the train and the test set.
t2v = Text2Vec(word2vec_matrix, vector_size, sentence_length)

In [6]:
# Training set: parse the XML, then embed every question/comment pair and save
# the arrays under cQA_train_embedding_name.
rDE = rawDataExtractor(train_path, train_fileName)
# With test = False the whole file is extracted; testSize has no effect here.
raw_dict = rDE.extractInformation_QA(test = False, testSize = 50)
t2v.build_matrix(raw_dict, cQA_train_embedding_name)

In [13]:
# Test set: same pipeline as for the training set. Note that rDE and raw_dict
# are rebound here, so after this cell they refer to the test data.
rDE = rawDataExtractor(test_path, test_fileName)
# With test = False the whole file is extracted; testSize has no effect here.
raw_dict = rDE.extractInformation_QA(test = False, testSize = 50)
t2v.build_matrix(raw_dict, cQA_test_embedding_name)

In [46]:
# Scratch check: norm of the pretrained vector for the word 'grab'.
np.linalg.norm(t2v.model['grab'])

2.6442745

In [4]:
# Scratch input: a list of single-element lists, used to try out np.concatenate.
a = [['a'], ['b']]

In [10]:
# Concatenating the single-element lists along axis 0 yields one flat sequence,
# so each x printed below is a single string.
for x in np.concatenate(a, axis=0):
    print(str(x))

a
b
