# Data preprocessing for the Question Retrieval task

In [16]:
import numpy as np
from gensim.models import KeyedVectors
import re
import xml.etree.ElementTree as ET
import datetime

# Text cleaning

In [17]:
# Maximum number of tokens kept per text; Text2Vec truncates or zero-pads to this length.
sentence_length = 100
# Dimensionality of each word vector (passed to Text2Vec as vector_dim).
vector_size = 300
# Directory and file names of the training XML files (two parts).
# The directory strings end with '/' because RawDataExtractor concatenates path + fileName.
train_path = 'v3.2/train/'
train_fileName1 = 'SemEval2016-Task3-CQA-QL-train-part1.xml'
train_fileName2 = 'SemEval2016-Task3-CQA-QL-train-part2.xml'
# Directory and file name of the test XML file.
test_path = 'v3.2/test/'
test_fileName = 'SemEval2016-Task3-CQA-QL-test.xml'
# Word2vec file loaded with KeyedVectors.load_word2vec_format(..., binary=True).
word2vec_matrix = 'GoogleNews-vectors-negative300.bin'
# Base names of the output archives written by np.savez ('.npz' is appended on save).
cQQ_train_embedding_name = 'cQQ_train_embedding'
cQQ_test_embedding_name = 'cQQ_test_embedding'

In [18]:
class RawDataExtractor:
    '''
    Parses a community-QA XML file (the notebook uses the SemEval-2016 Task 3
    files) and exposes its content as nested python dictionaries.

    Two views are available:
      * extractInformation_QA: related question -> its comments.
      * extractInformation_QQ: original question -> its candidate related questions.

    Note: the '*_UNIX' values come from calling .timestamp() on a naive
    datetime, so they are interpreted in the local time zone of the machine
    running the notebook.
    '''

    # Layout of the RELQ_DATE / RELC_DATE attributes.
    _DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
    # Textual label -> integer label; any label not listed here maps to 1.
    _COMMENT_LABELS = {'Good': 2, 'Bad': 0}
    _QUESTION_LABELS = {'PerfectMatch': 2, 'Irrelevant': 0}
    _DEFAULT_LABEL = 1

    def __init__(self, path, fileName):
        '''
        path: directory prefix; it is concatenated directly with fileName, so it
              must already end with a path separator.
        fileName: name of the XML file to parse.
        '''
        self.tree = ET.parse(path + fileName)
        self.root = self.tree.getroot()

    @classmethod
    def _to_unix(cls, stamp):
        '''Convert a 'YYYY-MM-DD HH:MM:SS' string into a float timestamp.'''
        return datetime.datetime.strptime(stamp, cls._DATE_FORMAT).timestamp()

    @staticmethod
    def _user_id_to_int(user_id):
        '''Convert a user id such as 'U123' into the integer 123.'''
        return int(user_id.replace('U', ''))

    # Use this function to load QA related data ONLY!
    def extractInformation_QA(self, test = False, testSize = 5):
        '''
        This function returns a python dictionary which has the following structure:
        infoDic = {
            'Q1_R1': {
                'qTime': string
                'qTime_UNIX': float
                'qSubject': string
                'qUserID': string
                'qUserID_INT': int
                'qBody': string
                'comments': {
                    'cID': {
                        'cTime': string
                        'cTime_UNIX': float
                        'cUserID': string
                        'cUserID_INT': int
                        'cBody': string
                        'cLabel': string  
                        'cLabel_INT': int
                    }  
                    ...
                }
            }
            ...
        }

        test: when True, only the first `testSize` children of the root are read.
        testSize: number of children to read in test mode (ignored otherwise).

        Returns None (after printing a message) if a THREAD_SEQUENCE key occurs
        more than once.
        '''
        infoDic = {}
        for count, child in enumerate(self.root, start=1):
            # For testing ONLY: stop after testSize threads.
            if test and count > testSize:
                return infoDic
            # Get the question key
            currentQuestionKey = child.attrib.get('THREAD_SEQUENCE')
            # Get the question dictionary
            currentQuestion = self.extractSingleInformation_QA(child)
            if currentQuestionKey in infoDic:
                # Report which key collided (the original message never filled in '%s').
                print('%s key has already existed. Info extraction failed...' % currentQuestionKey)
                return None
            infoDic[currentQuestionKey] = currentQuestion
        return infoDic

    def extractSingleInformation_QA(self, child):
        '''
        Build the dictionary for one thread element.

        The first sub-element is treated as the question when it carries a
        RELQ_ID attribute; every other sub-element is treated as a comment.
        '''
        singleInfoDic = {}
        singleInfoDic['comments'] = {}
        for index, element in enumerate(child):
            if index == 0 and element.attrib.get('RELQ_ID'):
                # Question: time, user id (string and int), subject, body.
                qt = element.attrib.get('RELQ_DATE')
                singleInfoDic['qTime'] = qt
                singleInfoDic['qTime_UNIX'] = self._to_unix(qt)
                singleInfoDic['qUserID'] = element.attrib.get('RELQ_USERID')
                singleInfoDic['qUserID_INT'] = self._user_id_to_int(singleInfoDic['qUserID'])
                singleInfoDic['qSubject'] = element[0].text
                singleInfoDic['qBody'] = element[1].text
            else:
                # Comment: keyed by its RELC_ID.
                commentKey = element.attrib.get('RELC_ID')
                ct = element.attrib.get('RELC_DATE')
                cUserID = element.attrib.get('RELC_USERID')
                label = element.attrib.get('RELC_RELEVANCE2RELQ')
                singleInfoDic['comments'][commentKey] = {
                    'cTime': ct,
                    'cTime_UNIX': self._to_unix(ct),
                    'cUserID': cUserID,
                    'cUserID_INT': self._user_id_to_int(cUserID),
                    'cBody': element[0].text,
                    'cLabel': label,
                    # 'Good' -> 2, 'Bad' -> 0, anything else -> 1
                    'cLabel_INT': self._COMMENT_LABELS.get(label, self._DEFAULT_LABEL),
                }
        return singleInfoDic

    # Use this function to load QQ related data ONLY!
    def extractInformation_QQ(self, test = False, testSize = 30):
        '''
        This function returns a python dictionary which has the following structure:
        infoDic = {
            'Q1': {
                'qTargetSubject': string
                'qTargetBody': string
                'availableQs': {
                    availableQID :{
                        'qTime': string
                        'qTime_UNIX': float
                        'qSubject': string
                        'qUserID': string
                        'qUserID_INT': int
                        'qBody': string
                        'qLabel': string
                        'qLabel_INT': int
                        'qRankingOrder': int
                        'qCategory': string
                    }
                    ...  
                }
            }
            ...
        }

        test: when True, only the first `testSize` children of the root are read.
        testSize: number of children to read in test mode (ignored otherwise).

        Each child of the root carries one original question (ORGQ_ID) together
        with a single candidate thread; children sharing an ORGQ_ID are merged
        into the same entry, whether or not they are adjacent in the file.
        '''
        infoDic = {}
        for count, child in enumerate(self.root, start=1):
            # For testing ONLY: stop after testSize entries.
            if test and count > testSize:
                return infoDic
            # Get the question key
            currentQuestionKey = child.attrib.get('ORGQ_ID')

            # A new target question. Checking membership (rather than comparing
            # with the previous key only) keeps already collected candidates
            # when the same ORGQ_ID shows up again later in the file.
            if currentQuestionKey not in infoDic:
                infoDic[currentQuestionKey] = {
                    'qTargetSubject': child[0].text,
                    'qTargetBody': child[1].text,
                    'availableQs': {},
                }

            # Handling the candidate question: child[2] is the thread element,
            # its first sub-element is the related question.
            thread = child[2]
            relQuestion = thread[0]
            availQKey = thread.attrib.get('THREAD_SEQUENCE')
            qt = relQuestion.attrib.get('RELQ_DATE')
            qUserID = relQuestion.attrib.get('RELQ_USERID')
            currentLabel = relQuestion.attrib.get('RELQ_RELEVANCE2ORGQ')
            infoDic[currentQuestionKey]['availableQs'][availQKey] = {
                'qTime': qt,
                'qTime_UNIX': self._to_unix(qt),
                'qUserID': qUserID,
                'qUserID_INT': self._user_id_to_int(qUserID),
                'qSubject': relQuestion[0].text,
                'qBody': relQuestion[1].text,
                'qLabel': currentLabel,
                # 'PerfectMatch' -> 2, 'Irrelevant' -> 0, anything else -> 1
                'qLabel_INT': self._QUESTION_LABELS.get(currentLabel, self._DEFAULT_LABEL),
                'qRankingOrder': int(relQuestion.attrib.get('RELQ_RANKING_ORDER')),
                'qCategory': relQuestion.attrib.get('RELQ_CATEGORY'),
            }

        return infoDic
    

# Word embedding

In [19]:
class Text2Vec:
    '''
    Turns texts into fixed-size matrices of word vectors and assembles the
    arrays saved for the question-question task.

    word2vec_model: object supporting `model[word]` and a vocabulary lookup
                    through `key_to_index` (gensim >= 4), `vocab` (gensim < 4)
                    or the `in` operator.
    vector_dim: dimensionality of each word vector.
    sentence_length: number of token rows in every embedded text.
    seed: optional seed for the random vectors given to out-of-vocabulary
          tokens. When None (default) numpy's global random state is used.
    '''
    def __init__(self, word2vec_model, vector_dim, sentence_length, seed=None):
        self.model = word2vec_model
        self.vector_dim = vector_dim
        self.sentence_length = sentence_length
        # Matches every non-word character; used to strip punctuation from tokens.
        self.pattern = re.compile(r"[^\w]")
        # A dedicated generator makes OOV vectors reproducible when a seed is given.
        self._rng = np.random.RandomState(seed) if seed is not None else np.random
        replace_op = lambda x: self.pattern.sub('', x)
        # Token variants tried in order until one is found in the vocabulary:
        # the raw token in four casings, then the punctuation-stripped token
        # in the same four casings.
        self.ops = [lambda x: x, lambda x: x.lower(), lambda x: x.capitalize(), lambda x: x.upper(),
                    lambda x: replace_op(x), lambda x: replace_op(x).lower(),
                    lambda x: replace_op(x).capitalize(), lambda x: replace_op(x).upper()]

    def _has_word(self, word):
        '''Vocabulary membership test that works across gensim versions.'''
        lookup = getattr(self.model, 'key_to_index', None)
        if lookup is None:
            # Older gensim releases expose the vocabulary as `vocab`.
            lookup = getattr(self.model, 'vocab', None)
        if lookup is None:
            return word in self.model
        return word in lookup

    @staticmethod
    def _join_text(subject, body):
        '''
        Join subject and body with a space, treating None/empty as ''.

        The parentheses matter: without them the conditional expression
        swallows the whole concatenation and the subject is lost whenever the
        body is missing.
        '''
        return (subject if subject else '') + ' ' + (body if body else '')

    def embed_sentence(self, sentence):
        '''
        Embed a whitespace-tokenised text.

        Only the first `sentence_length` tokens are used. A token absent from
        the vocabulary in all tried variants gets a uniform random vector in
        [-0.25, 0.25); missing trailing positions are zero-filled.

        Returns an array of shape (1, sentence_length, vector_dim).
        '''
        words = sentence.strip().split()
        vectors = []
        for w in words[:self.sentence_length]:
            for op in self.ops:
                new_w = op(w)
                if self._has_word(new_w):
                    vectors.append(self.model[new_w].reshape((1, -1)))
                    break
            else:
                # No variant of the token is known to the model.
                vectors.append(self._rng.uniform(low=-0.25, high=0.25, size=(1, self.vector_dim)))
        missing = self.sentence_length - len(vectors)
        if missing > 0:
            vectors.append(np.zeros((missing, self.vector_dim)))
        return np.concatenate(vectors, axis=0).reshape(1, self.sentence_length, self.vector_dim)

    def build_matrix(self, raw_dict, save_dir, thread_size=10):
        '''
        Embed every (original question, candidate question) pair and save the
        result with np.savez.

        raw_dict: dictionary with the structure produced by
                  RawDataExtractor.extractInformation_QQ.
        save_dir: target file name handed to np.savez.
        thread_size: required number of candidates per original question;
                     threads of any other size are reported and skipped.

        Arrays are saved positionally, so they load back as:
            arr_0: original-question embeddings, (n, sentence_length, vector_dim)
            arr_1: candidate-question embeddings, (n, sentence_length, vector_dim)
            arr_2: one-hot rank of each candidate's qRankingOrder inside its
                   thread, (n, thread_size)
            arr_3: labels scaled to [0, 1] (qLabel_INT / 2), (n, 1)
            arr_4: candidate ids, (n, 1)

        Raises ValueError when no thread has the required size.
        '''
        q_vectors = []
        a_vectors = []
        labels = []
        aug_data = []
        cid_list = []
        for thread in raw_dict.values():
            candidates = thread['availableQs']
            # Validate first so no embedding work is spent on skipped threads.
            if len(candidates) != thread_size:
                print('Invalid thread length: ', len(candidates))
                continue
            q_vector = self.embed_sentence(self._join_text(thread['qTargetSubject'], thread['qTargetBody']))
            ranking_orders = []
            for cid, comment in candidates.items():
                q_vectors.append(q_vector)
                a_vectors.append(self.embed_sentence(self._join_text(comment['qSubject'], comment['qBody'])))
                labels.append(np.array([[comment['qLabel_INT'] / 2]]))
                cid_list.append(np.array([[cid]]))
                ranking_orders.append(comment['qRankingOrder'])
            # argsort alone yields the indices that would sort the list; applying
            # it twice yields each candidate's rank, which is what row i of the
            # one-hot block must encode. Both agree when the list is already sorted.
            ranks = np.argsort(np.argsort(ranking_orders, kind='stable'), kind='stable')
            for rank in ranks:
                aug = np.zeros((1, thread_size))
                aug[0, rank] = 1.0
                aug_data.append(aug)
        if not q_vectors:
            raise ValueError('No thread with exactly %d candidate questions; nothing to save.' % thread_size)
        q_vec = np.concatenate(q_vectors, axis=0)
        a_vec = np.concatenate(a_vectors, axis=0)
        label_vec = np.concatenate(labels, axis=0)
        aug_vec = np.concatenate(aug_data, axis=0)
        cid_vec = np.concatenate(cid_list, axis=0)
        np.savez(save_dir, q_vec, a_vec, aug_vec, label_vec, cid_vec)

In [20]:
# Load the pretrained word vectors stored in word2vec binary format.
w2v_model = KeyedVectors.load_word2vec_format(word2vec_matrix, binary=True)

In [21]:
# Embedder shared by the train and test cells below.
t2v = Text2Vec(w2v_model, vector_size, sentence_length)

In [22]:
# Parse both parts of the training set. With test = False the whole file is
# read and testSize has no effect.
rDE1 = RawDataExtractor(train_path, train_fileName1)
raw_dict1 = rDE1.extractInformation_QQ(test = False, testSize = 50)
rDE2 = RawDataExtractor(train_path, train_fileName2)
raw_dict2 = rDE2.extractInformation_QQ(test = False, testSize = 50)
# Merge the two parts; on a duplicated ORGQ_ID the entry from part 2 wins.
raw_dict = {**raw_dict1, **raw_dict2}
# Embed and save the training arrays; threads that do not have exactly 10
# candidate questions are reported and skipped.
t2v.build_matrix(raw_dict, cQQ_train_embedding_name)

Invalid thread length:  9


In [23]:
# Same pipeline for the test set. Note that raw_dict is rebound here, so the
# training dictionary from the previous cell is no longer reachable by that name.
rDE = RawDataExtractor(test_path, test_fileName)
raw_dict = rDE.extractInformation_QQ(test = False, testSize = 50)
t2v.build_matrix(raw_dict, cQQ_test_embedding_name)

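You may inspect the generated arrays using the code below. The archive stores them positionally: `arr_0` (original-question embeddings), `arr_1` (candidate-question embeddings), `arr_2` (one-hot ranking-order features), `arr_3` (labels) and `arr_4` (candidate IDs). The example prints the first 30 rows of `arr_2`.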

In [24]:
# np.savez appended '.npz' to the base name when the archive was written.
npz_data = np.load(cQQ_train_embedding_name + '.npz')

In [25]:
# 'arr_2' is the third array passed to np.savez in build_matrix (aug_vec,
# the one-hot ranking features); show the first 30 rows.
npz_data['arr_2'][:30]

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 