In [1]:
import spacy
import string
nlp = spacy.load('en_core_web_sm')
import json
import re
import neuralcoref
neuralcoref.add_to_pipe(nlp)

# Read the CoQA dev file; its top-level 'data' entry holds the per-story records.
with open('../coqa-dev-v1.0.json','r') as coqa_file:
    coqa_payload = json.load(coqa_file)
context_data = coqa_payload['data']

# Keep the first story's text as a small sample for the exploratory cells below.
first_record = context_data[0]
context = first_record['story']

In [2]:
def space_extend(matchobj):
    """Return the text of a regex match with one space added on each side.

    Used as the replacement callback passed to ``re.sub`` in ``pre_proc``.
    """
    matched_text = matchobj.group(0)
    return ''.join((' ', matched_text, ' '))
def pre_proc(text):
    """Normalise raw text before it is handed to the tokenizer.

    Steps, in order:
      1. Surround each of these characters with a space on both sides:
         ``-``, the dashes U+2010..U+2015, ``%``, ``[``, ``]``, ``:``,
         ``(``, ``)``, ``/`` and tab.
      2. Strip leading/trailing spaces and newlines.
      3. Collapse every run of whitespace into a single space.

    Args:
        text: the input string.

    Returns:
        The normalised string.
    """
    # Raw strings: the previous non-raw literals contained invalid escape
    # sequences (``\[``, ``\(``, ``\s``) that newer Python versions warn about.
    # The character class matches exactly the same single characters as the
    # old alternation, and ``\g<0>`` re-inserts the match between two spaces
    # (the same result the ``space_extend`` callback produces).
    text = re.sub(r'[-\u2010-\u2015%\[\]:()/\t]', r' \g<0> ', text)
    text = text.strip(' \n')
    return re.sub(r'\s+', ' ', text)

def _str(s):
    """ Convert PTB tokens to normal tokens """
    if (s.lower() == '-lrb-'):
        s = '('
    elif (s.lower() == '-rrb-'):
        s = ')'
    elif (s.lower() == '-lsb-'):
        s = '['
    elif (s.lower() == '-rsb-'):
        s = ']'
    elif (s.lower() == '-lcb-'):
        s = '{'
    elif (s.lower() == '-rcb-'):
        s = '}'
    return s

def process(parsed_text):
    """Flatten a parsed document into parallel token and sentence lists.

    Args:
        parsed_text: an iterable of tokens exposing ``.text`` and ``.idx``,
            with a ``.sents`` attribute yielding sized sentence spans
            (presumably a spaCy ``Doc`` -- it is produced by ``nlp(...)``
            in ``get_sentence``).

    Returns:
        dict with keys
          'word'      -- token texts, passed through ``_str``;
          'offsets'   -- ``(token.idx, token.idx + len(token.text))`` pairs;
          'sentences' -- ``(first_token, one_past_last_token)`` index pairs,
                         built by accumulating the sentence lengths.
    """
    words, offsets = [], []
    for tok in parsed_text:
        begin = tok.idx
        words.append(_str(tok.text))
        offsets.append((begin, begin + len(tok.text)))

    sentence_spans = []
    cursor = 0
    for sent in parsed_text.sents:
        stop = cursor + len(sent)
        sentence_spans.append((cursor, stop))
        cursor = stop

    # The sentence spans must account for every token exactly once.
    assert cursor == len(words)
    return {'word': words, 'offsets': offsets, 'sentences': sentence_spans}

def entity_rec(text):
    """Return the ``.text`` of every entry in ``text.ents``, in order."""
    entity_texts = []
    for entity in text.ents:
        entity_texts.append(entity.text)
    return entity_texts



In [3]:
def get_sentence(context):
    """Split a story into a list of sentence strings.

    The text is normalised with ``pre_proc``, parsed with the module-level
    ``nlp`` pipeline, and cut at the sentence boundaries reported by
    ``process``.

    Args:
        context: the raw story text.

    Returns:
        list of str, one entry per detected sentence, in document order.
    """
    # The original ran pre_proc twice. On already-normalised text the second
    # pass can only remove a single boundary space (left behind when the input
    # starts or ends with whitespace other than ' ' or '\n'), so one pass plus
    # an explicit strip yields the same string without redoing the regex work.
    refined_context = pre_proc(context).strip(' ')
    nlp_context = nlp(refined_context)
    annotated_text = process(nlp_context)
    return [
        str(nlp_context[start:end])
        for start, end in annotated_text['sentences']
    ]

In [4]:
# Split the sample story into sentence strings.
sentences = get_sentence(context)

In [5]:
# Display the first sentence as a quick sanity check of the splitter.
sentences[0]

'Once upon a time, in a barn near a farm house, there lived a little white kitten named Cotton.'

In [6]:
# Questions attached to the first story record.
questions_text = context_data[0]['questions']

In [7]:
# Display the text of the second question.
questions_text[1]['input_text']

'Where did she live?'

In [8]:
import gensim.models as gm
import nltk
import numpy as np
from scipy.linalg import norm


# Location of the Doc2Vec model file. The commented-out line is an alternative
# location kept from an earlier layout.
#model_file = './Bert4CoQA/model/CoQA_en_doc2vecModel.bin'
model_file = './model/CoQA_en_doc2vecModel.bin'
# Module-level model used by vector_similarity / sentence_vector below.
model = gm.Doc2Vec.load(model_file)

def vector_similarity(s1, s2):
    """Cosine similarity between the Doc2Vec embeddings of two sentences.

    Each sentence is tokenised with ``nltk.word_tokenize`` and embedded with
    the module-level ``model.infer_vector``.

    Args:
        s1: first sentence (str).
        s2: second sentence (str).

    Returns:
        The cosine of the angle between the two inferred vectors, or ``0.0``
        when the product of their norms is zero or not finite, so degenerate
        input no longer produces ``nan``.
    """
    def sentence_vector(s):
        # No division by the token count: cosine similarity is unaffected by
        # scaling either vector, and dividing by len(words) broke on input
        # that tokenises to nothing. The unused np.zeros(256) is gone too.
        return model.infer_vector(nltk.word_tokenize(s))

    v1, v2 = sentence_vector(s1), sentence_vector(s2)
    denominator = norm(v1) * norm(v2)
    if denominator == 0 or not np.isfinite(denominator):
        return 0.0
    return np.dot(v1, v2) / denominator

In [9]:
# Spot-check: similarity between the second question and the second sentence.
vector_similarity(questions_text[1]['input_text'], sentences[1])

0.42112762427316425

In [10]:
import random
import pandas as pd
from tqdm import tqdm

In [11]:
def generate_data(context_data,
                  output_path='/ssd2/wangzd/code/outpt/assessment/dev/test_evidence.csv'):
    """Build (question, evidence sentence) pairs and append them to a CSV.

    For every question of every story, the answer's ``span_text`` is looked up
    in the story's sentences (as produced by ``get_sentence``). The first
    sentence containing the span is recorded as the evidence with label 1.
    Questions whose span is not found inside any single sentence are skipped.

    Negative (label 0) sampling existed only as commented-out code in the
    original and is still not performed.

    Args:
        context_data: sequence of story records, each with 'story',
            'questions' (items with 'input_text') and 'answers' (items with
            'span_text').
        output_path: CSV file to append to. Defaults to the location the
            original code hard-coded.

    Returns:
        None. The result is written to ``output_path``.
    """
    import os

    index_list = []
    question_list = []
    answer_list = []
    lable_list = []

    for story_index, data in enumerate(tqdm(context_data)):
        sentences = get_sentence(data['story'])

        # Questions and answers are parallel lists. The inner loop has its own
        # names so it no longer shadows the story index.
        for question, answer in zip(data['questions'], data['answers']):
            current_question = question['input_text']
            span_text = answer['span_text']
            for sentence in sentences:
                if span_text in sentence:
                    index_list.append(story_index)
                    question_list.append(current_question)
                    answer_list.append(sentence)
                    lable_list.append(1)
                    # Stop at the first hit. The original kept scanning with
                    # the search target overwritten by the matched sentence,
                    # which could only add duplicate rows.
                    break

    # Column names (including the 'lable' spelling) are kept because
    # downstream code reads them back by name.
    pair_df = pd.DataFrame({'index': index_list, 'question': question_list, 'evidence':answer_list, 'lable':lable_list})

    # Append mode is kept, but the header is written only when the file is new
    # or empty, so repeated runs no longer inject header rows into the data.
    write_header = not (os.path.exists(output_path) and os.path.getsize(output_path) > 0)
    pair_df.to_csv(output_path, mode = 'a', header = write_header)

In [12]:
# Build the question/evidence pairs for every story and append them to the CSV.
generate_data(context_data)

 16%|█▌        | 78/500 [00:18<01:39,  4.25it/s]


KeyboardInterrupt: 

In [13]:
    # Scratch expression: a list of ten zeros.
    [ 0 for _ in range(10)]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [14]:
def assessment(index, query, context):
    """Score a query against every sentence of a story and return the best score.

    Args:
        index: position of the story in the module-level ``context_data``.
        query: the question text.
        context: ignored. The story text is always looked up through
            ``index``, as in the original. Kept so existing calls still work.

    Returns:
        The highest ``vector_similarity(query, sentence)`` over the story's
        sentences, or ``None`` when the story yields no sentences.
    """
    story = context_data[index]['story']
    curr_sentences = get_sentence(story)
    # The original looped over range(<list>), which raises TypeError, compared
    # the query with the whole story instead of each sentence, and discarded
    # the final result.
    assess_matrix = [vector_similarity(query, sentence) for sentence in curr_sentences]
    if not assess_matrix:
        return None
    return max(assess_matrix)

In [15]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset
from torch.autograd import Variable

In [16]:
class DSSM(torch.nn.Module):
    """Two-input similarity model whose inputs share one encoder.

    Each input is embedded, summed over dimension 1, and passed through three
    tanh-activated linear layers (256, 128, 64 units), each followed by
    dropout (p=0.2). The output is the cosine similarity of the two encodings.

    The embedding table size and width come from the module-level
    ``CHAR_SIZE`` and ``embedding_size`` constants.
    """

    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(CHAR_SIZE, embedding_size)
        self.linear1 = nn.Linear(embedding_size, 256)
        self.linear2 = nn.Linear(256, 128)
        self.linear3 = nn.Linear(128, 64)
        self.dropout = nn.Dropout(p=0.2)

    def _encode(self, ids):
        """Run one input through the shared embedding and linear stack."""
        # Sum the embedding vectors of all index ids along dimension 1.
        hidden = self.embedding(ids).sum(1)
        for layer in (self.linear1, self.linear2, self.linear3):
            hidden = self.dropout(torch.tanh(layer(hidden)))
        return hidden

    def forward(self, a, b):
        """Return the cosine similarity (along dim 1) of the encodings of a and b."""
        # `a` is fully encoded before `b`, matching the original order of
        # dropout calls.
        encoded_a = self._encode(a)
        encoded_b = self._encode(b)
        return torch.cosine_similarity(encoded_a, encoded_b, dim=1, eps=1e-8)

    def _initialize_weights(self):
        """Apply Xavier-uniform initialisation to every linear layer's weight."""
        linear_layers = (m for m in self.modules() if isinstance(m, nn.Linear))
        for layer in linear_layers:
            torch.nn.init.xavier_uniform_(layer.weight)

In [17]:
# Arguments of nn.Embedding(CHAR_SIZE, embedding_size) inside DSSM. CHAR_SIZE
# is also the width each sentence vector is reshaped to in load_data.
CHAR_SIZE=2000
embedding_size=300

# Training settings: number of epochs, DataLoader batch size, Adam learning rate.
EPOCH=500
BATCH_SIZE=1
LR=0.0005

In [14]:
# Load the question/evidence table (presumably written by generate_data -- the
# columns shown further down match; confirm).
data = pd.read_csv('/ssd2/wangzd/code/Bert4CoQA/assessment_dev/test_evidence_all.csv')


In [18]:
def sentence_vector(s):
    """Embed a sentence with the module-level Doc2Vec ``model``.

    Args:
        s: the sentence text.

    Returns:
        The vector returned by ``model.infer_vector`` for the
        ``nltk.word_tokenize`` tokens of ``s``. No length normalisation is
        applied.
    """
    # The np.zeros(2000) placeholder was removed because it was overwritten
    # immediately and never used.
    words = nltk.word_tokenize(s)
    return model.infer_vector(words)

In [19]:
# Display the evidence table (re-reads the CSV rather than reusing `data`).
pd.read_csv('/ssd2/wangzd/code/Bert4CoQA/assessment_dev/test_evidence_all.csv')

Unnamed: 0.1,Unnamed: 0,index,question,evidence,lable
0,0.0,0,What color was Cotton?,"Once upon a time, in a barn near a farm house,...",1
1,1.0,0,Where did she live?,"Once upon a time, in a barn near a farm house,...",1
2,2.0,0,Did she live alone?,But Cotton wasn't alone in her little home abo...,1
3,3.0,0,Who did she live with?,She shared her hay bed with her mommy and 5 ot...,1
4,4.0,0,What color were her sisters?,The rest of her sisters were all orange with b...,1
...,...,...,...,...,...
1977,7.0,0,Whose paint was it?,"So one day, when Cotton found a can of the old...",1
1978,8.0,0,What did Cotton's mother and siblings do when ...,When her mommy and sisters found her they star...,1
1979,9.0,0,Where did Cotton's mother put her to clean the...,"And with that, Cotton's mommy picked her up an...",1
1980,10.0,0,What did the other cats do when Cotton emerged...,Her sisters licked her face until Cotton's fur...,1


In [20]:
# Peek at the question text stored under label 3 of `data`.
data['question'][3]

NameError: name 'data' is not defined

In [21]:
# Sample sentence for trying out sentence_vector.
# NOTE(review): this rebinds `string`, shadowing the `string` module imported
# in the first cell.
string = 'I love you'

In [22]:
# Display the inferred vector for the sample sentence.
sentence_vector(string)

array([ 0.00726436, -0.02657673,  0.03555737, ...,  0.00562229,
        0.01745546, -0.02168034], dtype=float32)

In [27]:
def load_data(path):
    """Load a question/evidence CSV and turn it into model inputs.

    For every data row the story referenced by the 'index' column is split
    into sentences, each sentence and the question are embedded with
    ``sentence_vector``, and the position of the 'evidence' sentence within
    the story is looked up.

    Args:
        path: CSV file with at least the columns 'index', 'question' and
            'evidence'.

    Returns:
        Tuple ``(context_mt, query_mt, evidence_mt)`` of numpy arrays built
        from, per row: the list of ``(1, CHAR_SIZE)`` sentence vectors, the
        ``(1, CHAR_SIZE)`` question vector, and the evidence sentence index.

    Raises:
        ValueError: if a row's evidence text is not one of the story's
            sentences, or a vector cannot be reshaped to ``(1, CHAR_SIZE)``.
    """
    data = pd.read_csv(path)
    context_mt = []
    query_mt = []
    evidence_mt = []
    # Sentence splitting gives the same result for a given story, so it is
    # done once per story rather than once per CSV row.
    sentences_by_story = {}

    for i in tqdm(range(len(data))):
        raw_index = data['index'][i]
        # Skip header rows that ended up in the data from appended CSV writes.
        # The original compared int(...) with the string 'index', which was
        # always true and raised ValueError on such a row.
        if str(raw_index).strip() == 'index':
            continue
        index = int(raw_index)
        if index not in sentences_by_story:
            sentences_by_story[index] = get_sentence(context_data[index]['story'])
        curr_sentences = sentences_by_story[index]

        sentences_mt = [
            sentence_vector(sentence).reshape(1,CHAR_SIZE)
            for sentence in curr_sentences
        ]
        context_mt.append(sentences_mt)
        query_mt.append(sentence_vector(data['question'][i]).reshape(1,CHAR_SIZE))
        evidence_mt.append(curr_sentences.index(data['evidence'][i]))
    # NOTE(review): when stories have different sentence counts, context_mt is
    # ragged and cannot form a regular numeric array. Padding or per-item
    # tensors would be needed before batching -- confirm intended handling.
    return np.array(context_mt), np.array(query_mt), np.array(evidence_mt)

In [28]:
class CoQADataset(Dataset):
    """Dataset over the arrays produced by ``load_data`` for one CSV file.

    Item ``idx`` is the tuple ``(context_mt[idx], query_mt[idx],
    evidence_mt[idx])``. The length is that of ``context_mt``.
    """

    def __init__(self,filepath):
        self.path = filepath
        loaded = load_data(filepath)
        self.context_mt, self.query_mt, self.evidence_mt = loaded

    def __getitem__(self, idx):
        columns = (self.context_mt, self.query_mt, self.evidence_mt)
        return tuple(column[idx] for column in columns)

    def __len__(self):
        return len(self.context_mt)

import pickle

# Pickle `data` to disk. The `with` block closes the file handle (the original
# left it open, so buffered bytes could stay unflushed).
with open('/ssd2/wangzd/code/outpt/assessment/dev/data.pkl', 'wb') as output:
    pickle.dump(data, output)

测试数据类

# Smoke-test the dataset class on one CSV file.
test = CoQADataset('/ssd2/wangzd/code/Bert4CoQA/assessment_dev/test_evidence0.csv')

创建数据集

In [29]:
# Directory holding the evidence CSVs, and the train/test files inside it.
data_root = '/ssd2/wangzd/code/Bert4CoQA/assessment_dev/'
train_path = ''.join([data_root, 'test_evidence0.csv'])
test_path = ''.join([data_root, 'test_evidence0_new.csv'])

In [30]:
# 1. Create the datasets and the data loaders
# NOTE: remember to change the test set -- test_data currently aliases the
# training set and test_path is not used here.
train_data=CoQADataset(train_path)
test_data=train_data

100%|██████████| 198/198 [01:04<00:00,  3.09it/s]


In [108]:
# Inspect the first component (context vectors) of the first dataset item.
train_data[0][0]

SyntaxError: unexpected EOF while parsing (<ipython-input-108-0ecb6ea30f69>, line 1)

In [115]:
# Inspect the third component (evidence sentence index) of the first dataset item.
train_data[0][2]

0

TypeError: Variable data has to be a tensor, but got numpy.ndarray

In [116]:
# 1. Create the datasets and the data loaders
# NOTE: remember to change the test set
train_loader=DataLoader(dataset=train_data,batch_size=BATCH_SIZE,shuffle=True)
test_loader=DataLoader(dataset=test_data,batch_size=BATCH_SIZE,shuffle=True)

# 2. Use the GPU if one is available, otherwise the CPU
# NOTE(review): the CUDA device index is hard-coded to 2.
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
dssm=DSSM().to(device)
dssm._initialize_weights()

# 3. Define the optimizer and the loss function
# NOTE(review): loss_func is not referenced by the training loop in this
# notebook, which computes its own negative log-likelihood.
optimizer=torch.optim.Adam(dssm.parameters(),lr=LR)
loss_func=nn.CrossEntropyLoss()

In [117]:
import torch.nn.functional as F

In [118]:
# Training loop. Every 20 steps the model is evaluated on test_loader.
# The batch variables have their own names so the loop no longer overwrites
# the notebook-level `context` string.
for epoch in range(EPOCH):
    for step,(context_batch,query_batch,label_batch) in enumerate(train_loader):
        # 1. Move the batch to the device as long tensors.
        # NOTE(review): .long() casts the float sentence vectors to integers,
        # which looks like it discards their values -- confirm this is the
        # intended input for the embedding layer.
        a = context_batch.to(device).long()
        b = query_batch.to(device).long()
        l = label_batch.to(device).long()
        # 2. Cosine similarity of the query against each sentence. One chunk
        # per entry along dim 1, instead of the previously hard-coded 27.
        sentence_chunks = a.chunk(a.size(1), 1)
        x = [dssm(chunk, b) for chunk in sentence_chunks]
        re_uni = torch.cat(x, 1)
        # 3. Negative log-probability of the labelled sentence.
        # l.item() requires a single label, i.e. BATCH_SIZE == 1.
        out = F.softmax(re_uni, dim=1)[:,l.item()]
        loss = -torch.log(torch.prod(out))

        # 4. Standard optimisation step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (step+1) % 20 == 0:
            total=0
            correct=0
            # Evaluate with dropout disabled and without building a graph.
            dssm.eval()
            with torch.no_grad():
                for (test_a,test_b,test_l) in test_loader:
                    # Use the test batch. The original re-used the current
                    # training batch here, so the reported accuracy never
                    # reflected test_loader's contents.
                    tst_a = test_a.to(device).long()
                    tst_b = test_b.to(device).long()
                    tst_l = test_l.to(device).long()
                    tst_chunks = tst_a.chunk(tst_a.size(1), 1)
                    tst_scores = torch.cat([dssm(chunk, tst_b) for chunk in tst_chunks], 1)
                    pred = torch.max(F.softmax(tst_scores, dim=1),1)[1]
                    if pred.size()==tst_l.size():
                        total+=tst_l.size(0)
                        correct+=(pred==tst_l).sum().item()
            # Back to training mode for the next optimisation steps.
            dssm.train()
            # Avoid ZeroDivisionError when no test batch was counted.
            accuracy = (correct*1.0/total) if total else float('nan')
            print('[Epoch]:',epoch+1,'训练loss:',loss.item())
            print('[Epoch]:',epoch+1,'测试集准确率: ',accuracy)


TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found object

In [None]:
# Serialise the whole module object (not just its state_dict) to ./dssm.pkl.
torch.save(dssm, './dssm.pkl')