In [2]:
!pip install jieba

Looking in indexes: http://mirrors.tencentyun.com/pypi/simple
Collecting jieba
  Downloading http://mirrors.tencentyun.com/pypi/packages/c6/cb/18eeb235f833b726522d7ebed54f2278ce28ba9438e3135ab0278d9792a2/jieba-0.42.1.tar.gz (19.2 MB)
[K     |████████████████████████████████| 19.2 MB 783 kB/s eta 0:00:01
[?25hBuilding wheels for collected packages: jieba
  Building wheel for jieba (setup.py) ... [?25ldone
[?25h  Created wheel for jieba: filename=jieba-0.42.1-py3-none-any.whl size=19314478 sha256=2cf68e2c7eadaaa16b73f320033d4344c819555a06e50ce0b70d2b6c6439ce34
  Stored in directory: /home/tione/.cache/pip/wheels/de/99/39/55dd43d023169a4464b9118a252e188367c3750c62526c46f3
Successfully built jieba
Installing collected packages: jieba
Successfully installed jieba-0.42.1


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TextCnn(nn.Module):
    """Convolutional text classifier.

    Pipeline: embedding lookup -> one 2-D convolution branch per kernel size
    -> ReLU -> max pooling over the whole sequence axis -> concatenation ->
    dropout -> linear layer producing the logits.

    Args:
        embed_num: number of rows of the embedding table.
        embed_dim: width of every embedding vector.
        class_num: number of logits returned per sample.
        kernel_num: output channels of each convolution branch.
        kernel_sizes: sequence of window heights (in tokens); one branch each.
        dropout: drop probability applied to the concatenated pooled features.
    """

    def __init__(self, embed_num, embed_dim, class_num, kernel_num, kernel_sizes, dropout = 0.5):
        super().__init__()

        in_channels = 1
        out_channels = kernel_num

        self.embed = nn.Embedding(embed_num, embed_dim)

        # Every branch spans the full embedding width and zero-pads the
        # sequence axis by 2 positions on both sides.
        branches = []
        for window in kernel_sizes:
            branches.append(
                nn.Conv2d(in_channels, out_channels, (window, embed_dim), padding = (2, 0))
            )
        self.convs1 = nn.ModuleList(branches)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(out_channels * len(kernel_sizes), class_num)

    def forward(self, x):
        """Map token ids of shape (N, token_num) to logits of shape (N, class_num)."""
        # (N, token_num, embed_dim) -> (N, 1, token_num, embed_dim)
        embedded = self.embed(x).unsqueeze(1)

        pooled = []
        for conv in self.convs1:
            # (N, Co, L, 1) -> (N, Co, L); L = token_num + 4 - window + 1 due to the padding
            feature_map = F.relu(conv(embedded)).squeeze(3)
            # Pool over the entire sequence axis: (N, Co, L) -> (N, Co)
            pooled.append(F.max_pool1d(feature_map, feature_map.size(2)).squeeze(2))

        features = torch.cat(pooled, 1)  # (N, Co * len(kernel_sizes))
        return self.fc(self.dropout(features))  # (N, class_num)

In [5]:
import jieba

In [None]:
class MultimodaFeaturesDataset(Dataset):
    """Dataset yielding video/audio features, tokenized text and a multi-hot label.

    The meta file selected through ``dataset_config`` stores each sample as a
    group of ``data_num_per_sample`` (6) lines. The first five lines of a
    group are read as: video feature path, audio feature path, image path,
    text path and a comma-separated label string. The sixth line is never
    read (presumably a blank separator - confirm against the data).

    Args:
        dataset_config: mapping providing ``text_max_len``, ``device``,
            ``train_data_path``, ``val_data_path``, ``test_data_path``,
            ``bert_path`` and ``label_id_path``.
        job: ``'training'`` selects the training meta file; ``'valdation'``
            (historical spelling, kept for existing callers) or
            ``'validation'`` selects the validation meta file; any other
            value selects the test meta file.
    """

    # Length of the multi-hot label vector built by ``preprocess``.
    NUM_LABELS = 82

    def __init__(self,dataset_config,job='training'):
        self.data_num_per_sample = 6 # every sample occupies 6 lines of the meta file
        self.text_max_len = dataset_config['text_max_len']
        self.device = dataset_config['device']

        if job == 'training':
            self.meta_path = dataset_config['train_data_path']
        elif job in ('valdation', 'validation'):
            # Accept the correct spelling too; previously 'validation'
            # silently fell through to the test split.
            self.meta_path = dataset_config['val_data_path']
        else:
            self.meta_path = dataset_config['test_data_path']
        self.tokenizer = BertTokenizer.from_pretrained(dataset_config['bert_path'])

        # label name -> integer id, one tab-separated pair per line
        self.label2id = {}
        with open(dataset_config['label_id_path'],'r') as f:
            for line in f:
                line = line.strip('\r\n')
                if not line:
                    # tolerate blank lines (e.g. a trailing newline at EOF)
                    continue
                fields = line.split('\t')
                self.label2id[fields[0]] = int(fields[1])

        # Filled lazily by __len__ so the meta file is scanned only once.
        self._num_samples = None

    def __getitem__(self, index):
        """Return ``(video, audio, text_ids, text_attention_mask, label_ids)`` for sample ``index``.

        Raises:
            IndexError: if ``index`` is outside ``[0, len(self))``; this also
                lets plain ``for sample in dataset`` iteration terminate.
        """
        if not 0 <= index < len(self):
            raise IndexError('sample index out of range: {}'.format(index))
        # Collect the five data lines of this sample. linecache line numbers
        # are 1-based and the exclusive stop skips the sixth line of the group.
        first_line = self.data_num_per_sample*index+1
        stop_line = self.data_num_per_sample*(index+1)
        data_list = []
        for line_i in range(first_line,stop_line):
            line = linecache.getline(self.meta_path,line_i)
            data_list.append(line.strip('\r\n'))
        return self.preprocess(data_list)

    def __len__(self):
        """Number of samples: line count of the meta file floor-divided by 6."""
        # getattr keeps this working for instances that bypassed __init__.
        if getattr(self,'_num_samples',None) is None:
            # Counting once is safe: linecache (used by __getitem__) caches
            # the file content as well, so the file is treated as static.
            with open(self.meta_path,'r') as f:
                line_count = sum(1 for _ in f)
            self._num_samples = line_count//self.data_num_per_sample
        return self._num_samples

    def preprocess(self,data_list):
        """Load and encode one sample.

        Args:
            data_list: five strings - video path, audio path, image path,
                text path and comma-separated label names. The image path is
                currently unused.

        Returns:
            Tuple ``(video, audio, text_ids, text_attention_mask, dense_label_ids)``
            where ``dense_label_ids`` is a float vector of length
            ``NUM_LABELS`` holding 1 at the id of every label of the sample.
        """
        video_path,audio_path,image_path,text_path,label = data_list

        #--------------- video ----------------#
        video = torch.tensor(np.load(video_path).astype(np.float32))

        #--------------- audio ----------------#
        if os.path.exists(audio_path):
            audio = torch.tensor(np.load(audio_path).astype(np.float32))
        else:
            # No audio feature file: substitute uniform random features with
            # one 128-wide row per video row.
            audio = torch.tensor(np.random.random((video.shape[0],128)).astype(np.float32))

        #--------------- text ----------------#
        dic = {}  # stays empty when the text file has no lines
        with open(text_path,'r') as f:
            for line in f:
                # NOTE(review): eval() executes arbitrary code found in the
                # file. If these files can come from an untrusted source,
                # switch to ast.literal_eval / json.loads.
                # Only the dict from the last line is kept.
                dic = eval(line)

        # Keep only CJK characters of every field and concatenate the fields.
        text = ''
        for key in dic:
            text += ''.join(re.findall('[\u4e00-\u9fa5]',dic[key]))

        # NOTE(review): the right-hand side of this assignment was missing in
        # the notebook (syntax error). Restored as a fixed-length tokenizer
        # call, which matches the otherwise unused `text_max_len` and the
        # collate step's assumption that text needs no padding - confirm.
        inputs = self.tokenizer(text,truncation=True,padding='max_length',max_length=self.text_max_len)
        text_ids = torch.tensor(np.array(inputs['input_ids']).astype('int64'))
        text_attention_mask = torch.tensor(np.array(inputs['attention_mask']).astype('int64'))

        #--------------- label ----------------#
        label = label.split(',')
        # Does not change the multi-hot result; kept so the global NumPy RNG
        # stream is consumed exactly as before.
        np.random.shuffle(label)
        label_ids = [self.label2id[name] for name in label]
        dense_label_ids = torch.zeros(self.NUM_LABELS)
        dense_label_ids[label_ids] = 1
        return video,audio,text_ids,text_attention_mask,dense_label_ids

    def collate_fn(self,batch):
        """Batch samples for a DataLoader.

        Every field is zero-padded along its first dimension to the longest
        entry in the batch and stacked batch-first. Text tensors and label
        vectors already share one length each, so only video and audio are
        effectively padded.

        Returns:
            Tuple ``(video, audio, text_ids, text_attention_mask, labels)``.
        """
        video_stacks = [sample[0] for sample in batch]
        audio_stacks = [sample[1] for sample in batch]
        text_stacks = [sample[2] for sample in batch]
        text_attention_stacks = [sample[3] for sample in batch]
        label_stacks = [sample[4] for sample in batch]

        video_stacks = pad_sequence(video_stacks,batch_first=True,padding_value=0)
        audio_stacks = pad_sequence(audio_stacks,batch_first=True,padding_value=0)
        text_stacks = pad_sequence(text_stacks,batch_first=True,padding_value=0)
        label_stacks = pad_sequence(label_stacks,batch_first=True,padding_value=0)
        text_attention_stacks = pad_sequence(text_attention_stacks,batch_first=True,padding_value=0)
        return video_stacks,audio_stacks,text_stacks,text_attention_stacks,label_stacks

    @staticmethod
    def tokenize(text):
        """Segment ``text`` with jieba and drop whitespace-only tokens.

        Declared static because it takes no ``self``; it can be called on
        the class or on an instance.
        """
        return [word for word in jieba.cut(text) if word.strip()]