In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Paths of the two result files that are compared below; one is treated as
# the "good" result and the other as the "bad" result.
good_json_path = './results/0784.json'
bad_json_path = './results/0751.json'

In [3]:
# Load both result files. JSON text is UTF-8 by specification, so the encoding
# is stated explicitly instead of relying on the platform default.
with open(good_json_path, 'r', encoding='utf-8') as f:
    good_json = json.load(f)
with open(bad_json_path, 'r', encoding='utf-8') as f:
    bad_json = json.load(f)

In [6]:
# Keys of the good result; the comparison below uses them to index both
# good_json and bad_json.
file_names = list(good_json.keys())

In [15]:
# For every file, collect the labels that appear in the good result but not in
# the bad result. Order and duplicates of the good label list are preserved.
analysis_dict = {}
for file in file_names:
    good_result = good_json[file]['result'][0]['labels']
    # A set gives O(1) membership tests (labels are used as dict keys further
    # down, so they are hashable). A file that is missing from bad_json still
    # raises KeyError, exactly as before.
    bad_result = set(bad_json[file]['result'][0]['labels'])
    # Assigned once per file rather than inside a per-label loop, so a file
    # whose good label list is empty is still recorded (with an empty list).
    analysis_dict[file] = [label for label in good_result if label not in bad_result]

In [22]:
# Labels that appear in the good result but are absent from the bad result:
# count, per label, the number of occurrences across all files.
dist = {}
for missing_labels in analysis_dict.values():
    for label_name in missing_labels:
        dist[label_name] = dist.get(label_name, 0) + 1

In [23]:
# Display the label -> count mapping built in the previous cell.
dist

{'工作职场': 134,
 '场景-其他': 225,
 '愤怒': 288,
 '转场': 453,
 '办公室': 383,
 '特写': 448,
 '公园': 10,
 '远景': 63,
 '影棚幕布': 349,
 '拉近': 952,
 '全景': 564,
 '惊奇': 437,
 '手机电脑录屏': 338,
 '手写解题': 166,
 '才艺展示': 93,
 '家': 589,
 '宫格': 338,
 '混剪': 487,
 '幻灯片轮播': 291,
 '室外': 399,
 '知识讲解': 371,
 '填充': 278,
 '喜悦': 235,
 '亲子': 167,
 '家庭伦理': 284,
 '夫妻&恋人&相亲': 359,
 '悲伤': 379,
 '单人情景剧': 184,
 '动态': 421,
 '极端特写': 471,
 '情景演绎': 550,
 '朋友&同事(平级)': 472,
 '室内': 501,
 '配音': 234,
 '采访': 13,
 '(马路边的)人行道': 57,
 '多人情景剧': 99,
 '过渡页': 214,
 '路人': 249,
 '课件展示': 188,
 '学校': 150,
 '单人口播': 190,
 '教师(教授)': 176,
 '重点圈画': 79,
 '上下级': 67,
 '城市道路': 9,
 '拉远': 176,
 '教辅材料': 357,
 '红包': 70,
 '餐厅': 18,
 '动画': 210,
 '汽车内': 29,
 '图文快闪': 133,
 '游戏画面': 34,
 '绘画展示': 30,
 '商品展示': 47,
 '亲戚(亲情)': 97,
 '古代': 18,
 '城市景观': 20,
 '厌恶': 26,
 '企业家': 16,
 '多人口播': 23,
 '演播室': 21,
 '商场': 9,
 '医生': 1,
 '励志逆袭': 3,
 '平静': 8,
 '医院': 5,
 '过道': 6,
 '天空': 6,
 '门口': 6,
 '咖啡厅': 1,
 '中景': 2,
 '房屋外': 2,
 '大厅': 1,
 '停车场': 3,
 '外卖': 1}

# BERT fine-tuning

In [24]:
# Importing stock ml libraries
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
logging.basicConfig(level=logging.ERROR)

In [25]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    """Mean per-sample ratio |true ∩ pred| / |true ∪ pred| for multi-label data.

    For each row, the positions holding a truthy value are taken as the set of
    active labels. The row score is the size of the intersection of the true
    and predicted sets divided by the size of their union; a row where both
    sets are empty scores 1.

    Args:
        y_true: 2-D array-like (samples x labels) of ground-truth indicators.
            Plain nested lists are accepted as well as NumPy arrays.
        y_pred: array-like of predicted indicators, indexed the same way.
        normalize: accepted for signature compatibility; not used.
        sample_weight: accepted for signature compatibility; not used.

    Returns:
        The arithmetic mean of the per-row scores. With zero rows this is
        ``nan`` (the mean of an empty list).
    """
    # Coerce so that nested Python lists work too (they have no ``.shape``).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set(np.where(y_true[i])[0])
        set_pred = set(np.where(y_pred[i])[0])
        if not set_true and not set_pred:
            # Nothing to predict and nothing predicted: count as a perfect match
            # (this also avoids a 0/0 division below).
            row_score = 1
        else:
            row_score = len(set_true & set_pred) / float(len(set_true | set_pred))
        acc_list.append(row_score)
    return np.mean(acc_list)

In [None]:
# Device identifier; set to the CPU.
device = 'cpu'

## 处理数据

In [112]:
import os
import pandas as pd
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import re
import linecache
import dataloader.tokenization as tokenization

class MultimodaFeaturesDataset(Dataset):
    """Dataset yielding ``(text, multi_hot_labels)`` pairs listed in a meta file.

    The meta file stores each sample on ``data_num_per_sample`` (6) consecutive
    lines. Of the first five lines of a sample, the last two are used: the path
    of a text file and a comma-separated label string.

    Args:
        dataset_config: mapping with the keys ``'device'``, ``'train_data_path'``,
            ``'val_data_path'``, ``'test_data_path'`` and ``'label_id_path'``.
        job: ``'training'`` selects the train meta file, ``'validation'`` (or the
            historical misspelling ``'valdation'``) the validation one, and any
            other value the test one.
    """

    # Length of the multi-hot label vector produced by ``preprocess``.
    num_labels = 82

    def __init__(self,dataset_config,job='training'):
        self.data_num_per_sample = 6  # each sample occupies 6 lines of the meta file
        self.device = dataset_config['device']
        if job == 'training':
            self.meta_path = dataset_config['train_data_path']
        elif job in ('validation', 'valdation'):
            # The correct spelling used to fall through to the test split;
            # the misspelt value is still accepted for existing callers.
            self.meta_path = dataset_config['val_data_path']
        else:
            self.meta_path = dataset_config['test_data_path']

        # label name -> integer id, one tab-separated pair per line.
        self.label2id = {}
        with open(dataset_config['label_id_path'],'r') as f:
            for line in f:
                line = line.strip('\r\n')
                if not line:
                    continue  # tolerate blank lines (e.g. a trailing newline)
                fields = line.split('\t')
                self.label2id[fields[0]] = int(fields[1])

    def __getitem__(self, index):
        """Return ``(text, multi_hot_labels)`` for the sample at ``index``."""
        # linecache line numbers are 1-based and the range stop is exclusive,
        # so this reads lines 6*index+1 .. 6*index+5: the first five of the
        # sample's six lines.
        first_line = self.data_num_per_sample * index + 1
        stop_line = self.data_num_per_sample * (index + 1)
        data_list = [
            linecache.getline(self.meta_path, line_i).strip('\r\n')
            for line_i in range(first_line, stop_line)
        ]
        # Only the last two of those lines are consumed: text path and labels.
        text_ids, label_ids = self.preprocess(data_list[-2:])
        return text_ids, label_ids

    def __len__(self):
        """Number of samples: meta-file line count // lines per sample."""
        # TODO: the length should not have to be recomputed from the file.
        # Lines are counted while streaming instead of loading them all.
        with open(self.meta_path,'r') as f:
            line_count = sum(1 for _ in f)
        return line_count // self.data_num_per_sample

    def preprocess(self,data_list):
        """Turn ``[text_path, label_string]`` into ``(text, multi_hot_list)``.

        Args:
            data_list: two-item sequence: path of the text file and a
                comma-separated string of label names.

        Returns:
            ``text``: the Chinese characters (U+4E00..U+9FA5) of every value in
            the dict parsed from the text file, concatenated in key order.
            ``labels``: a list of ``num_labels`` ints with 1 at the id of each
            given label and 0 elsewhere.

        Raises:
            KeyError: if a label name is not present in ``label2id``.
        """
        text_path, label = data_list

        #--------------- text ----------------#
        dic = {}  # stays empty (-> empty text) when the file has no lines
        with open(text_path,'r') as f:
            for line in f:
                # NOTE(review): eval() executes whatever the file contains.
                # Only use trusted files; consider ast.literal_eval if every
                # line is a plain dict literal.
                dic = eval(line)
        # Only the dict parsed from the last line survives the loop above.
        text = ''
        for key in dic:
            dic[key] = ''.join(re.findall('[\u4e00-\u9fa5]',dic[key]))
            text += dic[key]

        #--------------- label ----------------#
        label = label.split(',')
        # The shuffle cannot change the multi-hot result below; it is kept so
        # the consumption of NumPy's global random state is unchanged.
        np.random.shuffle(label)
        label_ids = [self.label2id[name] for name in label]
        dense_label_ids = torch.zeros(self.num_labels, dtype=torch.int64)
        dense_label_ids[label_ids] = 1
        return text, dense_label_ids.tolist()

In [113]:
import yaml
config_path = './config/mac_local_config.yaml'
# safe_load needs no explicit Loader (plain yaml.load requires one on current
# PyYAML) and only builds plain Python objects; the context manager closes the
# file handle that the bare open() call used to leak.
with open(config_path, 'r') as config_file:
    config = yaml.safe_load(config_file)
dataset = MultimodaFeaturesDataset(config['DatasetConfig'],job='training')

In [114]:
# Walk the whole dataset once and gather the texts and the multi-hot label
# vectors into two parallel lists.
data = {}
text_array, label_array = [], []
num_samples = len(dataset)
for i in range(num_samples):
    text, dense_label_ids = dataset[i]
    text_array += [text]
    label_array += [dense_label_ids]

In [115]:
# Two-column frame: one row per sample, with its text and its label vector.
data = dict(text=text_array, labels=label_array)
df = pd.DataFrame(data=data)

In [116]:
# Persist the frame without its index column. Each label list is written as
# its string form, which MultiLabelDataset parses back when reading a row.
df.to_csv('./bert_finetune_data.csv',index=False)

In [117]:
# Reload the data from disk; the `labels` column now holds strings such as
# "[0, 1, ...]" rather than Python lists.
new_df = pd.read_csv('./bert_finetune_data.csv')

In [121]:
# Preview the first rows of the reloaded frame.
new_df.head()

Unnamed: 0,text,labels
0,希望你过上想要的生活既可以朝九晚五别急学会配音后是这样的其实你真的既可以朝九晚五又可以浪迹天...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,都要结婚了怎么手上什么都没有呀媳妇儿我来接你啦阿杰这门好进新娘可不好接哦这红包呀我早都给你们...,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,我是邓诚猿辅导数学学科带头人毕业于清华大学专注于数学教育九年新学期快要开始了猩很多家长朋友都...,"[1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,你娃成天玩手机呢学习听还那么好我给他报了猿辅导网课学习呢贵环节课只要元钱还送这么多学习资料我...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,什么什么课不许报你报的辅导班还不够多呀行了先这样我先忙挂了啊哎我告诉你啊你要是敢报我就断你生...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [90]:
# Fine-tuning hyper-parameters.
MAX_LEN = 128            # maximum token length per encoded text
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05
from transformers import BertTokenizer
# Truncation is an encode-time option, not a loading option: passing
# truncation=True to from_pretrained does not enable it, so the argument is
# dropped here. Truncation has to be requested where the text is encoded.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=109540.0), HTML(value='')))




In [142]:
class MultiLabelDataset(Dataset):
    """Tokenizes the ``text`` column and pairs it with the ``labels`` column.

    Args:
        dataframe: frame with a ``text`` column and a ``labels`` column. A label
            entry may be the string form of a list (``"[0, 1, 0]"``, as read
            back from CSV) or an actual sequence of 0/1 values.
        tokenizer: object exposing ``encode_plus``.
        max_len: length every encoded text is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    @staticmethod
    def _parse_targets(raw):
        """Return the label vector as a list of ints, whatever form it is stored in."""
        if isinstance(raw, str):
            # "[0, 1, 0]" -> strip the brackets, split on commas.
            return [int(s) for s in raw[1:-1].split(',')]
        return [int(s) for s in raw]

    def __getitem__(self, index):
        """Return a dict of tensors: ``ids``, ``mask``, ``token_type_ids``, ``targets``."""
        # Positional access: a DataLoader asks for positions 0..len-1, which
        # need not match the frame's index labels.
        text = str(self.text.iloc[index])
        text = " ".join(text.split())  # collapse runs of whitespace

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated pad_to_max_length=True
            # and for the implicit truncation triggered by max_length alone.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        new_targets = self._parse_targets(self.targets.iloc[index])

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            'targets': torch.tensor(new_targets, dtype=torch.float)
        }

In [143]:
# Sample 80% of the rows for training (fixed random_state, so the split is
# repeatable); the remaining rows become the test split. Both frames get a
# fresh 0..n-1 index before being wrapped in datasets.
train_size = 0.8
train_data = new_df.sample(frac=train_size, random_state=200)
test_data = new_df.drop(index=train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)
training_set = MultiLabelDataset(dataframe=train_data, tokenizer=tokenizer, max_len=MAX_LEN)
testing_set = MultiLabelDataset(dataframe=test_data, tokenizer=tokenizer, max_len=MAX_LEN)

In [147]:
# DataLoader settings for the two splits; both are shuffled and use
# num_workers=0.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(dataset=training_set, **train_params)
testing_loader = DataLoader(dataset=testing_set, **test_params)

In [154]:
from transformers import BertModel
# Location the pretrained BERT weights are loaded from.
# NOTE(review): this is an absolute, machine-specific path and has to be
# adjusted on any other machine.
bert_path ='/Users/mafp/code/taac/pretrained/bert'
class DistilBERTClass(torch.nn.Module):
    """BERT encoder followed by a two-layer head that emits multi-label logits.

    Despite its name, the encoder is a ``BertModel``. The head takes the hidden
    state at sequence position 0, applies Linear -> tanh -> dropout -> Linear,
    and returns raw logits (no sigmoid).

    Args:
        num_labels: number of output logits (default 82).
        pretrained_path: location passed to ``BertModel.from_pretrained``;
            defaults to the module-level ``bert_path``.
    """

    def __init__(self, num_labels=82, pretrained_path=None):
        super(DistilBERTClass, self).__init__()
        if pretrained_path is None:
            pretrained_path = bert_path
        self.l1 = BertModel.from_pretrained(pretrained_path)
        # Take the width from the loaded encoder instead of assuming 768.
        hidden_size = self.l1.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return logits with one row per input sequence and ``num_labels`` columns."""
        # token_type_ids used to be accepted but silently dropped; it is now
        # forwarded to the encoder.
        output_1 = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]  # hidden state at the first sequence position
        pooler = self.pre_classifier(pooler)
        pooler = torch.tanh(pooler)
        pooler = self.dropout(pooler)
        return self.classifier(pooler)

# Build the classifier and place it on the CPU. As the last expression of the
# cell, the .to() call also displays the module structure.
model = DistilBERTClass()
model.to('cpu')

DistilBERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

In [155]:
import torch
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits ``outputs`` and ``targets``."""
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)
# Adam over every parameter of the model (encoder and classification head).
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [160]:

def train(epoch):
    """Run one pass over ``training_loader``, updating ``model`` in place.

    Uses the module-level ``model``, ``training_loader``, ``loss_fn`` and
    ``optimizer``. The loss is printed for every 5000th batch, starting with
    the first one.

    Args:
        epoch: epoch number; only used in the printed progress message.
    """
    model.train()
    # Follow the model instead of hard-coding 'cpu', so the batches always end
    # up on the device that holds the parameters.
    device = next(model.parameters()).device
    # enumerate's start index belongs to enumerate(); it used to be passed to
    # tqdm, where the second positional argument is the description.
    for step, data in enumerate(tqdm(training_loader), 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % 5000 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # One gradient reset per step is enough (it used to be done twice).
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [161]:
# Run the configured number of training epochs. The recorded output below
# ends in a KeyboardInterrupt partway through the first epoch.
for epoch in range(EPOCHS):
    train(epoch)

  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch: 0, Loss:  0.6414496898651123


  9%|▉         | 90/1000 [10:24<1:45:15,  6.94s/it]


KeyboardInterrupt: 