In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Paths of the two result dumps that are compared below.
# NOTE(review): the file names look like scores (0.784 vs 0.751) — confirm.
good_json_path = './results/0784.json'
bad_json_path = './results/0751.json'

In [3]:
# Load both result dumps. The encoding is pinned to UTF-8 (the JSON default)
# because the label names are Chinese and the platform default encoding is
# not guaranteed to decode them.
with open(good_json_path, 'r', encoding='utf-8') as f:
    good_json = json.load(f)
with open(bad_json_path, 'r', encoding='utf-8') as f:
    bad_json = json.load(f)

In [6]:
# Keys of the good result dump; they are used below to index both dumps.
file_names = list(good_json.keys())

In [15]:
# For every file, collect the labels that appear in the good result but are
# missing from the bad result.
analysis_dict = {}
for file in file_names:
    good_result = good_json[file]['result'][0]['labels']
    # A set turns each membership test into an O(1) lookup.
    bad_result = set(bad_json[file]['result'][0]['labels'])
    diff_temp = [label for label in good_result if label not in bad_result]
    # Assigned once per file (also when the good label list is empty), so
    # every file in file_names ends up with an entry.
    analysis_dict[file] = diff_temp

In [22]:
# Per label: number of files where the label is present in the good result
# but absent from the bad result.
dist = {}
for missing_labels in analysis_dict.values():
    for label in missing_labels:
        dist[label] = dist.get(label, 0) + 1

In [23]:
# Display the label -> count mapping built in the previous cell.
dist

{'工作职场': 134,
 '场景-其他': 225,
 '愤怒': 288,
 '转场': 453,
 '办公室': 383,
 '特写': 448,
 '公园': 10,
 '远景': 63,
 '影棚幕布': 349,
 '拉近': 952,
 '全景': 564,
 '惊奇': 437,
 '手机电脑录屏': 338,
 '手写解题': 166,
 '才艺展示': 93,
 '家': 589,
 '宫格': 338,
 '混剪': 487,
 '幻灯片轮播': 291,
 '室外': 399,
 '知识讲解': 371,
 '填充': 278,
 '喜悦': 235,
 '亲子': 167,
 '家庭伦理': 284,
 '夫妻&恋人&相亲': 359,
 '悲伤': 379,
 '单人情景剧': 184,
 '动态': 421,
 '极端特写': 471,
 '情景演绎': 550,
 '朋友&同事(平级)': 472,
 '室内': 501,
 '配音': 234,
 '采访': 13,
 '(马路边的)人行道': 57,
 '多人情景剧': 99,
 '过渡页': 214,
 '路人': 249,
 '课件展示': 188,
 '学校': 150,
 '单人口播': 190,
 '教师(教授)': 176,
 '重点圈画': 79,
 '上下级': 67,
 '城市道路': 9,
 '拉远': 176,
 '教辅材料': 357,
 '红包': 70,
 '餐厅': 18,
 '动画': 210,
 '汽车内': 29,
 '图文快闪': 133,
 '游戏画面': 34,
 '绘画展示': 30,
 '商品展示': 47,
 '亲戚(亲情)': 97,
 '古代': 18,
 '城市景观': 20,
 '厌恶': 26,
 '企业家': 16,
 '多人口播': 23,
 '演播室': 21,
 '商场': 9,
 '医生': 1,
 '励志逆袭': 3,
 '平静': 8,
 '医院': 5,
 '过道': 6,
 '天空': 6,
 '门口': 6,
 '咖啡厅': 1,
 '中景': 2,
 '房屋外': 2,
 '大厅': 1,
 '停车场': 3,
 '外卖': 1}

# BERT fine-tuning

In [24]:
# Importing stock ml libraries
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
logging.basicConfig(level=logging.ERROR)

In [25]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    """Average, over rows, of |true ∩ pred| / |true ∪ pred|.

    For each row the positions of the truthy entries of ``y_true`` and
    ``y_pred`` are treated as label sets. A row where both sets are empty
    scores 1.

    Args:
        y_true: 2-D array-like exposing ``.shape``; ground-truth indicators.
        y_pred: array-like indexable by row; predicted indicators.
        normalize: accepted but not used by this implementation.
        sample_weight: accepted but not used by this implementation.

    Returns:
        The mean of the per-row scores, as computed by ``np.mean``.
    """
    per_row_scores = []
    for row in range(y_true.shape[0]):
        true_labels = set(np.where(y_true[row])[0])
        pred_labels = set(np.where(y_pred[row])[0])
        if not true_labels and not pred_labels:
            # Nothing expected and nothing predicted counts as a perfect match.
            row_score = 1
        else:
            overlap = true_labels & pred_labels
            combined = true_labels | pred_labels
            row_score = len(overlap) / float(len(combined))
        per_row_scores.append(row_score)
    return np.mean(per_row_scores)

In [None]:
# Device name string. The model and training cells visible in this notebook
# do not read this variable.
device = 'cpu'

## 处理数据

In [1]:
import os
import pandas as pd
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import re
import linecache
import dataloader.tokenization as tokenization

class MultimodaFeaturesDataset(Dataset):
    """Dataset yielding ``(text, multi_hot_labels)`` pairs from a sample list file.

    The list file (``meta_path``) stores each sample on ``data_num_per_sample``
    consecutive lines. Of the first five lines of a sample, the last two are
    used: the path of a text file and a comma-separated label string.

    Args:
        dataset_config: mapping with the keys ``device``, ``train_data_path``,
            ``val_data_path``, ``test_data_path`` and ``label_id_path``.
        job: ``'training'`` selects the train list, ``'validation'`` (or the
            historical spelling ``'valdation'``) the validation list, anything
            else the test list.
    """

    # Length of the multi-hot label vector returned by ``preprocess``.
    num_labels = 82

    def __init__(self,dataset_config,job='training'):

        self.data_num_per_sample = 6 # each sample occupies 6 lines in the list file
        self.device = dataset_config['device']
        if job == 'training':
            self.meta_path = dataset_config['train_data_path']
        elif job in ('validation', 'valdation'):
            # 'valdation' is kept because existing callers pass that spelling.
            self.meta_path = dataset_config['val_data_path']
        else:
            self.meta_path = dataset_config['test_data_path']
        # Sample count, computed lazily by __len__ and then reused.
        self._num_samples = None
        self.label2id = {}
        with open(dataset_config['label_id_path'],'r') as f:
            for line in f:
                line = line.strip('\r\n')
                if not line:
                    # Tolerate blank (e.g. trailing) lines in the label file.
                    continue
                fields = line.split('\t')
                self.label2id[fields[0]] = int(fields[1])

    def __getitem__(self, index):
        """Return ``(text, dense_label_list)`` for the sample at ``index``.

        Raises:
            IndexError: if the list file has no lines for ``index``.
        """
        # linecache line numbers are 1-based; read the first five lines of the sample.
        first_line = self.data_num_per_sample*index+1
        end_line = self.data_num_per_sample*(index+1)
        data_list = [] # paths of the per-modality data plus the label string
        for line_i in range(first_line,end_line):
            line = linecache.getline(self.meta_path,line_i)
            data_list.append(line.strip('\r\n'))
        if not any(data_list):
            # linecache returns '' for lines that do not exist.
            raise IndexError('sample index out of range: {}'.format(index))
        text_ids,label_ids = self.preprocess(data_list[-2:])

        return text_ids,label_ids

    def __len__(self):
        """Number of samples: line count of the list file // lines per sample."""
        if self._num_samples is None:
            # Count once instead of re-reading the file on every len() call;
            # linecache caches the file content as well.
            with open(self.meta_path,'r') as f:
                line_count = sum(1 for _ in f)
            self._num_samples = line_count//self.data_num_per_sample
        return self._num_samples

    def preprocess(self,data_list):
        """Turn ``[text_path, label_string]`` into ``(text, multi_hot_list)``.

        The text file is read line by line and each non-blank line is evaluated
        as a Python dict literal; the dict from the last such line is used.
        From every value only characters in the range U+4E00..U+9FA5 are kept
        and concatenated. Labels are mapped through ``label2id`` into a 0/1
        list of length ``num_labels``.
        """
        text_path,label = data_list
        #--------------- text ----------------#
        dic = {}  # stays empty when the file has no content -> empty text
        with open(text_path,'r') as f:
            for line in f:
                if not line.strip():
                    continue
                # SECURITY NOTE(review): eval executes whatever the file contains.
                # If these files can come from an untrusted source, switch to
                # ast.literal_eval after confirming the lines are plain literals.
                dic = eval(line)
        text = ''
        for key in dic:
            dic[key] = ''.join(re.findall('[\u4e00-\u9fa5]',dic[key]))
            text += dic[key]
        #--------------- label ----------------#
        label = label.split(',')
        # NOTE: the order does not influence the multi-hot vector built below;
        # the shuffle is kept so the global NumPy RNG is consumed as before.
        np.random.shuffle(label)
        label_ids = [self.label2id[name] for name in label]
        dense_label_ids = torch.zeros(self.num_labels,dtype=torch.int64)
        dense_label_ids[label_ids] = 1
        return text,dense_label_ids.tolist()

In [2]:
import yaml
config_path = './config/config.yaml'
# safe_load needs no explicit Loader argument (plain yaml.load warns or fails
# without one, depending on the PyYAML version) and never constructs arbitrary
# Python objects; the with-block closes the file handle.
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)
dataset = MultimodaFeaturesDataset(config['DatasetConfig'],job='training')

  app.launch_new_instance()


In [3]:
# Materialise every sample of `dataset` into two parallel lists.
data = {}
text_array, label_array = [], []
for sample_idx in range(len(dataset)):
    sample_text, sample_labels = dataset[sample_idx]
    text_array.append(sample_text)
    label_array.append(sample_labels)

In [4]:
# Two-column frame: the cleaned text and its multi-hot label list.
data = dict(text=text_array, labels=label_array)
df = pd.DataFrame(data)

In [5]:
# Write the frame to CSV without the index column.
df.to_csv('./bert_finetune_train_data.csv',index=False)

In [6]:
# Read the CSV back; MultiLabelDataset below parses the `labels` column as a
# string of the form "[0, 1, ...]".
new_df = pd.read_csv('./bert_finetune_train_data.csv')

In [9]:
# Preview the first rows of the reloaded frame.
new_df.head()

Unnamed: 0,text,labels
0,再见我家孩子岁了现在已经有英文词汇量不仅能进行日常对话还能自己阅读像小王子这样的英文原著有时...,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,快财商学院院长九坤请输入手机号本人已阅读并同意用户使用协议低薪族月光一族家庭主妇没有投资技巧...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,妈录取通知书到啦哎呀宝贝你真棒多亏了妈给我报了高途课堂高中全科名师班那当然平均教龄有年清华北...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,上现在报名加送全套语文教辅大礼包包邮到家非赠品拼音为系统课课程趣味语文启蒙课元节课岁对辅导大...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,今天今天就吃这个呀馄饨你吃我喝点汤就行你怎么那么抠啊你每个月工资不是挺高的吗就这碗馄饨我还是...,"[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [23]:
from transformers import BertTokenizer

# Fine-tuning hyper-parameters.
MAX_LEN = 128            # max_length handed to the tokenizer per sample
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 10
LEARNING_RATE = 1e-5

# Tokenizer loaded from the local pretrained BERT directory.
tokenizer = BertTokenizer.from_pretrained(
    '../pretrained/bert',
    truncation=True,
)

In [12]:
class MultiLabelDataset(Dataset):
    """Tokenised multi-label text dataset backed by a DataFrame.

    Args:
        dataframe: frame with a ``text`` column and a ``labels`` column. A
            label entry is either a string such as ``"[0, 1, 0]"`` (as read
            back from CSV) or a sequence of 0/1 values.
        tokenizer: object providing ``encode_plus``.
        max_len: sequences are padded / truncated to this many tokens.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    @staticmethod
    def _parse_targets(raw):
        """Convert one ``labels`` cell into a list of ints."""
        if isinstance(raw, str):
            # "[0, 1, 0]" -> [0, 1, 0]; an empty list "[]" yields [].
            pieces = raw.strip().strip('[]').split(',')
            return [int(piece) for piece in pieces if piece.strip()]
        return [int(value) for value in raw]

    def __getitem__(self, index):
        """Return the tensors for row ``index`` (positional).

        Returns:
            dict with ``ids``, ``mask``, ``token_type_ids`` (long tensors) and
            ``targets`` (float tensor).
        """
        # Positional access: stays correct when the frame index was not reset.
        text = str(self.text.iloc[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit padding/truncation replaces the deprecated
            # pad_to_max_length flag, so every item has exactly max_len tokens.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        new_targets = self._parse_targets(self.targets.iloc[index])

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(new_targets, dtype=torch.float)
        }

In [13]:
# 80/20 split of the reloaded frame with a fixed sampling seed.
train_size = 0.8
train_data = new_df.sample(frac=train_size, random_state=200)
# Everything that was not sampled becomes the test split.
test_data = new_df.drop(train_data.index)
test_data = test_data.reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

In [14]:
# DataLoader keyword arguments for the two splits.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [15]:
from transformers import BertModel
bert_path ='../pretrained/bert'
class DistilBERTClass(torch.nn.Module):
    """BERT encoder followed by a small multi-label classification head.

    Despite the name, the encoder is a ``BertModel`` loaded from ``bert_path``.

    Args:
        num_labels: number of output logits (defaults to 82, the value the
            notebook has always used).
    """
    def __init__(self, num_labels=82):
        super().__init__()
        self.l1 = BertModel.from_pretrained(bert_path)
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return raw logits (no sigmoid applied) for a batch of token ids."""
        # token_type_ids is forwarded to the encoder instead of being dropped.
        output_1 = self.l1(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        # Hidden state at sequence position 0 is used as the sentence vector.
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.tanh(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = DistilBERTClass()
model.to('cuda')

Some weights of the model checkpoint at ../pretrained/bert were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistilBERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

In [16]:
import torch
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits, mean-reduced (library defaults)."""
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)
# Adam over all parameters of `model`, with the LEARNING_RATE constant as lr.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [21]:
from tqdm import tqdm
def train(epoch):
    """Run one optimisation pass over ``training_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_fn`` and
    ``training_loader``. The batch loss is printed for every 5000th step
    (including step 0).

    Args:
        epoch: epoch number, used only in the printed message.
    """
    model.train()
    # Follow the device the model lives on instead of hard-coding one.
    device = next(model.parameters()).device
    # Note: the second positional argument of tqdm is `desc`, not a start
    # index, so nothing extra is passed here.
    for step, data in enumerate(tqdm(training_loader)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        # Clear gradients once per step, before the backward pass.
        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % 5000 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()

In [24]:
# Call train() once per epoch, EPOCHS times in total.
for epoch in range(EPOCHS):
    train(epoch)

  0%|          | 2/900 [00:00<01:08, 13.07it/s]

Epoch: 0, Loss:  0.22668682038784027


100%|██████████| 900/900 [01:10<00:00, 12.80it/s]
  0%|          | 2/900 [00:00<01:12, 12.36it/s]

Epoch: 1, Loss:  0.2579690217971802


100%|██████████| 900/900 [01:10<00:00, 12.82it/s]
  0%|          | 2/900 [00:00<01:03, 14.25it/s]

Epoch: 2, Loss:  0.17314724624156952


100%|██████████| 900/900 [01:10<00:00, 12.81it/s]
  0%|          | 2/900 [00:00<01:14, 11.98it/s]

Epoch: 3, Loss:  0.1904550939798355


100%|██████████| 900/900 [01:10<00:00, 12.83it/s]
  0%|          | 2/900 [00:00<01:14, 12.10it/s]

Epoch: 4, Loss:  0.19509835541248322


100%|██████████| 900/900 [01:10<00:00, 12.83it/s]
  0%|          | 2/900 [00:00<01:11, 12.59it/s]

Epoch: 5, Loss:  0.1905200183391571


100%|██████████| 900/900 [01:09<00:00, 12.86it/s]
  0%|          | 2/900 [00:00<01:07, 13.24it/s]

Epoch: 6, Loss:  0.17055803537368774


100%|██████████| 900/900 [01:10<00:00, 12.83it/s]
  0%|          | 2/900 [00:00<01:10, 12.66it/s]

Epoch: 7, Loss:  0.14792343974113464


100%|██████████| 900/900 [01:10<00:00, 12.77it/s]
  0%|          | 2/900 [00:00<01:11, 12.52it/s]

Epoch: 8, Loss:  0.2479400485754013


100%|██████████| 900/900 [01:10<00:00, 12.83it/s]
  0%|          | 2/900 [00:00<01:09, 13.01it/s]

Epoch: 9, Loss:  0.1695108413696289


100%|██████████| 900/900 [01:10<00:00, 12.82it/s]


In [None]:
# Save only the model's state_dict to disk.
torch.save(model.state_dict(),'./new_fine_bert.bin')

# 验证训练集是否和测试集有重复，以及是哪些重复了

In [14]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5,6,7'
import yaml
import json
import torch
import torch.nn as nn
from tqdm import tqdm
import numpy as np
import utils.train_util as train_util
from dataloader.dataloader import TestingDataset,MultimodaFeaturesDataset
from src.loss.loss_compute import SimpleLossCompute
from src.model.baseline_model import Baseline
from src.loop.run_epoch import training_loop,validating_loop

config_path = './config/config.yaml'
# safe_load needs no explicit Loader argument (plain yaml.load warns or fails
# without one, depending on the PyYAML version) and never constructs arbitrary
# Python objects; the with-block closes the file handle.
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)
train_dataset = MultimodaFeaturesDataset(config['DatasetConfig'],job='training')
# The job name keeps the spelling 'valdation' passed by the existing code.
val_dataset = MultimodaFeaturesDataset(config['DatasetConfig'],job='valdation')
test_dataset = TestingDataset(config['DatasetConfig'])




In [15]:
# Video names of each split with the last 4 characters (the extension) removed.
# Items of the train/val datasets are passed directly to os.path.basename;
# test items are dicts carrying a 'file_name' entry.
train_files = [
    os.path.basename(train_dataset[idx])[:-4]
    for idx in range(len(train_dataset))
]
val_files = [
    os.path.basename(val_dataset[idx])[:-4]
    for idx in range(len(val_dataset))
]
test_files = [
    test_dataset[idx]['file_name'][:-4]
    for idx in range(len(test_dataset))
]

In [16]:
# Inspect one test entry's file name (it ends in a 4-character '.mp4' suffix).
test_dataset[0]['file_name']

'0003c0066fa7436317d8c507c596680e.mp4'

In [17]:
# Count and print the test videos whose name also occurs in the training split.
# A set makes each membership test O(1) instead of scanning the whole list.
train_file_set = set(train_files)
repetition = 0
for test_video in test_files:
    if test_video in train_file_set:
        repetition += 1
        print(test_video)

01698748d5eb5a400928d7c19ea81f05
01896d58bee40a8bc9ad4d605e43a7f2
018faa4c9a2fe2039bd9e551337f3baa
01ba9504a8d13f7e569822c02de17ca5
020648c6b6b1a309ac20742146b3c285
03a09d55fe535c01a2fb26bd4b383f30
03d6f3f2ffd62eb02f67f0cf63495b1d
042b9569889d7b6a7bfb854e76790e78
05064eba0f9b3662bb1be55fae155ef9
0582f22c42de8dbd938cef9bef053f6a
05ee76bd52b4b30a92d75f1fdb1b38ec
0628ea304ceafae67f41bff67f893f9f
07732170d872d045c7d82728f15d387c
07c3f9a4f5370e49fd61b19d6d52f2ae
080bcac6588f15ee731b6b3a6c05ef39
089dd3cce10a427e40eb86b57c90a27e
08e19ce33594afdd6cc7cc402967edba
09449d469b0a896583aa1071503ce474
0a550ed1bd06760e0d5ef105dd9c6921
0bc09c284e92352434216f6ee03bd4da
0c3fe11dff809fd91bd923c70d37eba1
0c5102b1f3d204f4bc92b37aef6b3b82
0c71334b1c30883e26a9e121c1acc4d8
0cec6fb198954d807ad93aca5eef6171
0cf8296433c9a93fce2a89658c47c4b6
0d0848a53d0d98b0b3f5d8cbb3291176
0d091c55afd7668d84f26897a96e6a0b
0db6b7f9a40e4f8c7618949dabdbbace
0f67097595b33b9b925523633144fde1
100e893ea9a97ddb1031d8d3a5d486c2
106f9047e8

In [18]:
# Number of test video names that were also found in train_files.
repetition

318

In [19]:
# Count and print the test videos whose name also occurs in the validation split.
# A set makes each membership test O(1) instead of scanning the whole list.
val_file_set = set(val_files)
val_repetition = 0
for test_video in test_files:
    if test_video in val_file_set:
        val_repetition += 1
        print(test_video)

0139022fc40eda52f64802cf633f844d
0362216e043df96fbb844c42aa8b4c93
068462a2dd49dd4a32330afab6ffa16a
0c2783714511ca553b66d05c8f42829e
178cc3096cac02d989c6d8115e6ea280
29f655401cb53f2b294c76ee7888b274
2f735757ba61aa4592a154b0d3751aa5
3160da81d50d2cde2afee15d01cdeb00
3a2a9330ab97cdd48a95dcd2825277a0
3f7dd413d181f723b75e942ff3ffaf6f
41ba2f5daadfa9167520be2f1c1d3ea7
4395efccdc46626f567c0a38886c8508
46c8b8ad851a68056d98f6e3eec180d3
47f81e2aea31b563e41e52159a7fe00b
50995b605629917f35a03ceee827643b
5934695826b00d02eb51e3af138245a7
5ddc4b557489078ef87b747f4a27c2de
65e5b9e7bc7ad8641795daeabd68440d
684ddda4b51db3d024b2424f2af60340
77cb6bd7f3ecf89ac8f01c9a1a1478cc
7b4367d96ca18f65b699bebc09a4f24e
7c776fe18c1fdc9e59140ba10c51caca
83d2cc734704317225fce295b7c341a6
84475335462275017714cb7c1e7b050f
8977c7cc57619177de9bd7596c5a1f17
8a8b864ac3ff5d9419d465bbcd55e310
9154f7ba3f29cd8f13692c765baa3a1c
9588022618eb6cf18671f0c3b1456045
9738d941cdb0283a88d2e108e9df4ed0
9785dbff03ca0d33a935fafec6009cad
a85c08888e

In [20]:
# Number of test video names that were also found in val_files.
val_repetition

42