In [1]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from pytorch_pretrained_bert import BertConfig

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from random import randrange
import torch.nn.functional as F
import pandas as pd
from pathlib import Path
from matplotlib import pyplot as plt
from argparse import ArgumentParser
import torch.multiprocessing as mp
from sklearn.model_selection import train_test_split

BERT_MODEL_PATH = '/mnt/bert_model/chinese_L-12_H-768_A-12'

bert_config = BertConfig('/mnt/bert_model/chinese_L-12_H-768_A-12/bert_config.json')
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None,do_lower_case=True)

INFO:pytorch_pretrained_bert.tokenization:loading vocabulary file /mnt/bert_model/chinese_L-12_H-768_A-12/vocab.txt


In [2]:
train_path = '/mnt/news_virus/train.csv'
test_path = '/mnt/news_virus/test_dataset.csv'

In [29]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [60]:
train_df = train_df.sample(frac=1)

In [61]:
train_df.head(1)

Unnamed: 0,id,content,picture_lists,category,ncw_label,fake_label,real_label,comment_2c,comment_all
16973,3595572676621165,【军车就是了不起！有本事你找我茬啊？】一网友，路过南京市中山陵前湖路口，看见一挂军牌车辆逆向...,c98dea6573dfe82f87c222ca401235b5.jpg\t4c88284d...,军事,0,1,0,车牌号：南K.20128\t三年前的事情还拿出来炒，秀公知下线么\t,车牌号：南K.20128\t三年前的事情还拿出来炒，秀公知下线么\t[转发]//@王月琴33...


In [32]:
test_df.head(1)

Unnamed: 0,id,content,picture_lists,category,comment_2,comment_all
0,000408f5c5d17a3916f791ca340ce293,各路口24小时红灯，闯一次6分200元！通知，嘉鱼县城区内，于今日凌晨24后所有路口红绿灯，...,,疫情,不错😊\t,不错😊\t


In [33]:
# Fill nan (empty boxes) with 0
train_df = train_df.fillna({'content':'','picture_lists':'','category':'','ncw_label':0,'fake_label':0,'real_label':0,'comment_2c':'','comment_all':''})
test_df = test_df.fillna({'content':'','picture_lists':'','category':'','comment_2c':'','comment_all':''})

In [34]:
train = train_df.values
test = test_df.values

In [35]:
labels = {'train':[[train[i][4], train[i][5], train[i][6]] for i in range(len(train))]}
contents = {'train':[train[i][1] for i in range(len(train))], 'test':[test[i][1] for i in range(len(test))]}
contents_len = {'train':[len(train[i][1]) for i in range(len(train))], 'test':[len(test[i][1]) for i in range(len(test))]}
subjects = {'train':[train[i][3] for i in range(len(train))], 'test':[test[i][3] for i in range(len(test))]}
pics = {'train':[train[i][2] for i in range(len(train))], 'test':[test[i][2] for i in range(len(test))]}
pics_t = []
for i in range(len(train)):
    if train[i][2] != '':
        pics_t.append(len(train[i][2].strip('\t').split('\t')))
    else:
        pics_t.append(0)
pics_te = []
for i in range(len(test)):
    if test[i][2] != '':
        pics_te.append(len(test[i][2].strip('\t').split('\t')))
    else:
        pics_te.append(0)
picsnum = {'train':pics_t, 'test':pics_te}
comments_2 = {'train':[train[i][7] for i in range(len(train))], 'test':[test[i][4] for i in range(len(test))]}
comments_all = {'train':[train[i][8] for i in range(len(train))], 'test':[test[i][5] for i in range(len(test))]}
com_t = []
for i in range(len(train)):
    if train[i][8] != '':
        com_t.append(len(train[i][8].strip('\t').split('\t')))
    else:
        com_t.append(0)
com_te = []
for i in range(len(test)):
    if test[i][5] != '':
        com_te.append(len(test[i][5].strip('\t').split('\t')))
    else:
        com_te.append(0)
commentsnum = {'train':com_t, 'test':com_te}

In [36]:
for each in set(picsnum['test']):
    print(each, picsnum['test'].count(each))

0 2784
1 203
2 36
3 39
4 23
5 7
6 93
7 6
9 15


In [37]:
# pics_num
picsnum_f = {'train':[0]*len(train), 'test':[0]*len(test)}
for i in range(len(train)):
    num = picsnum['train'][i] / 10.0
    picsnum_f['train'][i] = [num for i in range(768)]

for i in range(len(test)):
    num = picsnum['test'][i] / 10.0
    picsnum_f['test'][i] = [num for i in range(768)]

In [38]:
temp = contents_len['train']
temp.sort()
print(temp[0],temp[-1],temp[int(len(train)/5 * 4)])

1 1994 146


In [39]:
# contents_len
contents_len_f = {'train':[0]*len(train), 'test':[0]*len(test)}
for i in range(len(train)):
    if contents_len['train'][i] < 200:
        num = contents_len['train'][i] / 200.0
        contents_len_f['train'][i] = [num for i in range(768)]
    else:
        contents_len_f['train'][i] = [0.9 for i in range(768)]

for i in range(len(test)):
    if contents_len['test'][i] < 200:
        num = contents_len['test'][i] / 200.0
        contents_len_f['test'][i] = [num for i in range(768)]
    else:
        contents_len_f['test'][i] = [0.9 for i in range(768)]

In [40]:
class BertLayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-12):
        """Construct a layernorm module in the TF style (epsilon inside the square root).
        """
        super(BertLayerNorm, self).__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        u = x.mean(-1, keepdim=True)
        s = (x - u).pow(2).mean(-1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
        return self.weight * x + self.bias

In [55]:
class BertForSequenceClassification(nn.Module):
    def __init__(self, num_labels=3): # Change number of labels here.
        super(BertForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained('/mnt/bert_model/chinese_L-12_H-768_A-12')
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        #self.fc1 = nn.Linear(config.hidden_size*2, 512)
        nn.init.xavier_normal_(self.classifier.weight)

    '''def forward_once(self, x):
        # Forward pass
        output = self.cnn1(x)
        output = output.view(output.size()[0], -1)
        output = self.fc1(output)
        return output'''

    def forward_once(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        #logits = self.classifier(pooled_output)

        return pooled_output

    def forward(self, input_ids1, pics_num, cont_len):
        # forward pass of input 1
        output1 = self.forward_once(input_ids1, token_type_ids=None, attention_mask=None, labels=None)
        
        #print(out.shape)

        # Multiply the credit score with the output after concatnation

        #out = torch.add(pics_num, output1)
        #out = torch.add(cont_len, out)

        #out = self.fc1(out)
        logits = self.classifier(output1)

        return logits

    def freeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = True

In [56]:
config = BertConfig(vocab_size_or_config_json_file=22000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

model = BertForSequenceClassification(3)

INFO:pytorch_pretrained_bert.modeling:loading archive file /mnt/bert_model/chinese_L-12_H-768_A-12
INFO:pytorch_pretrained_bert.modeling:Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 21128
}



In [43]:
#X_train,X_val, y_train, y_val  = cross_validation.train_test_split(train_data,train_target,test_size=0.4, random_state=0)
X_train_all = [each.replace('\t','').replace('\u200b','').replace('#','').replace('@','') for each in contents['train']]
y_train_all = labels['train']
X_train_piscnum_all = picsnum_f['train']
X_train_contentslen_all = contents_len_f['train']

X_train = X_train_all[:int(len(train)*0.8)]
y_train = y_train_all[:int(len(train)*0.8)]
X_train_piscnum = X_train_piscnum_all[:int(len(train)*0.8)]
X_train_contentslen = X_train_contentslen_all[:int(len(train)*0.8)]

X_val = X_train_all[int(len(train)*0.8):]
y_val = y_train_all[int(len(train)*0.8):]
X_val_piscnum = X_train_piscnum_all[int(len(train)*0.8):]
X_val_contentslen = X_train_contentslen_all[int(len(train)*0.8):]

for i in range(len(contents['test'])):
    if contents['test'][i] == '':
        contents['test'][i] = 'None'
X_test = [each.replace('\t','').replace('\u200b','').replace('#','').replace('@','') for each in contents['test']]
X_test_piscnum = picsnum_f['test']
X_test_contentslen = contents_len_f['test']

max_seq_length_con = 256

In [44]:
X_train = X_train[:100]
y_train = y_train[:100]
X_train_piscnum = X_train_piscnum[:100]
X_train_contentslen = X_train_contentslen[:100]

X_val = X_val[:100]
y_val = y_val[:100]
X_val_piscnum = X_val_piscnum[:100]
X_val_contentslen = X_val_contentslen[:100]

In [63]:
batch_size_train = 8
batch_size_val = 12
batch_size_test = 12

# Train
train_lists = [X_train, X_train_piscnum, X_train_contentslen, y_train]

# Val
val_lists = [X_val, X_val_piscnum, X_val_contentslen, y_val]

# Test
test_lists = [X_test, X_test_piscnum, X_test_contentslen]

In [46]:
class text_dataset(Dataset):
    def __init__(self,x_y_list, transform=None):

        self.x_y_list = x_y_list
        self.transform = transform

    def __getitem__(self,index):

        # Tokenize statements
        tokenized_review = tokenizer.tokenize(self.x_y_list[0][index])

        if len(tokenized_review) > max_seq_length_con:
            tokenized_review = tokenized_review[:max_seq_length_con]

        ids_review  = tokenizer.convert_tokens_to_ids(tokenized_review)

        padding = [0] * (max_seq_length_con - len(ids_review))

        ids_review += padding

        assert len(ids_review) == max_seq_length_con

        #print(ids_review)
        ids_review = torch.tensor(ids_review)

        fakeness = self.x_y_list[3][index] # color
        list_of_labels = [torch.from_numpy(np.array(fakeness))]

        piscnum = self.x_y_list[1][index] # Credit score

        #ones_768 = np.ones((768))
        #credit_scr = credit_scr * ones_768
        piscnum = torch.tensor(piscnum)
        
        contentslen = self.x_y_list[2][index] # Credit score

        #ones_768 = np.ones((768))
        #credit_scr = credit_scr * ones_768
        contentslen = torch.tensor(contentslen)

        return [ids_review, piscnum, contentslen], list_of_labels[0]

    def __len__(self):
        return len(self.x_y_list[0])

In [47]:
class text_dataset_test(Dataset):
    def __init__(self,x_y_list, transform=None):

        self.x_y_list = x_y_list
        self.transform = transform

    def __getitem__(self,index):

        # Tokenize statements
        tokenized_review = tokenizer.tokenize(self.x_y_list[0][index])

        if len(tokenized_review) > max_seq_length_con:
            tokenized_review = tokenized_review[:max_seq_length_con]

        ids_review  = tokenizer.convert_tokens_to_ids(tokenized_review)

        padding = [0] * (max_seq_length_con - len(ids_review))

        ids_review += padding

        assert len(ids_review) == max_seq_length_con

        #print(ids_review)
        ids_review = torch.tensor(ids_review)

        piscnum = self.x_y_list[1][index] # Credit score

        #ones_768 = np.ones((768))
        #credit_scr = credit_scr * ones_768
        piscnum = torch.tensor(piscnum)
        
        contentslen = self.x_y_list[2][index] # Credit score

        #ones_768 = np.ones((768))
        #credit_scr = credit_scr * ones_768
        contentslen = torch.tensor(contentslen)

        return [ids_review, piscnum, contentslen]

    def __len__(self):
        return len(self.x_y_list[0])

In [64]:
# Preparing the data (Tokenize)
training_dataset = text_dataset(x_y_list = train_lists)
val_dataset = text_dataset(x_y_list = val_lists)
test_dataset = text_dataset_test(x_y_list = test_lists)


# Prepare the training dictionaries
dataloaders_dict = {'train': torch.utils.data.DataLoader(training_dataset, batch_size=batch_size_train, shuffle=True, num_workers=0),
                    'val':torch.utils.data.DataLoader(val_dataset, batch_size=batch_size_val, shuffle=False, num_workers=0),
                    'test':torch.utils.data.DataLoader(test_dataset, batch_size=batch_size_test, shuffle=False, num_workers=0)
                   }
dataset_sizes = {'train':len(train_lists[0]),
                'val':len(val_lists[0]),
                'test':len(test_lists[0])}

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [57]:
train_acc = []
val_acc = []
train_loss = []
val_loss = []

def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()
    print('starting')
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 100
    best_acc = 0

    for epoch in range(num_epochs):
        epoch_start = time.time()
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                #scheduler.step()
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0

            fakeness_corrects = 0


            # Iterate over data.
            for inputs, fakeness in dataloaders_dict[phase]:

                inputs1 = inputs[0] # News statement input
                inputs2 = inputs[1] # Justification input
                inputs3 = inputs[2] # Meta data input

                inputs1 = inputs1.to(device)
                inputs2 = inputs2.to(device)
                inputs3 = inputs3.to(device)

                fakeness = fakeness.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    #print(inputs)
                    outputs = model(inputs1, inputs2, inputs3)

                    outputs = F.softmax(outputs,dim=1)

                    loss = criterion(outputs, torch.max(fakeness.float(), 1)[1])
                    # backward + optimize only if in training phase
                    if phase == 'train':

                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs1.size(0)


                fakeness_corrects += torch.sum(torch.max(outputs, 1)[1] == torch.max(fakeness, 1)[1])


            epoch_loss = running_loss / dataset_sizes[phase]


            fakeness_acc = fakeness_corrects.double() / dataset_sizes[phase]

            print('{} total loss: {:.4f} '.format(phase,epoch_loss ))
            print('{} fakeness_acc: {:.4f}'.format(
                phase, fakeness_acc))

            # Saving training acc and loss for each epoch
            if phase == 'train':
                fakeness_acc1 = fakeness_acc.data
                fakeness_acc1 = fakeness_acc1.cpu()
                fakeness_acc1 = fakeness_acc1.numpy()
                train_acc.append(fakeness_acc1)

                #epoch_loss1 = epoch_loss.data
                #epoch_loss1 = epoch_loss1.cpu()
                #epoch_loss1 = epoch_loss1.numpy()
                train_loss.append(epoch_loss)
                
                #print('labels\n', fakeness)
                #print('outputs\n', outputs)
                #print('torchmax\n', torch.max(outputs, 1)[1])

            else:
                # Saving val acc and loss for each epoch
                fakeness_acc1 = fakeness_acc.data
                fakeness_acc1 = fakeness_acc1.cpu()
                fakeness_acc1 = fakeness_acc1.numpy()
                val_acc.append(fakeness_acc1)

                #epoch_loss1 = epoch_loss.data
                #epoch_loss1 = epoch_loss1.cpu()
                #epoch_loss1 = epoch_loss1.numpy()
                val_loss.append(epoch_loss)
                
                #print('labels\n', fakeness)
                #print('outputs\n', outputs)
                #print('torchmax\n', torch.max(outputs, 1)[1])

                if fakeness_acc > best_acc:
                    print('Saving with accuracy of {}'.format(fakeness_acc),
                          'improved over previous {}'.format(best_acc))
                    best_acc = fakeness_acc
                    
                    best_model_wts = copy.deepcopy(model.state_dict())
                    torch.save(model.state_dict(), 'oneBERT_binary_focalloss.pth')

        print('Time taken for epoch'+ str(epoch+1)+ ' is ' + str((time.time() - epoch_start)/60) + ' minutes')
        print()
        scheduler.step()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(float(best_acc)))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, train_acc, val_acc, train_loss, val_loss

In [58]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

In [59]:
lrlast = .0001
lrmain = .00001
optim1 = optim.Adam(
    [
        {"params":model.bert.parameters(),"lr": lrmain},
        {"params":model.classifier.parameters(), "lr": lrlast},

   ])

#optim1 = optim.Adam(model.parameters(), lr=0.001)#,momentum=.9)

# Observe that all parameters are being optimized
optimizer_ft = optim1
criterion = nn.CrossEntropyLoss()

# Decay LR by a factor of 0.1 every 3 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=3, gamma=0.1)

model_ft1, train_acc, val_acc, train_loss, val_loss = train_model(model, criterion, optimizer_ft, exp_lr_scheduler,num_epochs=4)

starting
Epoch 1/4
----------
train total loss: 1.1521 
train fakeness_acc: 0.2800
val total loss: 1.1061 
val fakeness_acc: 0.4300
Saving with accuracy of 0.43 improved over previous 0
Time taken for epoch1 is 0.6261581381162008 minutes

Epoch 2/4
----------
train total loss: 1.0654 
train fakeness_acc: 0.4500
val total loss: 1.0714 
val fakeness_acc: 0.4800
Saving with accuracy of 0.48 improved over previous 0.43
Time taken for epoch2 is 0.6305120587348938 minutes

Epoch 3/4
----------
train total loss: 1.0062 
train fakeness_acc: 0.5400
val total loss: 1.0459 
val fakeness_acc: 0.4900
Saving with accuracy of 0.49 improved over previous 0.48
Time taken for epoch3 is 0.6366331775983175 minutes

Epoch 4/4
----------
train total loss: 0.9407 
train fakeness_acc: 0.6100
val total loss: 1.0344 
val fakeness_acc: 0.5100
Saving with accuracy of 0.51 improved over previous 0.49
Time taken for epoch4 is 0.6322222948074341 minutes

Training complete in 2m 32s
Best val Acc: 0.510000


In [None]:
model.to(device)
for param in model.parameters():
    param.requires_grad=False
model.eval()

valid_preds = []
fakeness_corrects = 0

for inputs in dataloaders_dict['test']:
    
    inputs1 = inputs[0] # News statement input
    inputs2 = inputs[1] # Justification input
    inputs3 = inputs[2] # Meta data input

    inputs1 = inputs1.to(device)
    inputs2 = inputs2.to(device)
    inputs3 = inputs3.to(device)

    # zero the parameter gradients
    #optimizer.zero_grad()
    
    #print(inputs)
    outputs = model(inputs1, inputs2, inputs3)

    outputs = F.softmax(outputs,dim=1)
    
    valid_preds.extend(outputs.data.cpu())