In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
import math
import numpy as np
from sklearn.metrics import classification_report
import torch.nn.functional as F

In [3]:
#!pip install torchvision

In [4]:
import torch
import os
from tqdm import tqdm,trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

In [5]:
from pytorch_transformers import (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)

In [124]:
data_file_address = "reddit_train.csv"

In [125]:
import pandas as pd

data_file_address = "reddit_train.csv"

df_data = pd.read_csv(data_file_address,sep=",",encoding="utf-8",names=['texts','labels'])

In [126]:
df_data.columns

Index(['texts', 'labels'], dtype='object')

In [127]:
df_data = df_data.drop(df_data.index[0])

In [128]:
df_data.head()

Unnamed: 0,texts,labels
0,"Honestly, Buffalo is the correct answer. I rem...",hockey
1,Ah yes way could have been :( remember when he...,nba
2,https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...,leagueoflegends
3,He wouldn't have been a bad signing if we woul...,soccer
4,Easy. You use the piss and dry technique. Let ...,funny


In [129]:
df_data.labels.unique()

array(['hockey', 'nba', 'leagueoflegends', 'soccer', 'funny', 'movies',
       'anime', 'Overwatch', 'trees', 'GlobalOffensive', 'nfl',
       'AskReddit', 'gameofthrones', 'conspiracy', 'worldnews', 'wow',
       'europe', 'canada', 'Music', 'baseball'], dtype=object)

In [130]:
# Analyse the labels distribution
df_data.labels.value_counts()

conspiracy         3500
canada             3500
leagueoflegends    3500
trees              3500
movies             3500
Overwatch          3500
baseball           3500
Music              3500
gameofthrones      3500
AskReddit          3500
europe             3500
GlobalOffensive    3500
worldnews          3500
wow                3500
anime              3500
nfl                3500
soccer             3500
nba                3500
hockey             3500
funny              3500
Name: labels, dtype: int64

In [131]:
sentences = df_data.texts.to_list()
sentences[0]

'Honestly, Buffalo is the correct answer. I remember people (somewhat) joking that Buffalo\'s mantra for starting goalies was "win a game, get traded". \nI think Edmonton\'s front office was a travesty for the better part of 10 years, but Buffalo\'s systematic destruction of the term \'competitive\' was much more responsible for the change to the draft lottery. '

In [132]:
# Get tag labels data
labels = df_data.labels.to_list()
print(labels[0])

hockey


In [133]:
tag2idx={'hockey':0, 'nba':1, 'leagueoflegends':2, 'soccer':3, 'funny':4, 'movies':5,
       'anime':6, 'Overwatch':7, 'trees':8, 'GlobalOffensive':9, 'nfl':10,
       'AskReddit':11, 'gameofthrones':12, 'conspiracy':13, 'worldnews':14, 'wow':15,
       'europe':16, 'canada':17, 'Music':18, 'baseball': 19}

In [134]:
tag2idx

{'hockey': 0,
 'nba': 1,
 'leagueoflegends': 2,
 'soccer': 3,
 'funny': 4,
 'movies': 5,
 'anime': 6,
 'Overwatch': 7,
 'trees': 8,
 'GlobalOffensive': 9,
 'nfl': 10,
 'AskReddit': 11,
 'gameofthrones': 12,
 'conspiracy': 13,
 'worldnews': 14,
 'wow': 15,
 'europe': 16,
 'canada': 17,
 'Music': 18,
 'baseball': 19}

In [135]:
tag2name={tag2idx[key] : key for key in tag2idx.keys()}

In [136]:
tag2name

{0: 'hockey',
 1: 'nba',
 2: 'leagueoflegends',
 3: 'soccer',
 4: 'funny',
 5: 'movies',
 6: 'anime',
 7: 'Overwatch',
 8: 'trees',
 9: 'GlobalOffensive',
 10: 'nfl',
 11: 'AskReddit',
 12: 'gameofthrones',
 13: 'conspiracy',
 14: 'worldnews',
 15: 'wow',
 16: 'europe',
 17: 'canada',
 18: 'Music',
 19: 'baseball'}

In [137]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [138]:
n_gpu

1

In [139]:
vocabulary = 'xlnet-base-cased-spiece.model'

In [140]:
max_len  = 64

In [141]:
tokenizer = XLNetTokenizer(vocab_file=vocabulary,do_lower_case=False)

In [142]:
max_len  = 64

full_input_ids = []
full_input_masks = []
full_segment_ids = []

SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

UNK_ID = tokenizer.encode("<unk>")[0]
CLS_ID = tokenizer.encode("<cls>")[0]
SEP_ID = tokenizer.encode("<sep>")[0]
MASK_ID = tokenizer.encode("<mask>")[0]
EOD_ID = tokenizer.encode("<eod>")[0]

for i,sentence in enumerate(sentences):
    # Tokenize sentence to token id list
    tokens_a = tokenizer.encode(sentence)
    
    # Trim the len of text
    if(len(tokens_a)>max_len-2):
        tokens_a = tokens_a[:max_len-2]
        
        
    tokens = []
    segment_ids = []
    
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(SEG_ID_A)
        
    # Add <sep> token 
    tokens.append(SEP_ID)
    segment_ids.append(SEG_ID_A)
    
    
    # Add <cls> token
    tokens.append(CLS_ID)
    segment_ids.append(SEG_ID_CLS)
    
    input_ids = tokens
    
    # The mask has 0 for real tokens and 1 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [0] * len(input_ids)

    # Zero-pad up to the sequence length at fornt
    if len(input_ids) < max_len:
        delta_len = max_len - len(input_ids)
        input_ids = [0] * delta_len + input_ids
        input_mask = [1] * delta_len + input_mask
        segment_ids = [SEG_ID_PAD] * delta_len + segment_ids

    assert len(input_ids) == max_len
    assert len(input_mask) == max_len
    assert len(segment_ids) == max_len
    
    full_input_ids.append(input_ids)
    full_input_masks.append(input_mask)
    full_segment_ids.append(segment_ids)
    
    if 3 > i:
        print("No.:%d"%(i))
        print("sentence: %s"%(sentence))
        print("input_ids:%s"%(input_ids))
        print("attention_masks:%s"%(input_mask))
        print("segment_ids:%s"%(segment_ids))
        print("\n")

No.:0
sentence: Honestly, Buffalo is the correct answer. I remember people (somewhat) joking that Buffalo's mantra for starting goalies was "win a game, get traded". 
I think Edmonton's front office was a travesty for the better part of 10 years, but Buffalo's systematic destruction of the term 'competitive' was much more responsible for the change to the draft lottery. 
input_ids:[17, 22709, 111, 19, 8390, 27, 18, 2900, 1543, 9, 35, 1633, 104, 17, 10, 4202, 8337, 11, 20888, 29, 8390, 26, 23, 326, 2044, 28, 1541, 19152, 23, 30, 17, 12, 3305, 24, 275, 19, 133, 4917, 12, 9, 35, 232, 16750, 26, 23, 605, 495, 30, 24, 17, 2044, 189, 23, 982, 28, 18, 352, 188, 20, 241, 123, 19, 4, 3]
attention_masks:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
segment_ids:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [143]:
tags = [tag2idx[str(lab)] for lab in labels]
print(tags[0])

0


In [144]:
#pt_tensor_from_list = torch.FloatTensor(full_input_ids)

In [145]:
type(pt_tensor_from_list)

torch.Tensor

In [146]:
type(tr_inputs)

torch.Tensor

In [150]:
tr_inputs, val_inputs, tr_tags, val_tags,tr_masks, val_masks,tr_segs, val_segs = train_test_split(full_input_ids, tags,full_input_masks,full_segment_ids, 
                                                            random_state=4, test_size=3/7)

In [151]:
len(tr_inputs),len(val_inputs),len(tr_segs),len(val_segs)

(40000, 30000, 40000, 30000)

In [152]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
tr_segs = torch.tensor(tr_segs)
val_segs = torch.tensor(val_segs)

In [205]:
val_inputs

tensor([[    0,     0,     0,  ...,     9,     4,     3],
        [ 1401, 10468,    51,  ...,  8911,     4,     3],
        [    0,     0,     0,  ...,  5564,     4,     3],
        ...,
        [    0,     0,     0,  ...,    82,     4,     3],
        [    0,     0,     0,  ..., 16155,     4,     3],
        [    0,     0,     0,  ...,     9,     4,     3]])

In [30]:
# Set batch num
batch_num = 16

In [31]:
train_data = TensorDataset(tr_inputs, tr_masks,tr_segs, tr_tags)
train_sampler = RandomSampler(train_data)
# Drop last can make batch training better for the last one
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_num,drop_last=True)

valid_data = TensorDataset(val_inputs, val_masks,val_segs, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_num)

In [32]:
# In this document, contain confg(txt) and weight(bin) files
model_file_address = 'xlnet-base-cased'

In [33]:
model = XLNetForSequenceClassification.from_pretrained(model_file_address,num_labels=len(tag2idx))

In [34]:
model;

In [35]:
model.to(device);

In [36]:
if n_gpu >1:
    model = torch.nn.DataParallel(model)

In [37]:
epochs = 5
max_grad_norm = 1.0

In [38]:
num_train_optimization_steps = int( math.ceil(len(tr_inputs) / batch_num) / 1) * epochs

In [39]:
FULL_FINETUNING = True

In [40]:
if FULL_FINETUNING:
    # Fine tune model all layer parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    # Only fine tune classifier parameters
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [41]:
# TRAIN loop
model.train();

In [42]:
print("***** Running training *****")
print("  Num examples = %d"%(len(tr_inputs)))
print("  Batch size = %d"%(batch_num))
print("  Num steps = %d"%(num_train_optimization_steps))
for _ in trange(epochs,desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_segs,b_labels = batch
        
        # forward pass
        outputs = model(input_ids =b_input_ids,token_type_ids=b_segs, input_mask = b_input_mask,labels=b_labels)
        loss, logits = outputs[:2]
        if n_gpu>1:
            # When multi gpu, average it
            loss = loss.mean()
        
        # backward pass
        loss.backward()
        
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        
        # update parameters
        optimizer.step()
        optimizer.zero_grad()
        
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))

***** Running training *****
  Num examples = 59500
  Batch size = 16
  Num steps = 18595


Epoch:   0%|                                                                                     | 0/5 [00:00<?, ?it/s]

Train loss: 1.7401271991956742


Epoch:  20%|██████████████▌                                                          | 1/5 [37:25<2:29:42, 2245.73s/it]

Train loss: 1.331056315285535


Epoch:  40%|████████████████████████████▍                                          | 2/5 [1:14:52<1:52:18, 2246.18s/it]

Train loss: 1.069198313228798


Epoch:  60%|██████████████████████████████████████████▌                            | 3/5 [1:52:23<1:14:55, 2247.56s/it]

Train loss: 0.8505406287038371


Epoch:  80%|██████████████████████████████████████████████████████████▍              | 4/5 [2:29:54<37:28, 2248.54s/it]

Train loss: 0.6581310052073983


Epoch: 100%|█████████████████████████████████████████████████████████████████████████| 5/5 [3:07:21<00:00, 2247.94s/it]


In [43]:
xlnet_out_address = 'tc02'

# Make dir if not exits
if not os.path.exists(xlnet_out_address):
        os.makedirs(xlnet_out_address)
        
# Save a trained model, configuration and tokenizer
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

In [44]:

# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(xlnet_out_address, "pytorch_model.bin")
output_config_file = os.path.join(xlnet_out_address, "config.json")


# Save model into file
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(xlnet_out_address)

('tc02\\spiece.model',)

In [45]:
model = XLNetForSequenceClassification.from_pretrained(xlnet_out_address,num_labels=len(tag2idx))


In [46]:
model.to(device);

In [47]:

if n_gpu >1:
    model = torch.nn.DataParallel(model)

In [48]:
model.eval();

# Set acc funtion
def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)

In [166]:
len(val_inputs)

30000

In [49]:
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

y_true = []
y_predict = []
print("***** Running evaluation *****")
print("  Num examples ={}".format(len(val_inputs)))
print("  Batch size = {}".format(batch_num))
for step, batch in enumerate(valid_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_segs,b_labels = batch
    
    with torch.no_grad():
        outputs = model(input_ids =b_input_ids,token_type_ids=b_segs, input_mask = b_input_mask,labels=b_labels)
        tmp_eval_loss, logits = outputs[:2]
    
    # Get textclassification predict result
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = accuracy(logits, label_ids)
#     print(tmp_eval_accuracy)
#     print(np.argmax(logits, axis=1))
#     print(label_ids)
    
    # Save predict and real label reuslt for analyze
    for predict in np.argmax(logits, axis=1):
        y_predict.append(predict)
        
    for real_result in label_ids.tolist():
        y_true.append(real_result)

    
    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy
   
    nb_eval_steps += 1
    
    
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / len(val_inputs)
loss = tr_loss/nb_tr_steps 
result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'loss': loss}
report = classification_report(y_pred=np.array(y_predict),y_true=np.array(y_true))

# Save the report into file
output_eval_file = os.path.join(xlnet_out_address, "eval_results.txt")
with open(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    for key in sorted(result.keys()):
        print("  %s = %s"%(key, str(result[key])))
        writer.write("%s = %s\n" % (key, str(result[key])))
        
    print(report)
    writer.write("\n\n")  
    writer.write(report)

***** Running evaluation *****
  Num examples =10500
  Batch size = 16
***** Eval results *****
  eval_accuracy = 0.5774285714285714
  eval_loss = 1.7479341010388718
  loss = 0.6581310052073983
              precision    recall  f1-score   support

           0       0.56      0.69      0.62       530
           1       0.83      0.54      0.65       567
           2       0.57      0.66      0.61       516
           3       0.62      0.68      0.65       551
           4       0.29      0.29      0.29       536
           5       0.54      0.61      0.57       504
           6       0.65      0.68      0.67       515
           7       0.66      0.66      0.66       516
           8       0.65      0.58      0.61       502
           9       0.75      0.57      0.65       513
          10       0.63      0.65      0.64       534
          11       0.32      0.47      0.38       547
          12       0.68      0.77      0.72       509
          13       0.49      0.39      0.43      

In [177]:
data_file_address = "reddit_test.csv"
df_data = pd.read_csv(data_file_address,encoding="utf-8",names=['id','texts'])

In [178]:
df_data = df_data.drop(df_data.index[0])

In [179]:
df_data.head()

Unnamed: 0,id,texts
1,0,Trout and Bryant have both led the league in s...
2,1,&gt; Just like Estonians have good reasons to ...
3,2,Will Sol_Primeval sotp being oblivious?\n\nfin...
4,3,Moving Ostwald borders back to the pre 1967 bo...
5,4,"You have to take it out of the bag, Morty!"


In [180]:
sentences = df_data.texts.to_list()

In [181]:
max_len  = 64

full_input_ids = []
full_input_masks = []
full_segment_ids = []

SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

UNK_ID = tokenizer.encode("<unk>")[0]
CLS_ID = tokenizer.encode("<cls>")[0]
SEP_ID = tokenizer.encode("<sep>")[0]
MASK_ID = tokenizer.encode("<mask>")[0]
EOD_ID = tokenizer.encode("<eod>")[0]

for i,sentence in enumerate(sentences):
    # Tokenize sentence to token id list
    tokens_a = tokenizer.encode(sentence)
    
    # Trim the len of text
    if(len(tokens_a)>max_len-2):
        tokens_a = tokens_a[:max_len-2]
        
        
    tokens = []
    segment_ids = []
    
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(SEG_ID_A)
        
    # Add <sep> token 
    tokens.append(SEP_ID)
    segment_ids.append(SEG_ID_A)
    
    
    # Add <cls> token
    tokens.append(CLS_ID)
    segment_ids.append(SEG_ID_CLS)
    
    input_ids = tokens
    
    # The mask has 0 for real tokens and 1 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [0] * len(input_ids)

    # Zero-pad up to the sequence length at fornt
    if len(input_ids) < max_len:
        delta_len = max_len - len(input_ids)
        input_ids = [0] * delta_len + input_ids
        input_mask = [1] * delta_len + input_mask
        segment_ids = [SEG_ID_PAD] * delta_len + segment_ids

    assert len(input_ids) == max_len
    assert len(input_mask) == max_len
    assert len(segment_ids) == max_len
    
    full_input_ids.append(input_ids)
    full_input_masks.append(input_mask)
    full_segment_ids.append(segment_ids)
    
    if 3 > i:
        print("No.:%d"%(i))
        print("sentence: %s"%(sentence))
        print("input_ids:%s"%(input_ids))
        print("attention_masks:%s"%(input_mask))
        print("segment_ids:%s"%(segment_ids))
        print("\n")

No.:0
sentence: Trout and Bryant have both led the league in strikeouts before. Trout's actually on pace to crush Reggie's record. Judge will be fine. This is also his rookie year.
input_ids:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5633, 2015, 21, 11080, 47, 207, 687, 18, 1594, 25, 25349, 134, 9, 5633, 2015, 26, 23, 995, 31, 3268, 22, 13396, 24406, 26, 23, 598, 9, 4424, 53, 39, 1592, 9, 122, 27, 77, 45, 9011, 119, 9, 4, 3]
attention_masks:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
segment_ids:[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]


No.:1
sentence: &gt; Just like Estonians have good reasons to fear Russian expansionism, so the southern nations have good reaso

In [220]:
k_input_ids=torch.LongTensor(full_input_ids)
k_input_masks=torch.LongTensor(full_input_masks)
k_segment_ids=torch.LongTensor(full_segment_ids)

In [221]:
k_input_ids

tensor([[    0,     0,     0,  ...,     9,     4,     3],
        [    0,     0,     0,  ...,     9,     4,     3],
        [    0,     0,     0,  ...,  2842,     4,     3],
        ...,
        [ 2591, 10234,    39,  ...,   220,     4,     3],
        [24012,  7345,    56,  ...,   406,     4,     3],
        [    0,     0,     0,  ...,   512,     4,     3]])

In [222]:
valid_data = TensorDataset(k_input_ids, k_input_masks,k_segment_ids, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_num)

In [223]:
valid_data

<torch.utils.data.dataset.TensorDataset at 0x24d120bacc0>

In [224]:
full_input_ids

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5633,
  2015,
  21,
  11080,
  47,
  207,
  687,
  18,
  1594,
  25,
  25349,
  134,
  9,
  5633,
  2015,
  26,
  23,
  995,
  31,
  3268,
  22,
  13396,
  24406,
  26,
  23,
  598,
  9,
  4424,
  53,
  39,
  1592,
  9,
  122,
  27,
  77,
  45,
  9011,
  119,
  9,
  4,
  3],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1123,
  299,
  46,
  97,
  1641,
  115,
  21627,
  23,
  47,
  195,
  2113,
  22,
  1819,
  965,
  3421,
  949,
  19,
  102,
  18,
  1335,
  2158,
  47,
  195,
  2113,
  22,
  1819,
  1950,
  3421,
  949,
  9,
  201,
  26,
  88,
  17,
  93,
  23805,
  56,
  106,
  4331,
  29,
  1525,
  5007,
  526,
  19,
  22,
  359,
  29,
  1348,
  1514,
  123,
  526,
  9,
  4,
  3],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

In [225]:
k_segment_ids

tensor([[4, 4, 4,  ..., 0, 0, 2],
        [4, 4, 4,  ..., 0, 0, 2],
        [4, 4, 4,  ..., 0, 0, 2],
        ...,
        [0, 0, 0,  ..., 0, 0, 2],
        [0, 0, 0,  ..., 0, 0, 2],
        [4, 4, 4,  ..., 0, 0, 2]])

In [226]:
y_true = []
y_predict = []
print("***** Running evaluation *****")
print("  Num examples ={}".format(len(val_inputs)))
print("  Batch size = {}".format(batch_num))
for step, batch in enumerate(valid_dataloader):
    batch = tuple(t.to(device) for t in batch)
    m_input_ids, m_input_mask, m_segs,m_labels = batch
    
    with torch.no_grad():
        outputs = model(input_ids =m_input_ids,token_type_ids=m_segs, input_mask = m_input_mask,labels=m_labels)
        tmp_eval_loss, logits = outputs[:2]
    
    # Get textclassification predict result
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = accuracy(logits, label_ids)
#     print(tmp_eval_accuracy)
#     print(np.argmax(logits, axis=1))
#     print(label_ids)
    
    # Save predict and real label reuslt for analyze
    for predict in np.argmax(logits, axis=1):
        y_predict.append(predict)
        

***** Running evaluation *****
  Num examples =30000
  Batch size = 16


In [227]:
print(y_predict)

[19, 16, 6, 14, 12, 0, 0, 6, 3, 16, 8, 12, 18, 17, 17, 14, 18, 13, 19, 17, 5, 14, 12, 19, 15, 11, 19, 12, 8, 13, 10, 9, 1, 14, 12, 12, 2, 2, 1, 12, 0, 2, 16, 0, 12, 9, 4, 15, 12, 11, 2, 6, 10, 5, 7, 8, 18, 13, 16, 2, 8, 19, 18, 4, 16, 7, 1, 3, 4, 9, 2, 14, 12, 15, 17, 4, 16, 0, 12, 6, 14, 16, 9, 8, 17, 10, 6, 13, 0, 19, 13, 18, 17, 12, 6, 10, 17, 8, 4, 7, 8, 8, 19, 15, 8, 4, 8, 11, 9, 16, 5, 19, 14, 5, 11, 18, 9, 6, 13, 3, 0, 14, 10, 16, 16, 15, 15, 11, 2, 18, 19, 0, 3, 2, 18, 18, 13, 0, 11, 3, 12, 16, 3, 4, 3, 14, 0, 9, 12, 16, 3, 18, 3, 2, 2, 17, 12, 5, 11, 7, 6, 15, 11, 4, 13, 12, 15, 11, 18, 9, 4, 19, 15, 17, 7, 19, 12, 12, 5, 1, 0, 15, 2, 10, 2, 3, 4, 17, 16, 4, 17, 10, 2, 18, 16, 5, 2, 9, 12, 11, 3, 4, 10, 11, 2, 10, 7, 8, 12, 19, 18, 4, 17, 2, 3, 10, 16, 19, 5, 0, 11, 14, 18, 11, 8, 6, 6, 8, 5, 15, 5, 18, 14, 12, 11, 9, 10, 0, 11, 19, 1, 3, 2, 5, 15, 2, 4, 10, 11, 14, 13, 11, 1, 5, 3, 18, 12, 11, 1, 4, 1, 7, 11, 15, 18, 18, 12, 6, 11, 19, 18, 3, 19, 2, 12, 16, 4, 12, 2, 10, 15, 




In [228]:
results = [tag2name[lab] for lab in y_predict]

In [229]:
results

['baseball',
 'europe',
 'anime',
 'worldnews',
 'gameofthrones',
 'hockey',
 'hockey',
 'anime',
 'soccer',
 'europe',
 'trees',
 'gameofthrones',
 'Music',
 'canada',
 'canada',
 'worldnews',
 'Music',
 'conspiracy',
 'baseball',
 'canada',
 'movies',
 'worldnews',
 'gameofthrones',
 'baseball',
 'wow',
 'AskReddit',
 'baseball',
 'gameofthrones',
 'trees',
 'conspiracy',
 'nfl',
 'GlobalOffensive',
 'nba',
 'worldnews',
 'gameofthrones',
 'gameofthrones',
 'leagueoflegends',
 'leagueoflegends',
 'nba',
 'gameofthrones',
 'hockey',
 'leagueoflegends',
 'europe',
 'hockey',
 'gameofthrones',
 'GlobalOffensive',
 'funny',
 'wow',
 'gameofthrones',
 'AskReddit',
 'leagueoflegends',
 'anime',
 'nfl',
 'movies',
 'Overwatch',
 'trees',
 'Music',
 'conspiracy',
 'europe',
 'leagueoflegends',
 'trees',
 'baseball',
 'Music',
 'funny',
 'europe',
 'Overwatch',
 'nba',
 'soccer',
 'funny',
 'GlobalOffensive',
 'leagueoflegends',
 'worldnews',
 'gameofthrones',
 'wow',
 'canada',
 'funny',
 'e

In [231]:
df = pd.DataFrame(results)
df.to_csv(r"C:\Users\k_mathin\Desktop\results/final_result.csv", header=['Category'])
