Install environment

In [1]:
!pip install transformers
!pip install sentencepiece


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.3-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 7.7 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 63.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 53.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.3
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[

Upload data to colab

In [2]:
# after shift+enter, you have to click choose file,choose the 'tr.csv'
from google.colab import files
uploaded = files.upload()


Saving tr.csv to tr.csv


Use updated data

In [1]:
import numpy as np
with open('i172_7000_vs3_label.txt','r') as f:
  claim = f.readlines()
labels = [np.int(t.strip()) for t in claim]

with open('i172_7000_vs1_text.txt','r') as f:
  texts = f.readlines()
texts = [t.strip() for t in texts]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  after removing the cwd from sys.path.


Read data

In [10]:
import pandas as pd
df = pd.read_csv("tr.csv")

In [None]:
# pick 7000 data as training data
texts = df.irsen_text.values.tolist()[:7000]
labels = df.claim_s.values.tolist()[:7000]
# 1 of the evidence has wrong label, delete. Now we have 6999 training data
# labels = labels[:4487] + labels[4488:7000]
# texts = texts[:4487] + texts[4488:7000]
len(texts)

7000

Using GPU

In [2]:
import numpy as np
import torch
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

data prepare

In [3]:
# split data to training and validation
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,recall_score,precision_score,accuracy_score
from sklearn.metrics import confusion_matrix
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, random_state=2, test_size=0.1)

In [4]:
# load tokenizer, turn data to bert type token
from transformers import BertTokenizerFast,RobertaTokenizer
from transformers import XLNetTokenizer, XLNetForSequenceClassification
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
# tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [5]:
# turn data to torch dataset
class bertDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = bertDataset(train_encodings, train_labels)
val_dataset = bertDataset(val_encodings, val_labels)

Prepare model

In [6]:
# training metrics, will show result during training
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [7]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from torch import nn

# set some parameter
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=2,              # total number of training epochs
    learning_rate=2e-05,
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_steps=100,
    evaluation_strategy="steps",
    load_best_model_at_end=True
)
# using XLnet classification model
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased")
# using bert classification model
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
model.to(device) # put model to gpu
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,# evaluation dataset
    compute_metrics=compute_metrics,             
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],

    )


trainer.train()

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,0.434,0.46744,0.86,0.0,0.0,0.0
200,0.345,0.193675,0.932857,0.84,0.642857,0.728324
300,0.2229,0.216816,0.957143,0.904762,0.77551,0.835165
400,0.2278,0.377784,0.922857,0.650685,0.969388,0.778689
500,0.16,0.130461,0.972857,0.87619,0.938776,0.906404
600,0.2307,0.116769,0.967143,0.844037,0.938776,0.888889
700,0.1587,0.111676,0.977143,0.93617,0.897959,0.916667
800,0.1653,0.084974,0.977143,0.945652,0.887755,0.915789
900,0.1544,0.109907,0.965714,0.813559,0.979592,0.888889
1000,0.0656,0.118698,0.974286,0.944444,0.867347,0.904255


***** Running Evaluation *****
  Num examples = 700
  Batch size = 64
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 700
  Batch size = 64
***** Running Evaluation *****
  Num examples = 700
  Batch size = 64
***** Running Evaluation *****
  Num examples = 700
  Batch size = 64
***** Running Evaluation *****
  Num examples = 700
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 700
  Batch size = 64
***** Running Evaluation *****
  Num examples = 700
  Batch size = 64
***** Running Evaluation *****
  Num examples = 700
  Batch size = 64
***** Running Evaluation *****
  Num examples = 700
  Batch size = 64
***** Running Evaluation *****
  Num examples = 700
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-1000
Configuration 

TrainOutput(global_step=1576, training_loss=0.1699102254688437, metrics={'train_runtime': 2253.867, 'train_samples_per_second': 5.59, 'train_steps_per_second': 0.699, 'total_flos': 4781314879387200.0, 'train_loss': 0.1699102254688437, 'epoch': 2.0})

Save model

In [16]:
output_dir='./model'
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Configuration saved in ./model/config.json
Model weights saved in ./model/pytorch_model.bin
tokenizer config file saved in ./model/tokenizer_config.json
Special tokens file saved in ./model/special_tokens_map.json


('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/spiece.model',
 './model/added_tokens.json')

download model

In [17]:
from google.colab import files
!zip -r /content/model.zip /content/model
files.download('model.zip')

  adding: content/model/ (stored 0%)
  adding: content/model/tokenizer_config.json (deflated 50%)
  adding: content/model/special_tokens_map.json (deflated 52%)
  adding: content/model/pytorch_model.bin (deflated 7%)
  adding: content/model/spiece.model (deflated 49%)
  adding: content/model/config.json (deflated 52%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Predict 

In [8]:
model.eval()
import torch.nn.functional as F
#this will predict one sentence each time
def predict(content):

    inputs = tokenizer(content,
                       
                       padding='max_length',
                       truncation=True, return_tensors="pt")
    # move to gpu
    ids = inputs["input_ids"].to(device)
    idt = inputs["token_type_ids"].to(device)
    mask = inputs["attention_mask"].to(device)
    # forward pass
    outputs = model(ids,token_type_ids=idt,attention_mask=mask)
    logits = outputs[0]
    x = F.softmax(logits, dim=-1)
    active_logits = logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
    flattened_predictions = torch.argmax(active_logits,
                                         axis=1)
    return x.cpu().detach().numpy()[0][1], flattened_predictions.cpu().numpy()[0]

prepare test data

In [11]:
sentences = df.irsen_text.values.tolist()[7000:]
real = df.claim_s.values.tolist()[7000:]
len(sentences)

589

In [12]:
with open('i172_589_labels.txt','r') as f:
  real = f.readlines()
real = [int(t.strip()) for t in real]

In [13]:
pre = []# predict label
pre_pro = []# predict probility
for i in sentences:
  x = predict(i)
  pre.append(x[1])
  pre_pro.append(x[0])

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.


calculate false negative and false postive

In [None]:
xl_tf = []
for i in range(len(real)):
  if pre[i] == 0 and real[i] == 0:
    xl_tf.append('TN')
  elif pre[i] == 1 and real[i] == 1:
    xl_tf.append('TP')
  elif pre[i] == 1 and real[i] == 0:
    xl_tf.append('FP')
  elif pre[i] == 0 and real[i] == 1:
    xl_tf.append('FN')

In [None]:
with open('xl_tf.txt','w') as f:
  f.writelines([str(p)+'\n' for p in xl_tf])

Show result

In [14]:
print('f1:'+str(f1_score(real, pre, average=None))+'\n'+'recall:'+str(recall_score(real, pre, average=None))+'\n'+'precision:'+str(precision_score(real, pre, average=None))+'\n'+'accuracy:'+str(accuracy_score(real, pre))+'\n')

f1:[0.98532495 0.9375    ]
recall:[0.98121086 0.95454545]
precision:[0.98947368 0.92105263]
accuracy:0.9762308998302207



In [15]:
confusion_matrix(real, pre)

array([[470,   9],
       [  5, 105]])

In [33]:
with open('i172pre_xl.txt','w') as f:
  f.writelines([str(p)+'\n' for p in pre])
with open('i172pro_xl.txt','w') as f:
  f.writelines([str(p)+'\n' for p in pre_pro])

Show probability graph

In [None]:
import matplotlib.pyplot as plt
import numpy as np
fig, ax = plt.subplots(1,1,figsize = (10,5))
ax.hist(np.array(pre_pro)[np.array(real) == 1], color = "darkred",bins = "scott", alpha = .5, edgecolor = "red")
ax.hist(np.array(pre_pro)[np.array(real) == 0], color = "darkgreen",bins = "scott", alpha = .5, edgecolor = "green")

In [None]:
# number of samples model gives probabilty more than .8 but real label are non-evidence
print('num of samples have score more than 0.8 but are non-evidence : '+ str(np.sum(np.array(pre_pro)[np.array(real) == 0]>=0.8)))
print('num of samples have score more than 0.8 are non-evidence : '+ str(np.sum(np.array(pre_pro)[np.array(real) == 1]>=0.8)))
# how much confidence if the socore is higher than 0.8, we are 85.7% confident that the sentence is evidence if the score is higher than .8
confi80 = str(round((np.sum(np.array(pre_pro)[np.array(real) == 1]>=0.8))/(np.sum(np.array(pre_pro)[np.array(real) == 1]>=0.8) + np.sum(np.array(pre_pro)[np.array(real) == 0]>=0.8)),4))
print('we are '+ confi80 +' confident that the sentence is evidence if the score is higher than .8')
# number of samples model gives probabilty more than .6 less than .8 but real label is non-evidence
num6080_nevid = np.sum(np.array(pre_pro)[np.array(real) == 0]>=0.6) - np.sum(np.array(pre_pro)[np.array(real) == 0]>=0.80)
num6080_evid = np.sum(np.array(pre_pro)[np.array(real) == 1]>=0.6) - np.sum(np.array(pre_pro)[np.array(real) == 1]>=0.80)
print('num of samples have score more than 0.6 and less than 0.8 but are non-evidence :'+ str(num6080_nevid))
print('num of samples have score more than 0.6 and less than 0.8 are evidence :'+ str(num6080_evid))
confi6080 = num6080_evid/(num6080_evid + num6080_nevid)
print('we are '+ str(round(confi6080,4)) +' confident that the sentence is evidence if the score is higher than .6 and less than .8')

In [None]:
print('num of samples have score more than 0.8 but are non-evidence : '+ str(np.sum(np.array(pre_pro)[np.array(real) == 0]<=0.2)))
print('num of samples have score more than 0.8 are non-evidence : '+ str(np.sum(np.array(pre_pro)[np.array(real) == 1]<=0.2)))
confi20 = str(round((np.sum(np.array(pre_pro)[np.array(real) == 0]<=0.2))/(np.sum(np.array(pre_pro)[np.array(real) == 0]<=0.2) + np.sum(np.array(pre_pro)[np.array(real) == 1]<=0.2)),4))
print('we are '+ confi80 +' confident that the sentence is non-evidence if the score is higher than .2')
num2040_nevid = np.sum(np.array(pre_pro)[np.array(real) == 0]<=0.4) - np.sum(np.array(pre_pro)[np.array(real) == 0]<=0.2)
num2040_evid = np.sum(np.array(pre_pro)[np.array(real) == 1]<=0.4) - np.sum(np.array(pre_pro)[np.array(real) == 1]<=0.2)
print('num of samples have score more than 0.2 and less than 0.4 but are non-evidence :'+ str(num2040_nevid))
print('num of samples have score more than 0.2 and less than 0.4 are evidence :'+ str(num2040_evid))
confi2040 = num2040_nevid/(num2040_evid + num2040_nevid)
print('we are '+ str(round(confi2040,4)) +' confident that the sentence is non-evidence if the score is higher than .2 and less than .4')

In [18]:
# print the wrong predictions. The end of sentence shows the real labels
for i in range(len(real)):
  if real[i] != pre[i]:
    print(sentences[i] + ' claim' if real[i] else sentences[i] + ' noclaim')

But the biggest effect had tooked place was draining the swamp. noclaim
But the pythons was a big problem. noclaim
the Burmese pythons were a threat to the everglades because they were eating all the small mammals. noclaim
The other threat to the Everglades is The Burmese python. noclaim
In my opinion an entire swamp is harder to recover from compared to a couple hundred snakes that eventually you will probably be able to hunt. claim
So thats my opinion of which one is a bigger threat. noclaim
In this 5 paragraph essay i will show you why the Burmese python is the biggest threat to the everglades. noclaim
The Burmese is a big threat to the animals that live in the everglades. claim
Another reason, the biggest threat to the Everglades is the Burmese python is they eat little animals. claim
This is why the Burmese Python is a bigger issue in the everglades than having to drain the Everglades. claim
That was the biggest threat. noclaim
The Burmese pythons are a big threat because they can