In [1]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m97.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m103.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.0
Looking in indexes: https://pypi.org/simple, https://u

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

import os
os.chdir("/content/drive/My Drive/Colab Notebooks")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
import torch
import time
from datetime import timedelta
import numpy as np
import pandas as pd
import transformers
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.utils.data import DataLoader
from torch import optim
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

In [4]:

import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from torch.utils.data import Dataset
import re


class TextData(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]


def evaluate(model, test_loader):
    """
    Evaluate the model performance on test_loader.
    """
    model = model.to(device)
    start_time = time.time()
    with torch.no_grad():
        y_true = torch.Tensor([])
        y_hat = torch.Tensor([])
        for batch_data_test, batch_label_test in test_loader:
            batch_outputs = model(batch_data_test)
            batch_y_hat = torch.argmax(batch_outputs, dim=1)
            y_true = torch.concat([y_true, batch_label_test]).int()
            y_hat = torch.concat([y_hat, batch_y_hat]).int()
        acc = sum(y_true == y_hat) / len(y_hat)
    end_time = time.time()
    return acc, end_time-start_time


def preprocess(sentence: str):
    """
    The first preprocessing step is removing all punctuation and digits to removing useless information.
    The second preprocessing step is transform all words to lower case, since the case information is less useful in the doc2bow method.
    The third step is tokenizing the sentence to prepare for further preprocessing in word level.
    The fourth step is removing stopwords, preposition and subordinating conjunction (IN), cardinal number (CD), modal (MD) words.
    The fifth step is the lemmatization of the word to align the form of same words.
    """
    sentence = re.sub(r'[^a-zA-Z_\s]', '', sentence)  # remove all punctuation and digits
    sentence = sentence.lower()  # lower the case
    tokens = word_tokenize(sentence)  # tokenize the sentence
    tagged = pos_tag(tokens)
    tokenized = [token for token, pos in tagged if token.isalpha() and pos not in {"IN", "CD", "MD"} and token not in stopwords.words('english')]
    lemma = WordNetLemmatizer()
    normalized = [lemma.lemmatize(word) for word in tokenized]
    return ' '.join(normalized)


def concatenate_title_and_description(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    In the original dataset, the news title and description are in two different columns.
    And I am going to concatenate them together to form a news text for later processing.
    """
    df_tmp = dataframe.copy()
    df_tmp['text'] = df_tmp['Title'] + df_tmp['Description']
    df_tmp.drop(columns=['Title', 'Description'], inplace=True)
    df_tmp.rename(columns={'Class Index': 'label'}, inplace=True)
    return df_tmp


def sample_data(data, num):
    idx = np.random.choice(range(len(data)), num, replace=False)
    return data.iloc[idx, :].reset_index(drop=True)


In [5]:
class BertClassifier(nn.Module):
    def __init__(self, freeze_bert: bool):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        self.linear = nn.Linear(768, 4)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_dict: transformers.tokenization_utils_base.BatchEncoding):
        bert_output = self.bert(**input_dict)
        linear_output = self.linear(bert_output['pooler_output'])
        softmax_output = self.softmax(linear_output)
        return softmax_output


def extract_feature(txt: str) -> np.array:
    """
    Using pretrained bert model to extract the feature of news text as a 768-length vector.
    """
    encoded_input = tokenizer(preprocess(txt), return_tensors='pt')
    output = model(**encoded_input)
    last_cls = output['last_hidden_state'].detach().numpy().squeeze()[0, :]
    return last_cls


def extract_feature2(preprocess: bool):
    if preprocess:
        train_loader = DataLoader(TextData(train_data2['text'].apply(preprocess), train_data2['label'] - 1), batch_size=64, shuffle=True)
        test_loader = DataLoader(TextData(test_data2['text'].apply(preprocess), test_data2['label'] - 1), batch_size=64, shuffle=False)
    else:
        train_loader = DataLoader(TextData(train_data2['text'], train_data2['label'] - 1), batch_size=64, shuffle=True)
        test_loader = DataLoader(TextData(test_data2['text'], test_data2['label'] - 1), batch_size=64, shuffle=False)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', bos_token="[CLS]", eos_token="[SEP]")
    model = BertModel.from_pretrained("bert-base-uncased").to('cuda')
    
    features = []
    start_time = time.time()
    for batch_data_train, batch_label_train in train_loader:
        encoded_input = tokenizer(batch_data_train, max_length=60, padding='max_length', truncation=True, return_tensors='pt')
        batch_outputs = model(encoded_input.to('cuda')).to('cpu')
        features.append(batch_outputs)
    end_time = time.time()




def train(tokenizer, model, train_loader, loss_fn, optimizer):
    """
    Train model on train_loader with 1 global epoch batch by batch.
    """
    model = model.to('cuda')
    start_time = time.time()
    for batch_data_train, batch_label_train in train_loader:
        encoded_input = tokenizer(batch_data_train, max_length=60, padding='max_length', truncation=True, return_tensors='pt')
        outputs = model(encoded_input.to('cuda')).to('cpu')
        loss = loss_fn(outputs, batch_label_train)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    end_time = time.time()
    return model, end_time-start_time


def evaluate(tokenizer, bert_classifier, test_loader):
    """
    Evaluate the model performance on test_loader.
    """
    bert_classifier = bert_classifier.to('cuda')
    start_time = time.time()
    with torch.no_grad():
        y_true = torch.Tensor([])
        y_hat = torch.Tensor([])
        for batch_data_test, batch_label_test in test_loader:
            encoded_input = tokenizer(batch_data_test, max_length=128, padding='max_length', truncation=True, return_tensors='pt')
            batch_outputs = bert_classifier(encoded_input.to('cuda')).to('cpu')
            batch_y_hat = torch.argmax(batch_outputs, dim=1)
            y_true = torch.concat([y_true, batch_label_test]).int()
            y_hat = torch.concat([y_hat, batch_y_hat]).int()
        acc = sum(y_true == y_hat) / len(y_hat)
    end_time = time.time()
    return acc, end_time - start_time


def train_evaluate(epoch_num, freeze_bert):
    train_loader = DataLoader(TextData(train_data2['text'], train_data2['label'] - 1), batch_size=64, shuffle=True)
    test_loader = DataLoader(TextData(test_data2['text'], test_data2['label'] - 1), batch_size=64, shuffle=False)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', bos_token="[CLS]", eos_token="[SEP]")
    bert_classifier = BertClassifier(freeze_bert=freeze_bert)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(bert_classifier.parameters(), lr=0.01)

    total_train_time = 0
    total_evaluate_time = 0
    test_acc_ = []
    for epoch in range(epoch_num):
        bert_classifier, train_time = train(tokenizer, bert_classifier, train_loader, loss_fn, optimizer)
        test_acc, evaluate_time = evaluate(tokenizer, bert_classifier, test_loader)

        total_train_time += train_time
        total_evaluate_time += evaluate_time

        test_acc_.append(float(test_acc))

        print(f"Epoch: {epoch}\t|\tTest Accuracy: {test_acc * 100:.0f}%\t|\t"
              f"TrainTime: {timedelta(seconds=int(total_train_time))}\t|\t"
              f"EvaluateTime: {timedelta(seconds=int(total_evaluate_time))}")

In [6]:
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')

train_data2 = sample_data(concatenate_title_and_description(train_data), 5000)
test_data2 = sample_data(concatenate_title_and_description(test_data), 1000)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', bos_token="[CLS]", eos_token="[SEP]")
model = BertModel.from_pretrained("bert-base-uncased")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
train_evaluate(30, freeze_bert=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: 0	|	Test Accuracy: 24%	|	TrainTime: 0:00:48	|	EvaluateTime: 0:00:07
Epoch: 1	|	Test Accuracy: 24%	|	TrainTime: 0:01:35	|	EvaluateTime: 0:00:16
Epoch: 2	|	Test Accuracy: 24%	|	TrainTime: 0:02:20	|	EvaluateTime: 0:00:24
Epoch: 3	|	Test Accuracy: 24%	|	TrainTime: 0:03:06	|	EvaluateTime: 0:00:31
Epoch: 4	|	Test Accuracy: 24%	|	TrainTime: 0:03:51	|	EvaluateTime: 0:00:39
Epoch: 5	|	Test Accuracy: 24%	|	TrainTime: 0:04:36	|	EvaluateTime: 0:00:46
Epoch: 6	|	Test Accuracy: 24%	|	TrainTime: 0:05:22	|	EvaluateTime: 0:00:53
Epoch: 7	|	Test Accuracy: 24%	|	TrainTime: 0:06:07	|	EvaluateTime: 0:01:00
Epoch: 8	|	Test Accuracy: 24%	|	TrainTime: 0:06:53	|	EvaluateTime: 0:01:07
Epoch: 9	|	Test Accuracy: 24%	|	TrainTime: 0:07:38	|	EvaluateTime: 0:01:15
Epoch: 10	|	Test Accuracy: 24%	|	TrainTime: 0:08:23	|	EvaluateTime: 0:01:22
Epoch: 11	|	Test Accuracy: 24%	|	TrainTime: 0:09:09	|	EvaluateTime: 0:01:29
Epoch: 12	|	Test Accuracy: 24%	|	TrainTime: 0:09:54	|	EvaluateTime: 0:01:36
Epoch: 13	|	Test Accur

In [None]:
train_evaluate(30, freeze_bert=True)

In [19]:

import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import datasets
from datasets import Dataset, DatasetDict
from datasets import load_metric
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


def tokenize_function(examples):
    return tokenizer(examples["text"], max_length=15, padding="max_length", truncation=True, return_tensors='pt')


TRAIN_PATH = 'train.csv'
TEST_PATH = 'test.csv'
train_data = pd.read_csv(TRAIN_PATH)
test_data = pd.read_csv(TEST_PATH)
train_data2 = sample_data(concatenate_title_and_description(train_data), 5000)
test_data2 = sample_data(concatenate_title_and_description(test_data), 1000)

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", bos_token="[CLS]", eos_token="[SEP]")

train_dataset = Dataset.from_dict({'text': list(train_data2['text'].values), 'label': list(train_data2['label'].values-1)})
test_dataset = Dataset.from_dict({'text': list(test_data2['text'].values), 'label': list(test_data2['label'].values-1)})
ag_news = DatasetDict({'train': train_dataset, 'test': test_dataset})
tokenized_datasets = ag_news.map(tokenize_function, batched=True)

training_args = TrainingArguments("test_trainer", per_device_eval_batch_size=1024,
                                  learning_rate=0.0001, num_train_epochs=30,
                                  evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=4)

trainer = Trainer(model=model, args=training_args, 
                  train_dataset=tokenized_datasets['train'], 
                  eval_dataset=tokenized_datasets['test'],
                  compute_metrics=compute_metrics)
start_time = time.time()
trainer.train()
print("Train time: ", timedelta(seconds=int(time.time() - start_time)))

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Epoch,Training Loss,Validation Loss,Accuracy
1,0.9085,0.672647,0.79
2,0.7347,1.21229,0.321
3,0.8965,0.904047,0.733
4,0.9863,0.976347,0.603
5,1.168,1.127075,0.45
6,1.2563,1.381038,0.245
7,1.4006,1.400265,0.247
8,1.4082,1.401182,0.271
9,1.4089,1.390084,0.271
10,1.4049,1.388483,0.237


Train time:  0:40:00


In [7]:
def extract_feature2(preprocess_bool: bool):
    if preprocess_bool:
        train_text_ = train_data2['text'].apply(preprocess)
        test_text_ = test_data2['text'].apply(preprocess)
    else:
        train_text_ = train_data2['text']
        test_text_ = test_data2['text']
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', bos_token="[CLS]", eos_token="[SEP]")
    model = BertModel.from_pretrained("bert-base-uncased").to(device)
    
    def extract(text_):
        features = []
        start_time = time.time()
        for text in text_:
            encoded_input = tokenizer(text, max_length=20, padding='max_length', truncation=True, return_tensors='pt')
            batch_outputs = model(**encoded_input.to(device))['pooler_output'].to('cpu').detach().numpy()
            features.append(batch_outputs)
        end_time = time.time()
        result, time_consumption = np.concatenate(features, axis=0), end_time - start_time
        return result, time_consumption
    
    return extract(train_text_), extract(test_text_)

In [8]:
# (train_result, train_time_consumption), (test_result, test_time_consumption) = extract_feature2(False)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [16]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', bos_token="[CLS]", eos_token="[SEP]")
model = BertModel.from_pretrained("bert-base-uncased").to(device)
features = []
start_time = time.time()
for i, text in enumerate(train_data2['text'].apply(preprocess)):
    encoded_input = tokenizer(text, max_length=20, padding='max_length', truncation=True, return_tensors='pt')
    batch_outputs = model(**encoded_input.to(device))['pooler_output'].to('cpu').detach().numpy()
    features.append(batch_outputs)
    del batch_outputs
    if i % 100 == 0:
        print(i)

end_time = time.time()
train_result, train_time_consumption = np.concatenate(features, axis=0), end_time - start_time
with open("train_result.npy", "wb") as f:
    np.save(f, train_result)

features = []
start_time = time.time()
for i, text in enumerate(test_data2['text'].apply(preprocess)):
    encoded_input = tokenizer(text, max_length=20, padding='max_length', truncation=True, return_tensors='pt')
    batch_outputs = model(**encoded_input.to(device))['pooler_output'].to('cpu').detach().numpy()
    features.append(batch_outputs)
    del batch_outputs
    if i % 100 == 0:
        print(i)

end_time = time.time()
test_result, test_time_consumption = np.concatenate(features, axis=0), end_time - start_time
with open("test_result.npy", "wb") as f:
    np.save(f, test_result)
  
print("train_time_consumption: ", timedelta(seconds=int(train_time_consumption)))
print("test_time_consumption: ", timedelta(seconds=int(test_time_consumption)))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
0
100
200
300
400
500
600
700
800
900
train_time_consumption:  0:01:59
test_time_consumption:  0:00:20


In [21]:
with open("train_result.npy", "rb") as f:
  train_result = np.load(f)
with open("train_result.npy", "rb") as f:
  test_result = np.load(f)

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

start_time = time.time()
cls_model = RandomForestClassifier()
cls_model.fit(pd.DataFrame(train_result), train_data2['label'].values)
print(f"Train Time: {timedelta(seconds=int(time.time()-start_time))}")
y_hat = cls_model.predict(pd.DataFrame(test_result))
acc = accuracy_score(test_data2['label'].values, y_hat)
print(f"Test ACC: {acc * 100:.2f}%")

Train Time: 0:00:10
Test ACC: 77.50%
