<a href="https://colab.research.google.com/github/Xubwei/Python-Web-Crawler/blob/main/DistilBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 範例 : 輕量化 Bert 預訓練模型的使用方式

# [教學目標]
- 輕量化 Bert 預訓練模型的使用方式
- 切換模型並觀察影響

# [學習重點]
- 由下列範例, 觀察並理解當計算資源較為不足時, Bert 的輕量化解決方案
- 切換模型, 並觀察效果

# 程式說明
- 程式因為需要載入預訓練權重以及對應的 Tokenizer, 執行前請先安裝 transformers 套件
- 原始程式來源 : https://www.kaggle.com/naim99/disaster-tweets-classification-distilbert-bert

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sat May 20 01:53:44 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Check whether the system supports CUDA and whether a GPU is available to PyTorch.
import torch
torch.cuda.is_available()

True

In [3]:
# Run the nvidia-smi shell command and show its output in the cell.
! nvidia-smi

Fri May 19 19:01:21 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8     9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
# Load the required packages; the transformers package must be installed before the first run.
import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import re, warnings
# Suppress every warning issued through the warnings module from this point on.
warnings.filterwarnings("ignore")

### 連結至GoogleDrive

In [6]:
from google.colab import drive

# Mount Google Drive at /content/gdrive; the data files are read from there later.
drive.mount('/content/gdrive')
# A Google account login is required here.
# Once the authorization code has been obtained, enter it to link the cloud drive.

Mounted at /content/gdrive


In [4]:
# Installation commands, currently disabled (both lines are comments).
# The saved output below this cell is the log of an earlier run that installed transformers.
# pip install torch
#!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m89.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m128.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [7]:
# Load the training data and the prediction-target (test) data.
# The Drive folder is named once so that both reads stay in sync when the data moves.
DATA_DIR = '/content/gdrive/MyDrive/Colab Notebooks/NLP自然語言處理/Excercise_NLP_Disaster_Tweets'
df = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')

# 前處理

In [8]:
# Preprocessing 1: expand English contractions
# Either a straight (') or a curly (’) apostrophe is accepted.
_APOSTROPHE = r"(\'|\’)"
# A contraction ends where no word character follows: before whitespace,
# before punctuation, or at the very end of the text.
_CONTRACTION_END = r"(?!\w)"

# (pattern, replacement) pairs, applied strictly in this order: the irregular
# forms must be rewritten before the generic suffix rules can touch them.
_CONTRACTION_RULES = [
    (re.compile(pattern + _CONTRACTION_END), replacement)
    for pattern, replacement in [
        # irregular contractions
        (r"(W|w)on" + _APOSTROPHE + "t", "will not"),
        (r"(C|c)an" + _APOSTROPHE + "t", "can not"),
        (r"(Y|y)" + _APOSTROPHE + "all", "you all"),
        (r"(Y|y)a" + _APOSTROPHE + "ll", "you all"),
        # regular contractions
        (r"(I|i)" + _APOSTROPHE + "m", "i am"),
        (r"(A|a)in" + _APOSTROPHE + "t", "is not"),
        (r"n" + _APOSTROPHE + "t", " not"),
        (_APOSTROPHE + "re", " are"),
        (_APOSTROPHE + "s", " is"),
        (_APOSTROPHE + "d", " would"),
        (_APOSTROPHE + "ll", " will"),
        (_APOSTROPHE + "t", " not"),
        (_APOSTROPHE + "ve", " have"),
    ]
]


def decontracted(text):
    """Expand English contractions in ``text`` and return the new string.

    A contraction is recognised whenever it is not followed by a word
    character, so it is expanded before a space (which is kept), before
    punctuation ("don't." -> "do not.") and at the end of the text
    ("we won't" -> "we will not").

    Parameters
    ----------
    text : str
        Raw text; both ' and ’ are treated as the apostrophe.

    Returns
    -------
    str
        The text with every recognised contraction expanded.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return text

# Expand contractions in the text column of both the training and the test frame.
df['text'] = df['text'].apply(decontracted)
df_test['text'] = df_test['text'].apply(decontracted)

In [9]:
# Preprocessing 2: isolate punctuation and special symbols with surrounding spaces
import string
regular_punct = list(string.punctuation)
extra_punct = [
    ',', '.', '"', ':', ')', '(', '!', '?', '|', ';', "'", '$', '&',
    '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
    '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',
    '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”',
    '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾',
    '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼',
    '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
    'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»',
    '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
    '¹', '≤', '‡', '√', '«', '»', '´', 'º', '¾', '¡', '§', '£', '₤']

# Symbols that get padded: the union of both lists without duplicates, minus
# the hyphen "-" and the period ".", which therefore stay attached to their
# neighbours (e.g. "e-bike", "M1.94"). Sorted so the order is the same on every run.
all_punct = sorted(set(regular_punct + extra_punct) - {'-', '.'})

# A single alternation over all symbols lets one pass over the text replace
# a separate scan per symbol. Longer entries are tried first so that an entry
# made of several code points is matched as a whole.
_punct_re = re.compile(
    '|'.join(re.escape(symbol) for symbol in sorted(all_punct, key=len, reverse=True)))


def spacing_punctuation(text):
    """
    Add one space before and after every symbol listed in ``all_punct``.

    Nothing is removed from the text; symbols are only separated from the
    neighbouring characters, e.g. "a,b" -> "a , b". The symbol set is the one
    present in ``all_punct`` when this cell was executed.

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        The text with each listed symbol wrapped in spaces.
    """
    return _punct_re.sub(lambda match: f' {match.group(0)} ', text)

# Space out punctuation in the text column of both the training and the test frame.
df['text'] = df['text'].apply(spacing_punctuation)
df_test['text'] = df_test['text'].apply(spacing_punctuation)

In [10]:
# Preprocessing 3: repair misspelled and run-together question words
mis_connect_list = ['(W|w)hat', '(W|w)hy', '(H|h)ow', '(W|w)hich', '(W|w)here', '(W|w)ill']
# Group 1 of this pattern is the whole matched keyword.
mis_connect_re = re.compile('(%s)' % '|'.join(mis_connect_list))

# Literal substring -> replacement, applied before any regular expression.
mis_spell_mapping = {'whattsup': 'WhatsApp', 'whatasapp':'WhatsApp', 'whatsupp':'WhatsApp', 
                    'whatcus':'what cause', 'arewhatsapp': 'are WhatsApp', 'Hwhat':'what',
                    'Whwhat': 'What', 'whatshapp':'WhatsApp', 'howhat':'how that',
                    # why
                    'Whybis':'Why is', 'laowhy86':'Foreigners who do not respect China',
                    'Whyco-education':'Why co-education',
                    # How
                    "Howddo":"How do", 'Howeber':'However', 'Showh':'Show',
                    "Willowmagic":'Willow magic', 'WillsEye':'Will Eye', 'Williby':'will by'}

# (pattern, replacement) pairs applied in order. Every pattern only matches a
# token that has a space on both sides.
_connect_word_rules = [
    # what
    (r" (W|w)hat+(s)*[Aa]*(p)+ ", " WhatsApp "),
    (r" (W|w)hat\S ", " What "),
    (r" \S(W|w)hat ", " What "),
    # why
    (r" (W|w)hy\S ", " Why "),
    (r" \S(W|w)hy ", " Why "),
    # How
    (r" (H|h)ow\S ", " How "),
    (r" \S(H|h)ow ", " How "),
    # which
    (r" (W|w)hich\S ", " Which "),
    (r" \S(W|w)hich ", " Which "),
    # where
    (r" (W|w)here\S ", " Where "),
    (r" \S(W|w)here ", " Where "),
]


def spacing_some_connect_words(text):
    """
    Fix known misspellings and split keywords glued to other text.

    'Whyare' -> ' Why are'

    Steps, in order:
    1. replace every key of ``mis_spell_mapping`` found in the text;
    2. apply ``_connect_word_rules`` to space-delimited tokens;
    3. wrap every occurrence of what/why/how/which/where/will (first letter
       in either case) in spaces -- this also hits occurrences inside longer
       words, because ``mis_connect_re`` has no word boundaries;
    4. re-join "What sApp", which step 3 splits, into "WhatsApp".

    Parameters
    ----------
    text : str
        Text to process.

    Returns
    -------
    str
        The corrected text.
    """
    # str.replace leaves the text untouched when the key is absent, so no
    # membership test is needed first.
    for wrong, right in mis_spell_mapping.items():
        text = text.replace(wrong, right)

    for pattern, replacement in _connect_word_rules:
        text = re.sub(pattern, replacement, text)

    text = mis_connect_re.sub(r" \1 ", text)
    return text.replace("What sApp", 'WhatsApp')

# Apply the misspelling / glued-word repair to the text column of both frames.
df['text'] = df['text'].apply(spacing_some_connect_words)
df_test['text'] = df_test['text'].apply(spacing_some_connect_words)

In [11]:
# Show the first rows of the training frame after the three preprocessing steps.
df.head() 

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this # earthquake...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to ' shelter in place ' ...,1
3,6,,,"13 , 000 people receive # wildfires evacuatio...",1
4,7,,,Just got sent this photo from Ruby # Alaska a...,1


# 載入 distilBERT 模型或 Bert 模型, 將文字編碼

簡單來說，這段程式碼設置了使用 DistilBERT 模型和相應的 tokenizer，並從預訓練的權重中載入它們。這樣就可以使用這個已經在大型文本數據上訓練過的模型來進行文本相關的任務


In [12]:
# Choose between DistilBERT and BERT: keep one of the two alternatives below
# active and leave the other commented out.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load the tokenizer first, then the model, both from the selected pretrained weights.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


- 設定 `model_class`、`tokenizer_class`、`pretrained_weights` 的程式碼：將模型類別、tokenizer 類別與預訓練權重設置為 DistilBERT 模型和相應的 tokenizer，並指定使用的預訓練權重為 'distilbert-base-uncased'，這是一個已經在大型文本數據上訓練過的 DistilBERT 模型。
- `tokenizer = tokenizer_class.from_pretrained(pretrained_weights)`：根據指定的預訓練權重（'distilbert-base-uncased'）載入 DistilBERT 的 tokenizer。這個 tokenizer 會把文本分割成標記（tokens）並進行相應的編碼，轉換為模型可以理解的輸入格式。
- `model = model_class.from_pretrained(pretrained_weights)`：根據指定的預訓練權重（'distilbert-base-uncased'）載入 DistilBERT 模型。這個預訓練的模型將被用於後續的任務，如文本分類或情感分析。

In [13]:
# Optionally shrink the training data to its first TRAIN_SAMPLE_LIMIT rows.
# Set TRAIN_SAMPLE_LIMIT to None to keep every row.
# NOTE(review): the saved outputs further down report 7613 rows, so the recorded
# run appears to have been made without this truncation.
TRAIN_SAMPLE_LIMIT = 4000
df = df[:TRAIN_SAMPLE_LIMIT]

In [13]:
# Print the column names, non-null counts and dtypes of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


# 將訓練資料經由 distilBERT 或 Bert 轉換為 Embedding 編碼

將文本轉換為嵌入編碼的過程通常會使用預訓練模型，例如語言模型 BERT 或詞向量 GloVe。這些模型在大型文本數據集上進行了預訓練，學習到了單詞之間的語義關係。然後，我們可以使用這些預訓練模型來將新的文本轉換為嵌入編碼，以便進行下游的任務，如文本分類、情感分析、語義相似度等。

In [14]:
# Encode every training text with tokenizer.encode, where `tokenizer` is the
# pretrained tokenizer loaded earlier; add_special_tokens=True is passed on each call.
tokenized = df['text'].apply(lambda text: tokenizer.encode(text, add_special_tokens=True))

In [15]:
# Display the encoded training texts (one list of token ids per row).
tokenized

0       [101, 2256, 15616, 2024, 1996, 3114, 1997, 202...
1       [101, 3224, 2543, 2379, 2474, 6902, 3351, 2187...
2       [101, 2035, 3901, 2356, 2000, 1005, 7713, 1999...
3       [101, 2410, 1010, 2199, 2111, 4374, 1001, 3748...
4       [101, 2074, 2288, 2741, 2023, 6302, 2013, 1009...
                              ...                        
7608    [101, 2048, 5016, 27083, 3173, 1037, 2958, 785...
7609    [101, 1030, 9342, 1035, 6289, 19848, 2100, 103...
7610    [101, 23290, 1012, 6365, 1031, 5890, 1024, 584...
7611    [101, 2610, 11538, 2044, 2019, 1041, 1011, 799...
7612    [101, 1996, 6745, 1024, 2062, 5014, 10958, 542...
Name: text, Length: 7613, dtype: object

In [16]:
# Display the preprocessed training texts for comparison with the token ids.
df['text']

0       Our Deeds are the Reason of this  # earthquake...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to  ' shelter in place '  ...
3       13 , 000 people receive  # wildfires evacuatio...
4       Just got sent this photo from Ruby  # Alaska a...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609     @ aria _ ahrary  @ TheTawniest The out of con...
7610    M1.94  [ 01 : 04 UTC ]  ? 5km S of Volcano Haw...
7611    Police investigating after an e-bike collided ...
7612    The Latest :  More Homes Razed by Northern Cal...
Name: text, Length: 7613, dtype: object

In [17]:
# Right-pad every token-id list with zeros up to the length of the longest list.
max_len = max(map(len, tokenized.values), default=0)

padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

（注意：在自己的筆電上執行下一格時，可能會因為記憶體等運算資源不足而無法順利跑完模型。）

In [18]:
# Build the attention mask (1 where the token id is non-zero, 0 on the zero
# padding) and run the padded ids through the model; the embeddings end up in
# last_hidden_states.
attention_mask = np.where(padded != 0, 1, 0)
input_ids = torch.tensor(padded).to(torch.int64)
attention_mask = torch.tensor(attention_mask).to(torch.int64)

# Rows per forward pass. Feeding the whole dataset in one call needs far more
# memory than feeding it in slices, so the data is processed batch by batch.
EMBED_BATCH_SIZE = 100
# Run on the GPU when one is available, but remember where the model lived so
# that it can be put back: later cells feed it CPU tensors.
run_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
home_device = next(model.parameters()).device

cls_chunks = []
model.to(run_device)
try:
    with torch.no_grad():
        for start in range(0, input_ids.shape[0], EMBED_BATCH_SIZE):
            stop = start + EMBED_BATCH_SIZE
            batch_output = model(input_ids[start:stop].to(run_device),
                                 attention_mask=attention_mask[start:stop].to(run_device))
            # Keep only position 0 of every sequence, with the sequence axis
            # preserved as length 1. clone() copies the slice so the full batch
            # output can be freed.
            cls_chunks.append(batch_output[0][:, :1, :].clone().cpu())
finally:
    model.to(home_device)

# Wrapped in a 1-tuple so that the downstream expression
# last_hidden_states[0][:, 0, :] works exactly as before.
last_hidden_states = (torch.cat(cls_chunks, dim=0),)

In [None]:
# Disabled draft of a batched, device-aware variant of the embedding step.
# NOTE(review): it refers to padded_train, device, TensorDataset and DataLoader,
# none of which are defined or imported in the cells shown in this notebook.
# # Set attention_mask and store the embeddings produced by Bert in last_hidden_states
# attention_mask = np.where(padded_train != 0, 1, 0)
# input_ids = torch.LongTensor(padded_train).to(device)
# attention_mask = torch.LongTensor(attention_mask).to(device)

# training_data = TensorDataset(input_ids, attention_mask)
# training_loader = DataLoader(training_data, 100)

# last_hidden_states = []
# with torch.no_grad():
#     for batch in training_loader:
#         ids, mask = [x.to(device) for x in batch]
#         last_hidden_states.append(model(ids, attention_mask=mask)[0][:, 0,:].cpu())
# last_hidden_states = torch.cat(last_hidden_states, dim=0)

In [19]:
# Prepare the inputs of the next stage: the target column as labels and, as
# features, position 0 of every sequence in the embedding result of the previous stage.
labels = df['target']
features = last_hidden_states[0][:, 0].numpy()
# Show the size of a single feature vector.
features[0].shape

(768,)

In [20]:
# Split the embeddings and labels into a training part and a held-out part.
# random_state pins the shuffle, so the split -- and every score computed from
# it below -- is the same on each run of the notebook.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42)

# 使用 Logistic Regression 當作最後一層, 輸出預測結果

In [22]:
# Tune the Logistic Regression hyper-parameter; the classifier acts like a
# single-layer neural network stacked on top of the embeddings.
import sklearn
from sklearn.model_selection import GridSearchCV
# Candidate values for C: 20 evenly spaced numbers from 0.0001 to 100.
parameters = {'C': np.linspace(0.0001, 100, 20)}
# No cv/scoring arguments are passed, so GridSearchCV runs with its defaults.
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)
print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'C': 47.36847368421053}
best scrores:  0.8120506023689547


In [23]:
# Fit Logistic Regression with the best C found by the grid search in the
# previous cell and report its score on the held-out split.
# The value is read from grid_search instead of being copied by hand, so it
# cannot go stale when the search is re-run on a different split.
lr_clf = LogisticRegression(C=grid_search.best_params_['C'])
lr_clf.fit(train_features, train_labels)
lr_clf.score(test_features, test_labels)

0.792016806722689

# 對預測目標資料做出最終預測

In [24]:
# Encode every prediction-target text with tokenizer.encode (add_special_tokens=True).
tokenized_t = df_test['text'].apply(lambda text: tokenizer.encode(text, add_special_tokens=True))

In [25]:
# Right-pad every prediction-target id list with zeros up to the longest list's length.
max_len = max(map(len, tokenized_t.values), default=0)

padded_t = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized_t.values])
# Show the resulting array shape.
padded_t.shape

(3263, 73)

In [26]:
# Build the attention mask (1 where the token id is non-zero, 0 on the zero
# padding) for the prediction-target data and run it through the model; the
# embeddings end up in last_hidden_states.
attention_mask_t = np.where(padded_t != 0, 1, 0)
input_ids = torch.tensor(padded_t).to(torch.int64)
attention_mask_t = torch.tensor(attention_mask_t).to(torch.int64)

# Rows per forward pass. Feeding the whole dataset in one call needs far more
# memory than feeding it in slices, so the data is processed batch by batch.
EMBED_BATCH_SIZE = 100
# Run on the GPU when one is available, but put the model back on the device
# it came from afterwards so the rest of the notebook sees it unchanged.
run_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
home_device = next(model.parameters()).device

cls_chunks_t = []
model.to(run_device)
try:
    with torch.no_grad():
        for start in range(0, input_ids.shape[0], EMBED_BATCH_SIZE):
            stop = start + EMBED_BATCH_SIZE
            batch_output = model(input_ids[start:stop].to(run_device),
                                 attention_mask=attention_mask_t[start:stop].to(run_device))
            # Keep only position 0 of every sequence, with the sequence axis
            # preserved as length 1. clone() copies the slice so the full batch
            # output can be freed.
            cls_chunks_t.append(batch_output[0][:, :1, :].clone().cpu())
finally:
    model.to(home_device)

# Wrapped in a 1-tuple so that the downstream expression
# last_hidden_states[0][:, 0, :] works exactly as before.
last_hidden_states = (torch.cat(cls_chunks_t, dim=0),)

In [27]:
# Predict the prediction-target data: take position 0 of every sequence as the
# feature vector and classify it with the fitted Logistic Regression.
val_features = last_hidden_states[0][:, 0].numpy()
y_pred = lr_clf.predict(val_features)
y_pred

array([1, 1, 1, ..., 1, 1, 0])

In [28]:
# Build the submission file: one row per test id with its predicted target,
# written to CSV without the index column.
submission = pd.DataFrame({'id': df_test['id'], 'target': y_pred})
submission.to_csv('submission_DistilBert.csv', index=False)

In [29]:
# Display the submission table that was written to disk.
submission

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1
