# 第十二週：BERT (Encoder-only-model)
Last modified: 陳文薇 (2024/05/29)<br><br>


### 大綱：
1. 載入套件
2. 資料前處理
3. Sequence classification<br>
  3.1 Sentiment Classification<br>



連接雲端資料夾

In [None]:
import os

from google.colab import drive
# Mount Google Drive at /content/drive so files stored there are reachable.
drive.mount('/content/drive')

# Change this path to your own Google Drive project folder.
os.chdir('/content/drive/MyDrive/sma/proj')
# List the folder contents to confirm the working directory.
os.listdir()

Mounted at /content/drive


['stanford-corenlp.zip',
 'stanza_corenlp',
 'stanford-corenlp-latest.zip',
 'stanford-corenlp',
 'corenlp_server-c7683579fc3048c8.props',
 'corenlp_server-8421c828ca0a4b27.props',
 'corenlp_server-33f5eb266f804605.props',
 'corenlp_server-5a0c78ec523c4fbf.props']

前處理常用套件

In [None]:
#!pip install jieba

In [None]:
import pandas as pd
import re
import numpy as np
from collections import defaultdict
import multiprocessing
import jieba
import matplotlib.pyplot as plt
from matplotlib.font_manager import fontManager

# Font setup: register the Taipei Sans TC font file with matplotlib, then
# make it the sans-serif face and set the base font size in one update.
fontManager.addfont('../TaipeiSansTCBeta-Regular.ttf')
plt.rcParams.update({
    'font.sans-serif': ['Taipei Sans TC Beta'],
    'font.size': '16',
})

transformers 和 Sentence-transformers （使用 huggingface 模型）

In [None]:
#!pip install sentence_transformers
#!pip install ckip_transformers

In [None]:
from transformers import BertTokenizerFast, AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification, pipeline
from sentence_transformers import SentenceTransformer
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker

BERTopic

In [None]:
#!pip install bertopic

In [None]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

### 2. 資料前處理

載入釣魚信資料集

In [None]:
# Load the phishing e-mail dataset from CSV
maildata = pd.read_csv('../raw_data/Phishing_Email.csv')

In [None]:
# Drop rows whose e-mail body is missing.
maildata.dropna(subset=['Email Text'], axis=0, how='any', inplace=True)

# Build the 'sentence' column from the raw body in a single chain:
#   1. replace '\n\n' with '。', then delete the remaining '\n'
#   2. delete every character that is neither a word character nor whitespace
#      (\w is Unicode-aware, so this keeps more than just English letters)
#   3. delete digits
#   4. turn runs of newline / underscore / hyphen into one space
# str(text) in step 2 also turns a missing value into the literal text 'nan',
# which is filtered out right below.
maildata['sentence'] = (
    maildata['Email Text']
    .str.replace(r'\n\n', '。', regex=True)
    .str.replace(r'\n', '', regex=True)
    .apply(lambda text: re.sub(r'[^\w\s]', '', str(text)))
    .str.replace(r'[\d]+', '', regex=True)
    .astype(str)
    .apply(lambda text: re.sub(r'[\n_-]+', ' ', text))
)

# Discard placeholder bodies: the literal text 'empty' (per the original note,
# some e-mails in the dataset are just 'empty') and the 'nan' produced by str().
# .copy() yields an independent frame, so the column assignments below do not
# write into a filtered slice (avoids pandas' SettingWithCopyWarning).
maildata = maildata[~maildata['sentence'].isin(['empty', 'nan'])].copy()

# Remove URLs from the text.
# NOTE(review): punctuation was already stripped above, so "://" and "." are
# gone by now; these patterns match the squashed remainder (e.g. "httpexamplecom")
# rather than the original URL. Confirm this ordering is intended.
maildata['sentence'] = maildata['sentence'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)

# Split into sentences on , ， 。 ！ ! ？ ? and give each piece its own row.
# NOTE(review): step 2 above has already deleted every one of these delimiters
# (including the '。' inserted in step 1), so the split currently yields exactly
# one piece per e-mail. Splitting before the punctuation strip would change the
# row set and all downstream results, so it is left as-is here -- please confirm
# which behaviour is wanted.
maildata['sentence'] = maildata['sentence'].str.split("[,，。！!？?]{1,}")

maildata = maildata.explode('sentence').reset_index(drop=True)
# Keep only sentences longer than one character.
maildata = maildata[maildata["sentence"].str.len() > 1]
# Draw a reproducible sample of up to 100 rows; capping at the row count keeps
# the cell from raising when fewer than 100 rows survive the filters.
maildata = maildata.sample(n=min(100, len(maildata)), random_state=222)
# Rename the CSV's unnamed first column to 'index'.
maildata = maildata.rename(columns={"Unnamed: 0":"index"})
maildata.head(5)

Unnamed: 0,index,Email Text,Email Type,sentence
6974,7192,\nNow you can have HUNDREDS of lenders compete...,Phishing Email,Now you can have HUNDREDS of lenders compete f...
15453,15914,PUBLIC ANNOUNCEMENT:The new .NAME domain exten...,Phishing Email,PUBLIC ANNOUNCEMENTThe new NAME domain extensi...
3275,3386,get the best price on your next car ! exclusiv...,Phishing Email,get the best price on your next car exclusive...
4186,4333,"On Thu, 1 Aug 2002 17:10:48 +0100, John Hinsle...",Safe Email,On Thu Aug John Hinsley wrote No the prob...
6008,6198,santa barbara nexus ezine - great article pert...,Phishing Email,santa barbara nexus ezine great article perta...


In [None]:
# Select the rows whose type label is 'Phishing Email' and print a summary
# (row count, columns, dtypes) of that subset.
Phishing_mails = maildata.loc[maildata["Email Type"].eq('Phishing Email')]
Phishing_mails.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37 entries, 6974 to 11697
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   index       37 non-null     int64 
 1   Email Text  37 non-null     object
 2   Email Type  37 non-null     object
 3   sentence    37 non-null     object
dtypes: int64(1), object(3)
memory usage: 1.4+ KB


#### Sentiment Classification
使用 Huggingface 上面已經針對 Sentiment classification 任務 finetune 的 BERT 模型來實作<br>
情緒：<br>
1: Semi-negation<br>
2: Negation<br>
3: Neutral<br>
4: Semi-positive<br>
5: Positive

In [None]:
# Load a BERT model that has already been fine-tuned for sentiment
# classification; swap in any other Hugging Face model id here if desired.
# NOTE(review): the model id suggests a Chinese sentiment model, while the
# e-mails shown in this notebook are English -- confirm it is a suitable choice.
model_name = "techthiyanes/chinese_sentiment"
model = pipeline(task='sentiment-analysis', model=model_name)


In [None]:
# Create the results table: start from an empty frame with the three result
# columns, then fill 'sentence' from the phishing subset.
results_df = pd.DataFrame(columns=['sentence', 'label', 'score']).assign(
    sentence=Phishing_mails['sentence']
)

# Sentiment-analysis helper
def analyze_sentiment(sentence, max_length=512):
    """Classify one sentence with the module-level sentiment pipeline ``model``.

    Args:
        sentence: Text to classify.
        max_length: Limit applied twice -- first as a character cap on the
            raw text, then as the tokenizer's maximum token count.

    Returns:
        pd.Series: ``[label, score]`` taken from the pipeline's first result.
    """
    # Slicing caps the number of *characters*, not tokens, so on its own it
    # cannot guarantee the tokenized input (plus special tokens) stays within
    # the 512-token limit. The cut is kept so existing results do not change.
    truncated_sentence = sentence[:max_length]
    # Let the tokenizer enforce the actual token limit.
    result = model([truncated_sentence], truncation=True, max_length=max_length)
    return pd.Series([result[0]['label'], result[0]['score']])

# Run the sentiment analysis on every sentence with apply; the two values
# returned per row are stored in the 'label' and 'score' columns.
results_df[['label', 'score']] = results_df['sentence'].apply(analyze_sentiment)

# Display the results
results_df

Unnamed: 0,sentence,label,score
6974,Now you can have HUNDREDS of lenders compete f...,star 5,0.472031
15453,PUBLIC ANNOUNCEMENTThe new NAME domain extensi...,star 4,0.534208
3275,get the best price on your next car exclusive...,star 5,0.446151
6008,santa barbara nexus ezine great article perta...,star 4,0.533679
13009,Big and bigMAIN PAGEHuge big titties bigbigsc...,star 4,0.565446
14996,fwd finally a smart sp m control solution y...,star 5,0.395988
16620,buy office xp for fifty bucks percentage htmlh...,star 5,0.301693
14334,Â Â Hi Jm Thanks ...,star 1,0.378892
7684,you don t know how to attract customers to y...,star 5,0.42477
13502,work at home a month earn extra income from...,star 5,0.344362


結果顯示大多數釣魚信件情緒偏正面