In [1]:
import pandas as pd 
import numpy as np
from bs4 import BeautifulSoup
import requests
from urllib import request
import json
import csv
import transformers

In [2]:
page = '1'

In [3]:
# 較多好評
url = 'https://api.hahow.in/api/courses/5ebca40b454a0417c5880c8e/feedbacks?limit=20&page=2'

In [4]:
# 較多差評
url = 'https://api.hahow.in/api/courses/592182214781880700a4ae33/feedbacks?limit=20&page='+str(page)

In [5]:
# html = request.urlopen(url).read()
# soup = BeautifulSoup(html, 'html.parser')
# site_json_list = json.loads(soup.text)

In [6]:
# documents = []
# for feedback_js_dict in site_json_list:
#     documents.append(feedback_js_dict['description'])
# print('評論數: {}'.format(len(documents)))

In [7]:
# documents = [docu.replace('\n','') for docu in documents]

In [8]:
# documents

---

## 資料存入本地端

In [9]:
# with open('hahow_feedback.txt','w') as f:
#     for document in documents:
# #         print(document)
#         f.write(document+'\n')

In [10]:
# documents[0]

## 寫入csv 

In [11]:
# documents[2]

In [12]:
# with open('hahow_feedback.csv', 'a', newline='') as f:
#     writer = csv.writer(f, delimiter= ',')
# #     writer.writerow(['文本','情緒標記'])
#     for idx, doc in enumerate(documents):
# #         print(idx, end='\r')
#         try:
#             writer.writerow([doc])
#         except:
#             print('idx {} not witten to file'.format(idx))

In [13]:
# docu_json_str

In [14]:
# docu_json_str = json.dump(site_json_list)
# with open('hahow_feedback.json','w', encoding= 'utf-8') as f:
#         f.write(docu_json_str)

## 讀取本地端的feedback json檔

In [15]:
# with open('hahow_feedback.txt', 'r') as f:
# #     dcts = json.load(f) # json.load -> file 
#     fbs = f.readlines()

In [16]:
# fbs = [fb.replace('\n','') for fb in fbs]

In [17]:
# fbs[1]

## 讀取 csv 檔

In [18]:
df = pd.read_csv('hahow_feedback.csv', delimiter=',',encoding='ANSI')

In [19]:
print('文本數: {}\n'.format(len(df)))
print('情緒標記比例:\n{}'.format(df['情緒標記'].value_counts()))

文本數: 87

情緒標記比例:
1.0        40
0.0        22
1000.0     11
100.0      10
0.5         2
10000.0     1
10.0        1
Name: 情緒標記, dtype: int64


---

# 資料前處理

In [20]:
import torch
from torch.utils.data import Dataset, DataLoader

In [21]:
df.head(3)

Unnamed: 0,文本,情緒標記
0,上完課後，真的很慶幸有買到課程並真心覺得物超所值，對於新手的我，淺顯易懂很容易就了解製作po...,1.0
1,"是很值得的課, 對入門很有用",1.0
2,很棒的課，獲益良多！謝謝老師。,1.0


---
---

# Bert input 處理

In [22]:
from transformers import BertTokenizer

In [23]:
pretrained_model_name = 'bert-base-chinese'

In [24]:
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)

### 測試 tokenizer

In [25]:
sample_txt = '很棒的課，獲益良多！謝謝老師。'

In [26]:
tokens = tokenizer.tokenize(sample_txt)

In [27]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [28]:
tokens

['很', '棒', '的', '課', '，', '獲', '益', '良', '多', '！', '謝', '謝', '老', '師', '。']

In [29]:
print(token_ids)

[2523, 3472, 4638, 6307, 8024, 4363, 4660, 5679, 1914, 8013, 6342, 6342, 5439, 2374, 511]


In [30]:
print('len of the tokens: {}'.format(len(tokens)))

len of the tokens: 15


### Special tokens

In [31]:
print(tokenizer.sep_token, tokenizer.sep_token_id)
print(tokenizer.cls_token, tokenizer.cls_token_id)
print(tokenizer.pad_token, tokenizer.pad_token_id)
print(tokenizer.unk_token, tokenizer.unk_token_id)

[SEP] 102
[CLS] 101
[PAD] 0
[UNK] 100


### Encode 句子(符合Bert要求input格式)

In [32]:
encoding = tokenizer.encode_plus(
    sample_txt,
    max_length= 50,
#     truncation= True,
    padding= 'max_length',
    add_special_tokens=True,
#     pad_to_multiple_of=True,
    return_attention_mask= True,
    return_token_type_ids= False,
    return_tensors='pt')

In [33]:
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [34]:
print('txt encoded format for bert input:\n\n {}'.format(encoding['input_ids'][0]))

txt encoded format for bert input:

 tensor([ 101, 2523, 3472, 4638, 6307, 8024, 4363, 4660, 5679, 1914, 8013, 6342,
        6342, 5439, 2374,  511,  102,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0])


----
----

## Pytorch Dataset 準備 

In [35]:
class docs_dataset(Dataset):
    
    def __init__(self, feedbacks, labels, tokenizer, max_length):
        self.feedbacks = feedbacks
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.feedbacks)
    
    def __getitem__(self, x):
        
        feedback = str(self.feedbacks.iloc[x])
        
        encoding = tokenizer.encode_plus(
            feedback,
#             truncation= True,
            max_length= self.max_length,
            padding = 'max_length',
            add_special_tokens=True,
#             pad_to_multiple_of=True,
            return_attention_mask= True,
            return_token_type_ids= False,
            return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label' : torch.tensor(self.labels.iloc[x], dtype= torch.long),
            'data_idx' : x
        }

## 為後續訓練分成batch 做 DataLoader 
首先把資料集劃分為訓練和測試資料

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
df_x = df['文本']
df_y = df['情緒標記']

In [38]:
train_x, test_x, train_y, test_y = train_test_split(df_x, df_y, test_size= 0.2, random_state= 42)

In [39]:
train_x.head(3)

55                                   跟我想的不一樣
73          課程內容簡介與實際落差甚大.....好後悔買這堂課程來觀看~~~
11    課程從podcast主題規劃開始介紹，由淺入深，非常適合新手來學習！謝謝老師
Name: 文本, dtype: object

### 建立 DataLoader 為後續模型訓練

In [40]:
batchsize = 2
maxlength = 400

### 已經將整份文本以8:2比例分成訓練和測試集，僅將訓練集建立dataloader

In [41]:
train_dataset = docs_dataset(feedbacks=train_x, labels= train_y, tokenizer = tokenizer, max_length= maxlength)

In [42]:
train_dataloader = DataLoader(train_dataset, batch_size=batchsize)

In [43]:
data = next(iter(train_dataloader))
data.keys()

dict_keys(['input_ids', 'attention_mask', 'label', 'data_idx'])

In [44]:
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['label'].shape)
print(data['data_idx'])

torch.Size([2, 400])
torch.Size([2, 400])
torch.Size([2])
tensor([0, 1])


----

# 學生回饋情感 分類器建立
這部分利用bert pretrained model 並針對實際問題(文本) fine tuning

In [45]:
import torch.nn as nn
from transformers import BertModel

In [46]:
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes, dropout_p = 0.3):
        super(SentimentClassifier, self).__init__()
        self.bertModel = BertModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(p= dropout_p)
        self.linear = nn.Linear(self.bertModel.config.hidden_size, n_classes)
        self.softmax = nn.Softmax(dim= 1)
    
    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bertModel(input_ids, attention_mask)
        output = self.dropout(pooled_output)
        output = self.linear(output)
        return self.softmax(output)

https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

### 設定模型情緒分類模型

In [47]:
SentModel = SentimentClassifier(n_classes= 7)

## 未訓練前模型預測結果

In [56]:
for idx, dict_ in enumerate(train_dataloader):
#     print(dict_['input_ids'].shape)
    input_ids_batch = dict_['input_ids']
    masks_batch = dict_['attention_mask']
    labels_batch = dict_['label']
    data_idx_batch = dict_['data_idx']
    
    outputs= SentModel(input_ids_batch, masks_batch)
    _, predicted_cls_idxs_batch = torch.max(outputs.data, dim=1)
    
    print('訓練文本index: {}\n'.format(data_idx_batch))
    print('batch 中每個文本的class預測(未訓練前):\n{}\n'.format(predicted_cls_idxs_batch))
    print('batch 中每個文本的真實情緒標記:\n{}'.format(labels_batch))
    break
#     for i in range(len(data_idx_batch)):

測試文本index: tensor([0, 1])

batch 中每個文本的class預測(未訓練前):
tensor([0, 3])

batch 中每個文本的真實情緒標記:
tensor([1000,    0])


In [71]:
print('index 0 文本內容:\n\n{}\n\nindex 0 真實標記: {}'.format(train_x.iloc[0], int(train_y.iloc[0])))

index 0 文本內容:

跟我想的不一樣

index 0 真實標記: 1000


------
------

# 模型訓練

In [50]:
from torch import optim
from torchsummary import summary

### 參數設定

In [51]:
batch_size = 10
lr = 0.001
epochs = 20

optimizer = optim.SGD(SentModel.parameters(), lr= lr, momentum= 0.9, weight_decay= 0.0001)

In [52]:
# summary(SentModel, input_size= [(2,400),(2,400)])

In [53]:
def train():
    print('training...')

In [54]:
if __name__ == '__main__':
    train()

training...
