In [113]:
import pandas as pd 
import numpy as np
from bs4 import BeautifulSoup
import requests
from urllib import request
import json
import csv
import transformers

In [2]:
# Feedback endpoint of the Hahow API for one course id; the query string
# requests page 0 with limit=20 (presumably at most 20 feedbacks per page).
url = 'https://api.hahow.in/api/courses/5ebca40b454a0417c5880c8e/feedbacks?limit=20&page=0'

In [13]:
# html = request.urlopen(url).read()
# soup = BeautifulSoup(html, 'html.parser')
# site_json_list = json.loads(soup.text)

In [107]:
# documents = []
# for feedback_js_dict in site_json_list:
#     documents.append(feedback_js_dict['description'])
# print('評論數: {}'.format(len(documents)))

In [100]:
# documents = [docu.replace('\n','') for docu in documents]

---

## 資料存入本地端

In [106]:
# with open('hahow_feedback.txt','w') as f:
#     for document in documents:
# #         print(document)
#         f.write(document+'\n')

In [135]:
# documents[0]

## 寫入csv 

In [159]:
# with open('hahow_feedback.csv', 'w', newline='') as f:
#     writer = csv.writer(f, delimiter= ',')
#     writer.writerow(['文本','情緒標記'])
#     for doc in documents:
#         writer.writerow([doc])

In [63]:
# docu_json_str

In [62]:
# docu_json_str = json.dumps(site_json_list)
# with open('hahow_feedback.json','w', encoding= 'utf-8') as f:
#         f.write(docu_json_str)

## 讀取本地端的 feedback 文字檔 (txt)

In [103]:
# with open('hahow_feedback.txt', 'r') as f:
# #     dcts = json.load(f) # json.load -> file 
#     fbs = f.readlines()

In [108]:
# fbs = [fb.replace('\n','') for fb in fbs]

In [110]:
# fbs[1]

'是很值得的課, 對入門很有用'

## 讀取 csv 檔

In [173]:
# Load the feedback CSV into a DataFrame.
# NOTE(review): 'ANSI' is only resolved by Python's codec lookup on Windows
# (alias of 'mbcs', i.e. the active code page); on other platforms this call
# is expected to raise LookupError. TODO confirm the file's real encoding and
# name it explicitly.
df = pd.read_csv('hahow_feedback.csv', delimiter=',',encoding='ANSI')

---

# 資料前處理

In [218]:
import torch
from torch.utils.data import Dataset, DataLoader

In [178]:
df.head(3)

Unnamed: 0,文本,情緒標記
0,上完課後，真的很慶幸有買到課程並真心覺得物超所值，對於新手的我，淺顯易懂很容易就了解製作po...,
1,"是很值得的課, 對入門很有用",
2,很棒的課，獲益良多！謝謝老師。,


# Bert input 處理

In [181]:
from transformers import BertTokenizer

In [182]:
pretrained_model_name = 'bert-base-chinese'

In [183]:
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)

### 測試 tokenizer

In [184]:
sample_txt = '很棒的課，獲益良多！謝謝老師。'

In [185]:
tokens = tokenizer.tokenize(sample_txt)

In [None]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [186]:
tokens

['很', '棒', '的', '課', '，', '獲', '益', '良', '多', '！', '謝', '謝', '老', '師', '。']

In [189]:
print(token_ids)

[2523, 3472, 4638, 6307, 8024, 4363, 4660, 5679, 1914, 8013, 6342, 6342, 5439, 2374, 511]


In [203]:
print('len of the tokens: {}'.format(len(tokens)))

len of the tokens: 15


### Special tokens

In [193]:
# Show each special token of the tokenizer next to its id, one pair per line.
print(*(
    f"{getattr(tokenizer, name + '_token')} {getattr(tokenizer, name + '_token_id')}"
    for name in ('sep', 'cls', 'pad', 'unk')
), sep='\n')

[SEP] 102
[CLS] 101
[PAD] 0
[UNK] 100


### Encode 句子(符合Bert要求input格式)

In [196]:
# Encode one sentence into BERT input tensors ([CLS] ... [SEP] ids plus attention mask).
# `pad_to_multiple_of` takes an integer and only matters when a padding strategy
# is enabled, so the stray `pad_to_multiple_of=True` was removed; this single
# sample is left unpadded.
encoding = tokenizer.encode_plus(
    sample_txt,
    add_special_tokens=True,
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='pt')

In [198]:
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [206]:
print('txt encoded format for bert input:\n\n {}'.format(encoding['input_ids'][0]))

txt encoded format for bert input:

 tensor([ 101, 2523, 3472, 4638, 6307, 8024, 4363, 4660, 5679, 1914, 8013, 6342,
        6342, 5439, 2374,  511,  102])


## Pytorch Dataset 準備 

In [209]:
class docs_dataset(Dataset):
    """Map-style torch ``Dataset`` that turns feedback texts and labels into BERT inputs.

    Each item is tokenized on access with the tokenizer handed to the
    constructor and padded/truncated to ``max_len`` tokens, so every item has
    the same shape and the default ``DataLoader`` collation can stack a batch.

    Args:
        feedbacks: Sequence of feedback texts (e.g. a list or a pandas Series).
        labels: Sequence of integer class labels aligned with ``feedbacks``.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_len: Fixed token length of every item, special tokens included.
    """

    def __init__(self, feedbacks, labels, tokenizer, max_len=128):
        self.feedbacks = feedbacks
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of feedback texts."""
        return len(self.feedbacks)

    @staticmethod
    def _at(values, position):
        """Return the element at integer ``position`` of ``values``."""
        # A pandas Series produced by train_test_split keeps its original
        # (shuffled) index labels, so ``values[position]`` would be a label
        # lookup; ``.iloc`` forces positional access.
        if hasattr(values, 'iloc'):
            return values.iloc[position]
        return values[position]

    def __getitem__(self, x):
        """Return the encoded sample at position ``x``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (1-D tensors of
            length ``max_len``) and ``label`` (0-D ``torch.long`` tensor).

        Raises:
            ValueError: if the label at position ``x`` is NaN (unlabelled row).
        """
        feedback = str(self._at(self.feedbacks, x))
        label = self._at(self.labels, x)

        # NaN is the only value that differs from itself. Fail loudly on an
        # unlabelled row instead of casting NaN to an integer class id.
        if label != label:
            raise ValueError('missing label for sample at position {}'.format(x))

        # Use the tokenizer stored on the instance, not a notebook-level global.
        encoding = self.tokenizer.encode_plus(
            feedback,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt')

        return {
            # return_tensors='pt' yields shape (1, max_len); drop the leading
            # dimension so a DataLoader batch becomes (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

## 為後續訓練分成batch 做 DataLoader 
首先把資料集劃分為訓練和測試資料

In [210]:
from sklearn.model_selection import train_test_split

In [212]:
df_x = df['文本']
df_y = df['情緒標記']

In [215]:
# Hold out 20% of the rows as the test split; random_state pins the shuffle so
# the same split is produced on every run.
train_x, test_x, train_y, test_y = train_test_split(df_x, df_y, test_size= 0.2, random_state= 42)

In [217]:
train_x.head(3)

8                               完整的內容與清晰的課程，讓人明確知道如何實踐。
5     不管是ID3 tag或是上架的部分，這幾塊都是我自己摸索卡了好久的地方，整理的都很直白容易理...
11               課程從podcast主題規劃開始介紹，由淺入深，非常適合新手來學習！謝謝老師
Name: 文本, dtype: object

### 建立 DataLoader 為後續模型訓練

In [220]:
batchsize = 2

In [219]:
train_dataset = docs_dataset(feedbacks=train_x, labels= train_y, tokenizer = tokenizer)

In [221]:
# Serve train_dataset in batches of `batchsize` items. No shuffle argument is
# passed, so the DataLoader's default ordering applies.
train_dataloader = DataLoader(train_dataset, batch_size=batchsize)