# Version 1

In [None]:
# Prefix of the poem corpus on the mounted Google Drive. Category names are
# appended directly to it, so the trailing slash is required.
root_path = "/content/drive/MyDrive/CSI5386/categoriespoems/"

# The nine major categories. Each one names a folder under root_path, and the
# order of this list is the order of the label columns built from it.
cate_list = [
    'Activities',
    'Arts & Sciences',
    'Living',
    'Love',
    'Mythology & Folklore',
    'Nature',
    'Relationships',
    'Religion',
    'Social Commentaries',
]

In [None]:
import os

import pandas as pd

In [None]:
def formattingData(root_path, cate_list):
    '''
    Read the poem files under each category folder and build a Poem/Tags table.

    Every folder ``root_path + cate`` is walked recursively. A ``.txt`` file is
    parsed at most once per file name: a name already seen (in this or an
    earlier category folder) is skipped. The file text is split on newlines;
    line index 5 holds the comma-separated labels and the lines from index 9
    onward hold the poem body.

    Parameters
    ----------
    root_path : str
        Prefix prepended directly to each category name, so it must end with
        a path separator.
    cate_list : list of str
        Names of the major categories. Used both as folder names and as the
        whitelist for filtering labels.

    Returns
    -------
    pandas.DataFrame
        One row per parsed file with columns ``"Poem"`` (poem text with all
        whitespace runs collapsed to single spaces) and ``"Tags"`` (the labels
        that belong to ``cate_list``, stripped and joined by ``","``).

    Notes
    -----
    Files that cannot be read, decoded as UTF-8, or that have fewer than six
    lines are reported with ``Error on <path>`` and skipped.
    '''
    seen_files = set()   # set membership instead of a list scan per file
    poem_list = []
    labels_list = []

    for cate in cate_list:
        cate_path = root_path + cate
        print("Processing on %s" % cate_path)
        for maindir, subdir, file_name_list in os.walk(cate_path):
            for filename in file_name_list:
                if filename.split('.')[-1] != 'txt' or filename in seen_files:
                    continue
                seen_files.add(filename)
                fpath = os.path.join(maindir, filename)
                try:
                    with open(fpath, 'r', encoding='utf-8') as f:
                        fcontent = f.read().split('\n')
                    raw_labels = fcontent[5].split(',')
                    # Join lines with a space so a line without trailing
                    # whitespace is not glued to the next one; split()/join
                    # then collapses every whitespace run.
                    poem = " ".join(" ".join(fcontent[9:]).split())
                except (OSError, UnicodeDecodeError, IndexError):
                    # Only expected per-file failures are skipped; anything
                    # else (including KeyboardInterrupt) propagates.
                    print("Error on %s" % fpath)
                    continue

                # Strip each label and keep only the major categories.
                stripped = (raw.strip() for raw in raw_labels)
                kept = [label for label in stripped if label in cate_list]

                poem_list.append(poem)
                labels_list.append(",".join(kept))

    # Transfer to DataFrame
    data = pd.DataFrame({
        "Poem": poem_list,
        "Tags": labels_list,
    })
    return data


In [None]:
# Parse every category folder into one DataFrame with "Poem" and "Tags" columns.
data = formattingData(root_path, cate_list)

Processing on /content/drive/MyDrive/CSI5386/categoriespoems/Activities
Processing on /content/drive/MyDrive/CSI5386/categoriespoems/Arts & Sciences
Processing on /content/drive/MyDrive/CSI5386/categoriespoems/Living
Processing on /content/drive/MyDrive/CSI5386/categoriespoems/Love
Processing on /content/drive/MyDrive/CSI5386/categoriespoems/Mythology & Folklore
Processing on /content/drive/MyDrive/CSI5386/categoriespoems/Nature
Processing on /content/drive/MyDrive/CSI5386/categoriespoems/Relationships
Processing on /content/drive/MyDrive/CSI5386/categoriespoems/Religion
Processing on /content/drive/MyDrive/CSI5386/categoriespoems/Social Commentaries


In [None]:
# Preview the first rows of the parsed table.
data.head()

Unnamed: 0,Poem,Tags
0,After reading Ash Wednesday she looked once at...,"Religion,Arts & Sciences,Activities"
1,"Return The taste is strong as ever, figs and c...","Arts & Sciences,Relationships,Nature,Activitie..."
2,The gregarious dark is shifting when she puts ...,"Love,Living,Relationships,Activities"
3,It seemed those rose-pink dishes she kept for ...,"Living,Relationships,Activities,Social Comment..."
4,"STUDY IN WHITES Wax-white-- Floor, ceiling, wa...",Activities


In [None]:
% pip install pytorch-pretrained-bert
% pip install transformers


Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |████████████████████████████████| 133kB 8.5MB/s 
[?25hCollecting boto3
[?25l  Downloading https://files.pythonhosted.org/packages/9b/e5/96f7156e6ebf7ab992471479c3c55f0be2f31360fcdcac21aa6f782c036a/boto3-1.17.57-py2.py3-none-any.whl (131kB)
[K     |████████████████████████████████| 133kB 14.3MB/s 
Collecting s3transfer<0.5.0,>=0.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/63/d0/693477c688348654ddc21dcdce0817653a294aa43f41771084c25e7ff9c7/s3transfer-0.4.2-py2.py3-none-any.whl (79kB)
[K     |████████████████████████████████| 81kB 8.6MB/s 
[?25hCollecting botocore<1.21.0,>=1.20.57
[?25l  Downloading https://files.pythonhosted.org/packages/94/52/aa266c9594e279799ded419caac56365796ce686b97762b9c8620b2ba988/botocore-1.20.57-py2.py3-none-any.whl 

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import torch
from os.path import dirname , realpath
import pandas as pd
from tqdm import tqdm 
from pdb import set_trace
# from pytorch_pretrained_bert import BertTokenizer
from transformers import BertTokenizer
# Module-level tokenizer for the 'bert-base-uncased' checkpoint, lower-casing its input.
# torch_dataset() reads this global rather than taking a tokenizer argument.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def get_current_project_directory():
    """Return the absolute directory this code lives in.

    When the code runs from a file, that is the directory of ``__file__``
    with symlinks resolved. In an interactive or notebook namespace
    ``__file__`` is not defined, so the resolved current working directory
    is returned instead of raising ``NameError``.

    Returns
    -------
    str
        An absolute directory path.
    """
    try:
        return dirname(realpath(__file__))
    except NameError:
        # No __file__ in this namespace (e.g. a notebook cell): use the CWD.
        return realpath(".")

def tags_to_binary(tags, categories=None):
    """Encode a comma-separated tag string as a multi-hot list.

    Parameters
    ----------
    tags : str
        Tags joined by ``","``, e.g. ``"Love,Nature"``. Whitespace around
        each tag is ignored.
    categories : sequence of str, optional
        Category names that define the output positions. Defaults to the
        module-level ``cate_list``.

    Returns
    -------
    list of int
        One entry per category, in category order: 1 when that category is
        among ``tags``, otherwise 0. Tags that are not a known category are
        ignored, so an input with no known tag yields all zeros.
    """
    if categories is None:
        categories = cate_list
    # Size the vector from the category list instead of a hard-coded 9.
    present = {tag.strip() for tag in tags.split(',')}
    return [1 if category in present else 0 for category in categories]


def torch_dataset(df, labels, label='Label', max_length=512, save_path=None):
    """Tokenize every poem, bundle the tensors into a TensorDataset and pickle it.

    Parameters
    ----------
    df : object with a ``Poem`` attribute
        Iterated as ``df.Poem``; each item is passed to the module-level
        ``tokenizer`` as one text.
    labels : object with ``to_numpy()``
        Converted with ``torch.tensor(labels.to_numpy())``. Presumably one row
        of category indicators per poem -- TODO confirm against the caller.
    label : str, optional
        Unused; kept so existing callers keep working.
    max_length : int, optional
        Length every encoded poem is padded or truncated to. Default 512.
    save_path : str, optional
        Where the pickled dataset is written. Defaults to
        ``/content/drive/MyDrive/CSI5386/<max_length>size_dataset.pkl``, which
        for the default ``max_length`` is the path this function always used.

    Returns
    -------
    torch.utils.data.TensorDataset
        Built from ``(input_ids, attention_masks, labels)``.

    Side effects
    ------------
    Writes the dataset to ``save_path`` with ``pickle``.
    """
    from torch.utils.data import TensorDataset

    if save_path is None:
        save_path = f'/content/drive/MyDrive/CSI5386/{max_length}size_dataset.pkl'

    input_ids = []
    attention_masks = []

    for poem in tqdm(df.Poem):
        encoded_dict = tokenizer.encode_plus(
            poem,
            max_length=max_length,
            # Explicit replacements for the deprecated pad_to_max_length=True
            # and for the implicit truncation the tokenizer warned about.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,   # Construct attn. masks.
            return_tensors='pt',          # Return pytorch tensors.
        )
        # Add the encoded sentence to the list.
        input_ids.append(encoded_dict['input_ids'])

        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(encoded_dict['attention_mask'])

    # Stack the per-poem tensors along dim 0.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels.to_numpy())

    dataset = TensorDataset(input_ids, attention_masks, labels)
    with open(save_path, 'wb') as f:
        pickle.dump(dataset, f)
    return dataset

In [None]:

# Multi-hot encode each poem's tag string, then lay the vectors out as one
# column per category (column order follows cate_list).
label_vectors = data['Tags'].apply(tags_to_binary)
labels = pd.DataFrame(
    data=np.array(label_vectors.to_numpy().tolist()),
    columns=cate_list)

# Tokenize the poems; torch_dataset also pickles the result to
# /content/drive/MyDrive/CSI5386/512size_dataset.pkl.
torch_dataset(data, labels)

# Reload the file that was just written, so this cell works on a fresh run.
# NOTE(review): this cell previously loaded 256size_dataset.pkl, which nothing
# in this notebook produces; switch the path back if that file is the intended
# input. pickle.load is only safe on files you created yourself.
with open("/content/drive/MyDrive/CSI5386/512size_dataset.pkl", "rb") as dataset_file:
    s = pickle.load(dataset_file)

  0%|          | 0/11650 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 11650/11650 [01:20<00:00, 143.84it/s]


In [None]:
# Open one raw poem file from the first category to inspect the on-disk layout.
# The handle is left open on purpose: the following cell calls f.read() on it.
# NOTE(review): it is not closed anywhere in the cells shown here.
f = open(f'{root_path+cate_list[0]}/Eating & Drinking/poem.11288.txt', 'r')


In [None]:
# Show the raw text of the sample file opened in the previous cell.
f.read()

'Remarks on Poetry and the Physical World\nMary Barnard\n20\n\n\nReligion, Arts & Sciences, Eating & Drinking, Christianity, Activities, Poetry & Poets\n\nFree Verse\n\nAfter reading Ash Wednesday \nshe looked once at the baked beans   \nand fled. Luncheonless, poor girl,   \nshe observed a kind of poetic Lent-- \nand I had thought I liked poetry   \nbetter than she did. \n\nI do. But to me its most endearing \nquality is its unsuitableness; \nand, conversely, the chief wonder in heaven   \n(whither I also am sometimes transported)   \nis the kind of baggage I bring with me. \n\nSurely there is no more exquisite jointure   \nin the anatomy of life than that at which   \npoetry dovetails with the inevitable meal   \nand Mrs. B. sits murmuring of avocados. \n\n'

In [None]:
# Fetch the training code; git places it in ./csi5386_bert_poem_clfier.
! git clone https://github.com/Riotpiaole/csi5386_bert_poem_clfier.git

Cloning into 'csi5386_bert_poem_clfier'...
remote: Enumerating objects: 68, done.[K
remote: Counting objects: 100% (68/68), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 68 (delta 37), reused 38 (delta 16), pack-reused 0[K
Unpacking objects: 100% (68/68), done.


### Error because of path setting

In [None]:
# Run the training script from the current working directory.
# NOTE(review): the repository was cloned into csi5386_bert_poem_clfier/;
# confirm train_multi_class.py is reachable from here.
!python train_multi_class.py

2021-04-24 19:04:16.914464: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB

Training...

  Average training loss: 17250.11
  Training epcoh took: 0:09:07

Running Validation...
recall 20.00 % precision 25.00 % f1 score 22.22 %
recall 42.86 % precision 100.00 % f1 score 60.00 %
recall 28.57 % precision 66.67 % f1 score 40.00 %
recall 25.00 % precision 66.67 % f1 score 36.36 %
recall 30.77 % precision 80.00 % f1 score 44.44 %
recall 50.00 % precision 100.00 % f1 score 66.67 %
recall 12.50 % precision 100.00 % f1 score 22.22 %
recall 14.29 % precision 100.00 % f1 score 25.00 %
recall 37.50 % precision 75.00 % f1 score 50.00 %
recall 28.57 % precision 66.67 % f1 score 40.00 %
recall 66.67 % precision 66.67 % f1 score 66.67 %
recall 20.00 % precision 100.00 % f1 score 33.33 %
recall 50.00 % precision 66.67 % f1 score 57.14 %
recall 33.33 % precision 

### 2nd attempt

In [None]:
# Second run of the same training script; only the tail of its output is kept below.
!python train_multi_class.py

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
recall 66.67 % precision 66.67 % f1 score 66.67 %
recall 44.44 % precision 57.14 % f1 score 50.00 %
recall 50.00 % precision 71.43 % f1 score 58.82 %
recall 28.57 % precision 66.67 % f1 score 40.00 %
recall 50.00 % precision 40.00 % f1 score 44.44 %
recall 57.14 % precision 50.00 % f1 score 53.33 %
recall 87.50 % precision 87.50 % f1 score 87.50 %
recall 87.50 % precision 70.00 % f1 score 77.78 %
recall 62.50 % precision 71.43 % f1 score 66.67 %
recall 60.00 % precision 60.00 % f1 score 60.00 %
recall 55.56 % precision 62.50 % f1 score 58.82 %
recall 66.67 % precision 50.00 % f1 score 57.14 %
recall 71.43 % precision 71.43 % f1 score 71.43 %
recall 71.43 % precision 83.33 % f1 score 76.92 %
recall 50.00 % precision 71.43 % f1 score 58.82 %
recall 85.71 % precision 100.00 % f1 score 92.31 %
recall 55.56 % precision 71.43 % f1 score 62.50 %
recall 33.33 % precision 57.14 % f1 score 42.11 %
recall 75.00 % precision 90.00 % f1 score 81.82 %
recall 5