# install

In [7]:
!pip install torch
!pip install transformers
!pip install sentencepiece
!pip install protobuf
!pip install gensim
!pip install -U scikit-learn
!pip install pandas

Collecting transformers
  Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
     ---------------------------------------- 0.0/123.5 kB ? eta -:--:--
     -------------------------------------- 123.5/123.5 kB 7.1 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.4-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2023.10.3-cp310-cp310-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ---------------------------------------- 42.0/42.0 kB ? eta 0:00:00
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp310-none-win_amd64.whl.metadata (6.8 kB)
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-none-win_amd64.whl.metadata (3.8 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.66.1-py3-none-any.whl.metadata (57 kB)




# data_reader

In [1]:
import itertools
import dataclasses
from typing import Iterable, Optional, Union, Dict, Tuple, List


@dataclasses.dataclass
class Sentence:
    id: int
    text: str
    context: str
    label: str

    def to_dict(self) -> Dict[str, Union[str, int]]:
        return dataclasses.asdict(self)


def balance_targets(sentences: Iterable[Sentence], method: str = "downsample_o_cat", shuffle=True) \
        -> Iterable[Sentence]:
    """
    Oversamples and/or undersamples training sentences by a number of targets.
    This is useful for linear shallow classifiers, that are prone to simply overfit the most-occurring category.
    See the source code for a documentation of resample methods logic
    :param shuffle: whether to shuffle the output
    :param sentences: sentences to resample
    :param method: resample method, one of {downsample_o_cat, downsample_o_pzk_cats, all_upsampled, remove_o_cat}
    :return: resampled, possibly shuffled input sentences
    """
    import random
    # take the second-top count from categories apart from "Other"
    targets = [s.label for s in sentences]
    second_top_count = sorted([sum([target == cat for target in targets]) for cat in set(targets) - {"O"}])[-2]
    if method == "downsample_o_cat":
        # downsample "other" category to second-most-occurring category count
        out_sentences = list((random.sample([s for s in sentences if s.label == "O"], second_top_count) +
                         [s for s in sentences if s.label != "O"]))
    elif method == "downsample_o_pzk_cats":
        # downsample "other" + "P_ZK" (experience description) category to third-most-occurring category count
        out_sentences = list((random.sample([s for s in sentences if s.label == "O"], second_top_count) +
                         [s for s in sentences if s.label != "O"]))
        out_sentences = list((random.sample([s for s in out_sentences if s.label == "P_ZK"], second_top_count) +
                         [s for s in out_sentences if s.label != "P_ZK"]))
    elif method == "all_upsampled":
        # upsample all categories to a count of most-occurring one (presumably "other" category)
        from itertools import chain
        out_sentences = list(itertools.chain(*[random.choices([s for s in sentences if s.label == cat],
                                                              k=second_top_count) for cat in set(targets)]))
    elif method == "remove_o_cat":
        # completely remove sentences of "other" category
        out_sentences = [s for s in sentences if s.label != "O"]
    else:
        out_sentences = sentences
    if shuffle:
        # random shuffle output sentences
        random.shuffle(out_sentences)
    return out_sentences


def get_sentence_vertical(sentences_dir: str, confidence_thrd: Optional[int] = 0) -> 'DataFrame':
    """
    Creates a tab-separated csv table with sentences_text, tags, users and sources, in out_table_path
    :param sentences_dir: directory of input sentences, divided to [train, val, test] subdirectories
    :param confidence_thrd: minimal mean confidence threshold of the retrieved sentences
    :return: Dataframe with attributes of retrieved sentences
    """
    from itertools import chain
    import pandas as pd  # if you need this, run 'pip install pandas==1.2.1'
    from utils.dataset import ReflexiveDataset

    sentences_splits = [ReflexiveDataset.sentences_from_tsv(sentences_dir, dataset_type, confidence_thrd,
                                                            use_context=True)
                        for dataset_type in ["train", "val", "test"]]
    out_vertical = pd.DataFrame.from_records([s.to_dict() for s in chain(*sentences_splits)])
    return out_vertical


def split_text_to_sentence_context(text: str, sep_chars: Tuple[str] = (".", "?", "!")) -> List[Tuple[str, str]]:
    """
    Splits the input text to sentences with the corresponding context,
    in the format compliant with the training of NeuralClassifier
    :param text: Full input paragraph, e.g. whole reflective diary, to extract the sentences to classify
    :param sep_chars: characters separating potential sentences
    """
    out_sentences = []
    current_sent = []
    words = text.split()

    for w_i, word in enumerate(words):
        current_sent.append(word)
        is_last_or_is_upper = (w_i == len(words)-1 or words[w_i+1][0].isupper())
        if any(word.endswith(mark) for mark in sep_chars) and is_last_or_is_upper:
            out_sentences.append(" ".join(current_sent))
            current_sent = []

    for sent_i, sent in enumerate(out_sentences):
        context = " ".join(out_sentences[sent_i-2:sent_i+2])
        yield sent, context


# dataset

In [2]:
import os
from enum import Enum
from pathlib import Path
from typing import List, Union, Dict
import ast

from filelock import FileLock
from torch.utils.data import Dataset
from transformers import AutoTokenizer, PreTrainedTokenizer, InputFeatures, logging
import pandas as pd
# from .data_reader import Sentence

logger = logging.get_logger()

# original labels:
# LABELS = ["O", "OS_PRES", "PERS", "POC", "P_ZK", "REF_P", "UV_OBT", "VY_IN", "VY_VY"]

LABELS = ["Other", "Belief", "Perspective", "Feeling", "Experience",
          "Reflection", "Difficulty", "Intention", "Learning"]


class Split(Enum):
    train = "train"
    eval = "eval"
    test = "test"


class ReflexiveDataset(Dataset):
    
    def __init__(self, sentences_dir: str, dataset_type: str, cache_dir: str, label_list: List[str],
                 tokenizer: Union[AutoTokenizer, PreTrainedTokenizer],
                 use_context=True, mean_confidence_threshold: int = 5):
        self.sentences_dir = sentences_dir
        self.confidence_thrd = mean_confidence_threshold
        self.dataset_type = dataset_type
        self.tokenizer = tokenizer
        self.label_list = label_list
        self.label_map = {label: i for i, label in enumerate(label_list)}
        self.use_context = use_context

        if not Path(cache_dir).exists():
            Path(cache_dir).mkdir(parents=True, exist_ok=True)

        cached_features_file = os.path.join(
            cache_dir,
            "cached_{}_{}_{}".format(dataset_type, tokenizer.__class__.__name__, str(self.tokenizer.model_max_length)),
        )
        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):
            logger.info(f"Creating features from reflexive diaries")
            self.sentences = self.sentences_from_tsv(sentences_dir, dataset_type, self.confidence_thrd, self.use_context)
            self.features = self.convert_examples_to_features(self.sentences)

    @staticmethod
    def sentences_from_tsv(sentences_dir: str, dataset_type: str,
                           confidence_thrd: int, use_context: bool) -> List[Sentence]:
        """Creates sentences for the training, eval and test sets."""
        tsv_path = os.path.join(sentences_dir, dataset_type, "sentences.tsv")
        df = pd.read_csv(tsv_path, sep='\t')
        df.sentence = df.sentence.fillna("")
        df.context = df.context.fillna("")
        sentences = []
        # group by sources, iterate every group separately, to avoid context overlays
        for idx, row in enumerate(df.itertuples()):
            confidences = ast.literal_eval(row.confidence)
            if sum(confidences) / len(confidences) >= confidence_thrd:
                sentences.append(Sentence(id=row.idx, text=row.sentence,
                                          context=row.context if use_context else None,
                                          label=row.y))
        logger.info("Retrieving %s of all %s %s sentences, over threshold %s" %
                    (len(sentences), len(df), dataset_type, confidence_thrd))
        return sentences
    
    def convert_examples_to_features(self, examples: List[Sentence]) -> List[Dict[str, List[int]]]:

        batch_encoding = self.tokenizer(
            text=[example.text.strip() for example in examples],
            text_pair=[e.context.strip() for e in examples] if self.use_context else None,
            padding="max_length",
            truncation=True,
        )

        features = []
        for i in range(len(examples)):
            inputs = {k: batch_encoding[k][i] for k in batch_encoding if k != "token_type_ids"}
            inputs["label"] = self.label_map[examples[i].label]
            features.append(inputs)

        for i, example in enumerate(examples[:5]):
            logger.info("*** Example ***")
            logger.info("id: %s" % (example.id))
            logger.info("features: %s" % features[i])

        return features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, i) -> InputFeatures:
        return self.features[i]


# shallow_classifier

In [3]:
from typing import List, Optional
from gensim import corpora
from gensim import matutils

import numpy as np

#from .utils.dataset import ReflexiveDataset
#from .utils.data_reader import Sentence


class ShallowClassifier:
    word_dictionary = None

    def __init__(self, classifier, use_context: bool, bow_size: int,
                 sentences_dir: Optional[str] = None, lang: str = "cze"):
        self.classifier = classifier
        self.sentences_dir = sentences_dir
        self.use_context = use_context
        self.bow_size = bow_size
        self.lang = lang

    def _preprocess_string(self, text: str) -> List[str]:
        from gensim.parsing import preprocess_string
        if self.lang == "cze":
            from utils.cs_stemmer import cz_stem
            return [cz_stem(word) for word in preprocess_string(text)]
        else:
            return preprocess_string(text)

    def _initialize_bow_model(self, sents: List[Sentence]):
        text_preprocessed = [self._preprocess_string(str(s.text)) for s in sents]
        contexts_preprocessed = [self._preprocess_string(str(s.context)) for s in sents]

        self.word_dictionary = corpora.Dictionary(text_preprocessed + contexts_preprocessed)
        # keep most-occurring 10k words
        # we need to check this with Ullmann
        self.word_dictionary.filter_extremes(keep_n=self.bow_size)

    def _vectorize_sentences(self, sents: List[Sentence]):
        text_preprocessed = [self._preprocess_string(s.text) for s in sents]
        # sparse matrix contains just pairs of co-occurrences
        sparse_matrix = [self.word_dictionary.doc2bow(t) for t in text_preprocessed]
        # we want to get natural, dense vectors for each document, containing the most-frequent num_terms
        dense_matrix = matutils.corpus2dense(sparse_matrix, num_terms=self.bow_size).transpose()
        if not self.use_context:
            return dense_matrix
        else:
            # the same for contextual vectors
            text_preprocessed_c = [self._preprocess_string(str(s.context)) for s in sents]
            # sparse matrix contains just pairs of co-occurrences
            sparse_matrix_c = [self.word_dictionary.doc2bow(t) for t in text_preprocessed_c]
            # we want to get natural, dense vectors for each document, containing the most-frequent num_terms
            dense_matrix_c = matutils.corpus2dense(sparse_matrix_c, num_terms=self.bow_size).transpose()

            # concat textual and contextual vectors horizontally
            return np.hstack([dense_matrix, dense_matrix_c])

    def train(self, in_sentences: List[Sentence] = None, confidence_thrd: int = 5):
        if in_sentences is None:
            # get the dataset from outside
            sentences = ReflexiveDataset.sentences_from_tsv(self.sentences_dir, "train",
                                                            confidence_thrd, self.use_context)
        else:
            # user gets the dataset himself
            sentences = in_sentences

        self._initialize_bow_model(sentences)
        vectors = self._vectorize_sentences(sentences)
        self.classifier.fit(vectors, [s.label for s in in_sentences])

    def predict(self, sentences: List[Sentence]):
        vectors = self._vectorize_sentences(sentences)
        targets = self.classifier.predict(vectors)
        return targets




# train_eval_shallow_classifier

In [4]:
#from reflection_classification.utils.dataset import ReflexiveDataset
#from reflection_classification.shallow_classifier import ShallowClassifier

from sklearn.metrics import f1_score, classification_report
#import argparse


if __name__ == "__main__":
    
    # 하이퍼 파라미터 
    classifier = 'random_forrest' # 분류기 선택 
    sentences_dir = 'reflection-classification/data/sentences/en' # 데이터 위치 
    train_confidence_threshold = 5
    test_confidence_threshold = 5
    use_context = True
    vocabulary_size = 800
    language ='en'
    
    # 분류기 설정 
    if classifier == 'random_forrest':
        from sklearn.ensemble import RandomForestClassifier
        classifier = RandomForestClassifier()
    elif classifier == 'logistic_regression':
        from sklearn.linear_model import LogisticRegression
        classifier = LogisticRegression(max_iter=10e4)
    elif classifier == 'naive_bayes':
        from sklearn.naive_bayes import MultinomialNB
        classifier = MultinomialNB()
    elif classifier == 'support_vector_classifier':
        from sklearn.svm import SVC
        classifier = SVC()
    else:
        raise ValueError("Unrecognized classifier: %s" % classifier)

    train_sentences = ReflexiveDataset.sentences_from_tsv(sentences_dir, "train",
                                                          train_confidence_threshold, use_context)
    test_sentences = ReflexiveDataset.sentences_from_tsv(sentences_dir, "test",
                                                         test_confidence_threshold, use_context)

    cfr = ShallowClassifier(classifier=classifier, use_context=use_context, bow_size=vocabulary_size,
                            lang=language)
    cfr.train(train_sentences)
    pred_targets = cfr.predict(test_sentences)
    true_targets = [s.label for s in test_sentences]
    objective_val = f1_score(true_targets, pred_targets, average='micro')
    print("Evaluating on %s sentences" % len(test_sentences))
    print("Classification report: \n%s" % classification_report(true_targets, pred_targets))
    print(objective_val)


Evaluating on 149 sentences
Classification report: 
              precision    recall  f1-score   support

      Belief       0.00      0.00      0.00         5
Difficulties       0.00      0.00      0.00         6
  Difficulty       0.00      0.00      0.00         0
  Experience       0.67      0.13      0.22        31
     Feeling       0.91      0.79      0.85        38
    Learning       0.00      0.00      0.00         4
       Other       0.52      0.92      0.67        59
  Reflection       0.80      0.67      0.73         6

    accuracy                           0.62       149
   macro avg       0.36      0.31      0.31       149
weighted avg       0.61      0.62      0.55       149

0.6174496644295302


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# train_utils

In [5]:
#from .dataset import ReflexiveDataset
from dataclasses import dataclass, field
from transformers import EvalPrediction, AutoTokenizer
from sklearn.metrics import f1_score
import numpy as np
from typing import Dict, List


def eval_fscore_acc(p: EvalPrediction) -> Dict:
    preds = np.argmax(p.predictions, axis=1)
    acc = (preds == p.label_ids).mean()
    f1 = f1_score(y_true=p.label_ids, y_pred=preds, average='micro')
    return {
        "acc": acc,
        "f1": f1,
        "acc_and_f1": (acc + f1) / 2,
    }


def get_datasets(tokenizer: AutoTokenizer, sentences_dir: str, label_list: List[str],
                 cache_dir: str, use_context: bool, confidence_thrd: int) -> List[ReflexiveDataset]:
    return [ReflexiveDataset(sentences_dir, tokenizer=tokenizer, dataset_type=dataset_type, use_context=use_context,
                             cache_dir=cache_dir, label_list=label_list, mean_confidence_threshold=confidence_thrd)
            for dataset_type in ["train", "val", "test"]]


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/transformer_config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )


# train_neural_classifier 

In [25]:
import argparse

import torch
from tqdm import tqdm
from transformers import AutoConfig, AutoModelForSequenceClassification, Trainer
from transformers import (
    TrainingArguments,
    set_seed,
    EarlyStoppingCallback
)

#from reflection_classification.utils.train_utils import *
#from reflection_classification.utils.dataset import LABELS

# en_gtranslate thrd 5
# Test accuracy: 0.7911392405063291

if __name__ == "__main__":
    # run this from /reflection-classification/reflection_classification
    argparser = argparse.ArgumentParser()
    
    model_name = 'bert-base-multilingual-cased'
    sentences_dir = 'reflection-classification/data/sentences/en'
    trained_model_dir = 'reflection-classification/models/bert-base-en-confidence6+ \ '
    train_confidence_threshold=6
    device = 'cuda' # cuda 환경을 만들어야 함 
    eval_on_test_set = True
    
    argparser.add_argument('--model_name', type=str, help='Model name, or local path to finetune.',
                           required=True)
    argparser.add_argument('--sentences_dir', type=str, help='Directory with .tsvs of annotated sentences',
                           required=True)
    argparser.add_argument('--train_confidence_threshold', type=int,
                           help='Minimal confidence threshold for sentences to train on.',
                           default=5)
    argparser.add_argument('--trained_model_dir', type=str, help='Directory to be filled with trained model',
                           required=True)
    argparser.add_argument('--device', type=str, help='Device used for training. One of {cpu, cuda, cuda:[idx]}',
                           required=True, default="cuda")
    argparser.add_argument('--eval_on_test_set', type=bool, default=True,
                           help='Whether to evaluate model (having lowest eval loss) on test set')
    argparser.add_argument('--use_context', type=bool, help='Whether the model will be trained using context.',
                           default=True)
    

    model_args = ModelArguments(
        model_name_or_path=model_name,
    )

    transformer_config = AutoConfig.from_pretrained(
        model_args.model_name_or_path,
        num_labels=len(LABELS),
        finetuning_task="classification",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, use_fast=False)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        config=transformer_config,
    ).to(device)

    train_dataset, val_dataset, test_dataset = get_datasets(tokenizer,
                                                            cache_dir=trained_model_dir,
                                                            label_list=LABELS,
                                                            sentences_dir=sentences_dir,
                                                            use_context=use_context,
                                                            confidence_thrd=train_confidence_threshold)
    training_args = TrainingArguments(output_dir=trained_model_dir,
                                      overwrite_output_dir=True,
                                      do_train=True,
                                      do_eval=True,
                                      do_predict=True,
                                      per_device_train_batch_size=2,
                                      per_device_eval_batch_size=2,
                                      num_train_epochs=20,
                                      warmup_steps=300,
                                      logging_steps=50,
                                      logging_first_step=True,
                                      evaluation_strategy="steps",
                                      learning_rate=2e-5,
                                      save_total_limit=16,
                                      gradient_accumulation_steps=16,
                                      load_best_model_at_end=True,
                                      no_cuda=True if device == "cpu" else False,
                                      metric_for_best_model="f1")

    set_seed(training_args.seed)

    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_dataset,
                      eval_dataset=val_dataset,
                      compute_metrics=eval_fscore_acc,
                      tokenizer=tokenizer,
                      callbacks=[EarlyStoppingCallback(early_stopping_patience=10)])

    trainer.train()

    if eval_on_test_set:
        y_pred = [trainer.model(
            **{k: torch.tensor(v).unsqueeze(0).to(trainer.model.device) for k, v in f.items() if k != 'label'},
            return_dict=True).logits.argmax().item() for f in tqdm(test_dataset.features,
                                                                   desc="Evaluating best model on test dataset")]

        y_trues = [f['label'] for f in test_dataset.features]

        y_truepos = [y_trues[i] == y_pred[i] for i, _ in enumerate(y_pred)]

        print("Test accuracy: %s" % (sum(y_truepos) / len(y_truepos)))

    trainer.save_model(trained_model_dir)
    print("Trained model and training checkpoints are saved to %s" % trained_model_dir)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AssertionError: Torch not compiled with CUDA enabled

# neural_classifier

In [13]:
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
#from .utils.dataset import LABELS


class NeuralClassifier:

    def __init__(self, model_path: str, uses_context: bool, device: str):
        self.config = AutoConfig.from_pretrained(model_path)
        self.device = device
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path, config=self.config).to(device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.uses_context = uses_context

    def predict_sentence(self, sentence: str, context: str = None):
        if context is None and self.uses_context:
            raise ValueError("You need to pass in context argument, including the sentence")

        features = self.tokenizer(sentence, text_pair=context,
                                  padding="max_length", truncation=True, return_tensors='pt')
        outputs = self.model(**features.to(self.device), return_dict=True)
        argmax = outputs.logits.argmax(dim=-1).detach().cpu().tolist()[0]
        labels = LABELS[argmax]

        return labels



# eval_neural_classifier

In [24]:
import argparse
from tqdm import tqdm

#from reflection_classification.utils.dataset import ReflexiveDataset
#from reflection_classification.neural_classifier import NeuralClassifier

if __name__ == "__main__":
    #argparser = argparse.ArgumentParser()
    
    sentences_dir='reflection-classification/data/sentences/en'
    trained_model_dir='reflection-classification/models/roberta-large-nouda'
    test_confidence_threshold=5
    device = 'cuda'
    use_context = True
    

    classifier = NeuralClassifier(trained_model_dir, use_context, device)
    test_sentences = ReflexiveDataset.sentences_from_tsv(sentences_dir, "test",
                                                         test_confidence_threshold, use_context)

    y_pred = [classifier.predict_sentence(sentence.text, sentence.context) for sentence in tqdm(test_sentences)]

    y_trues = [sentence.label for sentence in test_sentences]

    y_truepos = [y_trues[i] == y_pred[i] for i, _ in enumerate(y_pred)]

    print("Test accuracy: %s" % (sum(y_truepos) / len(y_truepos)))


AssertionError: Torch not compiled with CUDA enabled