In [1]:
import torch
from torch.utils.data import Dataset, Subset
from torch.utils.data import DataLoader
from torch import nn
from transformers import RobertaTokenizer, RobertaModel
import pandas as pd
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import torch
from torch.utils.data import Dataset, Subset
from torch.utils.data import DataLoader
from torch import nn
from transformers import RobertaTokenizer, RobertaModel
import pandas as pd
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer

"""
# 从 sentence embedding. 2. 对比 k-means 效果 3. 看一看sentiment analyze效果
# 3. 目的：没有label并且任务复杂继续数据清洗（entertaining, jokes）
# 4. todo... text style detection?
# 5. 区分是不是一个人所写的内容：plagirithm detection
# 6. 还有什么别的方向？

"""


class TextDataset(Dataset):
    """Text anomaly-detection dataset built from a CSV file.

    The CSV is read with pandas and must provide a ``text`` column and a
    ``label`` column. Labels are binarised (0: normal, 1: outlier), the rows
    are split 80/20 into train and test partitions (fixed seed), and each
    partition is converted to tensors according to ``method``:

    * ``'tokenizer'``: output of the RoBERTa tokenizer, kept as returned.
    * ``'model-embedding'``: per-token ``last_hidden_state`` of ``roberta-large``.
    * ``'sentence-embedding'``: one ``all-MiniLM-L6-v2`` vector per text.

    Indexing and ``len()`` operate on the training partition only; the test
    partition is available as ``self.test_set``.
    """

    SUPPORTED_METHODS = ('tokenizer', 'model-embedding', 'sentence-embedding')

    # Default mini-batch size for the 'model-embedding' forward pass.
    batch_size = 32

    def __init__(self, filepath, normal_class=0, method='tokenizer', batch_size=32):
        """Load the CSV, binarise labels, split, and build both partitions.

        Args:
            filepath: Path of the CSV file to read.
            normal_class: Raw label value (or a list/tuple/set of values)
                regarded as normal; every other raw label is an outlier.
            method: One of ``SUPPORTED_METHODS``.
            batch_size: Number of texts per forward pass when
                ``method='model-embedding'``.

        Raises:
            ValueError: If ``method`` is not supported or ``batch_size < 1``.
        """
        super().__init__()

        # Fail fast, before any (large) pretrained weights are loaded.
        self._check_method(method)
        if batch_size < 1:
            raise ValueError('batch_size must be a positive integer')
        self.batch_size = batch_size

        # define device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Accept a single label or a collection of labels as "normal".
        if isinstance(normal_class, (list, tuple, set, frozenset)):
            self.normal_classes = tuple(normal_class)
        else:
            self.normal_classes = (normal_class,)
        self.n_classes = 2  # 0: normal, 1: outlier

        # Load the dataset and map raw labels onto the 0/1 convention above.
        df = pd.read_csv(filepath)
        df['label'] = df['label'].apply(self._to_binary_label)

        # Tokenizer and encoder used by 'tokenizer' / 'model-embedding'.
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
        self.model = RobertaModel.from_pretrained('roberta-large').to(self.device)

        # Split the dataset into train and test
        self.train_df, self.test_df = train_test_split(df, test_size=0.2, random_state=42)

        # Encoder used by 'sentence-embedding'.
        self.sentence_embedding_model = SentenceTransformer('all-MiniLM-L6-v2').to(self.device)

        # Create datasets
        self.train_set = self.create_dataset(self.train_df, method=method)
        self.test_set = self.create_dataset(self.test_df, method=method)

    @classmethod
    def _check_method(cls, method):
        """Raise ``ValueError`` unless ``method`` is one of ``SUPPORTED_METHODS``."""
        if method not in cls.SUPPORTED_METHODS:
            raise ValueError('Invalid method: choose either "tokenizer", "model-embedding" or "sentence-embedding"')

    def _to_binary_label(self, label):
        """Return 0 if ``label`` is a normal class, otherwise 1 (outlier)."""
        return int(label not in self.normal_classes)

    def _model_embeddings(self, texts, batch_size):
        """Return RoBERTa ``last_hidden_state`` for ``texts``, computed in mini-batches.

        All texts are tokenized together so that padding is shared across the
        whole partition; only the forward pass is chunked, which bounds the
        activation memory of each pass.
        """
        if batch_size < 1:
            raise ValueError('batch_size must be a positive integer')

        encoded = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        hidden_chunks = []
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                stop = start + batch_size
                batch = {key: val[start:stop].to(self.device) for key, val in encoded.items()}
                hidden_chunks.append(self.model(**batch).last_hidden_state)
        return torch.cat(hidden_chunks, dim=0)

    def create_dataset(self, df, method='tokenizer', batch_size=None):
        """Convert a dataframe partition into a dict of inputs and labels.

        Args:
            df: DataFrame with ``text`` and ``label`` columns.
            method: One of ``SUPPORTED_METHODS``.
            batch_size: Mini-batch size for ``'model-embedding'``; defaults to
                ``self.batch_size`` when ``None``.

        Returns:
            ``{'inputs': ..., 'labels': tensor}``. ``inputs`` is a mapping that
            always contains the key ``'input_ids'``; for the two embedding
            methods that key holds the embeddings rather than token ids.

        Raises:
            ValueError: If ``method`` is not supported.
        """
        self._check_method(method)

        # Convert texts and labels into plain lists
        texts = df['text'].tolist()
        labels = df['label'].tolist()

        if method == 'tokenizer':
            # Tokenize the texts; the result is left on whatever device the tokenizer returns it on.
            inputs = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        elif method == 'model-embedding':
            # Tokenize the texts and convert them to per-token embeddings
            if batch_size is None:
                batch_size = self.batch_size
            inputs = {'input_ids': self._model_embeddings(texts, batch_size)}
        else:
            # 'sentence-embedding': one vector per text
            with torch.no_grad():
                embeddings = self.sentence_embedding_model.encode(texts, convert_to_tensor=True)
            inputs = {'input_ids': embeddings}

        # Return a dictionary with inputs and labels
        return {'inputs': inputs, 'labels': torch.tensor(labels).to(self.device)}

    def __getitem__(self, index):
        """Return ``(inputs, label, index)`` for one training sample."""
        inputs = {key: val[index] for key, val in self.train_set['inputs'].items()}
        label = self.train_set['labels'][index]
        return inputs, label, index

    def __len__(self):
        """Return the number of samples in the training partition."""
        return len(self.train_set['inputs']['input_ids'])


In [5]:
# Build a TextDataset from the CSV at the path below, using method='sentence-embedding'
# (one SentenceTransformer vector per text). Rows whose raw label equals normal_class=0
# are treated as the normal class.
dataset_sentence_embedding = TextDataset(filepath='../data/text_demo/processed_shuffled_outliers_sport.csv', normal_class=0, method='sentence-embedding')

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
# __getitem__ returns (inputs, label, index); [0] selects the inputs dict of training sample 6.
# With 'sentence-embedding' the 'input_ids' entry holds that sample's embedding vector.
dataset_sentence_embedding[6][0]['input_ids'].shape

torch.Size([384])

In [7]:
# Same CSV and normal class as above, but with method='model-embedding':
# every text is tokenized and passed through RobertaModel, keeping last_hidden_state.
dataset_model_embedding = TextDataset(filepath='../data/text_demo/processed_shuffled_outliers_sport.csv', normal_class=0, method='model-embedding')

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
# Inputs dict of training sample 7; with 'model-embedding' the 'input_ids' entry holds
# the per-token hidden states (a 2-D tensor, per the recorded output below).
dataset_model_embedding[7][0]['input_ids'].shape

torch.Size([27, 1024])