In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Dummy dataset with codes and multiple sub-codes
data = {'Text': ['qualitative data analysis involves organizing and categorizing data',
                 'coding is an essential activity of the qualitative research process',
                 'thematic coding helps identify patterns and themes within the data',
                 'axial coding explores relationships between codes',
                 'selective coding involves refining and integrating categories',
                 'coding procedure is a vital and intricate aspect of qualitative research'],
        'Category': ['Open Coding', 'Open Coding', 'Thematic Coding', 'Axial Coding', 'Selective Coding', 'Coding Procedure'],
        'SubCategory': [['Exploration', 'Analysis'], ['Process', 'Execution'], ['Pattern Identification', 'Theme Extraction'], ['Relationship Exploration'], ['Refinement', 'Integration'], ['Execution', 'Analysis']]}
df = pd.DataFrame(data)

In [5]:
# Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_df.head()

Unnamed: 0,Text,Category,SubCategory
5,coding procedure is a vital and intricate aspe...,Coding Procedure,"[Execution, Analysis]"
2,thematic coding helps identify patterns and th...,Thematic Coding,"[Pattern Identification, Theme Extraction]"
4,selective coding involves refining and integra...,Selective Coding,"[Refinement, Integration]"
3,axial coding explores relationships between codes,Axial Coding,[Relationship Exploration]


In [6]:
# Load a pre-trained language model and tokenizer
model_name = "bert-base-uncased"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(df['Category'].unique()))


tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 6.34kB/s]
config.json: 100%|██████████| 570/570 [00:00<00:00, 298kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 876kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.46MB/s]
model.safetensors: 100%|██████████| 440M/440M [01:14<00:00, 5.90MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# Tokenize and prepare data for the language model
class CustomDataset(Dataset):
    def __init__(self, texts, categories, sub_categories, tokenizer, max_len=128):
        self.texts = texts
        self.categories = categories
        self.sub_categories = sub_categories
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        category = self.categories[idx]
        sub_category = self.sub_categories[idx]

        inputs = self.tokenizer(text, return_tensors='pt', max_length=self.max_len, truncation=True)
        print(category)
        print(sub_category)
        inputs['labels'] = torch.tensor(self.label2id([category]) + self.label2id(sub_category[:2]), dtype=torch.long)

        return inputs
    
    def label2id(self, label):
        ids = [i for i, c in enumerate(label)]
        return ids#self.categories.index(label)

In [37]:
train_dataset = CustomDataset(train_df['Text'], train_df['Category'], train_df['SubCategory'], tokenizer)
val_dataset = CustomDataset(val_df['Text'], val_df['Category'], val_df['SubCategory'], tokenizer)

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)


In [38]:
# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)


In [39]:
for epoch in range(1):  
    model.train()
    for batch in train_loader:
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # model.eval()
    # all_preds = []
    # all_labels = []
    # with torch.no_grad():
    #     for batch in val_loader:
    #         inputs = {k: v.to(device) for k, v in batch.items()}
    #         outputs = model(**inputs)
    #         logits = outputs.logits
    #         preds = torch.argmax(logits, dim=1).cpu().numpy()
    #         labels = inputs['labels'][:, 0].cpu().numpy()
    #         all_preds.extend(preds)
    #         all_labels.extend(labels)

    # val_accuracy = sum([p == l for p, l in zip(all_preds, all_labels)]) / len(all_preds)
    # print(f"Epoch {epoch + 1}, Validation Accuracy: {val_accuracy:.4f}")


KeyError: 1