In [1]:
dstype = 'dbp' 
mname = 'debertaV3'

In [2]:
import pandas as pd

In [3]:
# %%
# %%
import torch
import torchvision

# Check PyTorch version
print("PyTorch version:", torch.__version__)

# Check if CUDA is available (GPU support)
print("CUDA available:", torch.cuda.is_available())

# Check the number of GPUs
print("Number of GPUs:", torch.cuda.device_count())



# %%-----------------------------------++++++++++++++++++++++++++---------------------------------------

# %%
# %%
modelpath = 'microsoft/deberta-v3-base'
# modelpath = "bert-base-uncased"


datapath = None
saveDIR = f"/home/bhairavi/om/om5/{dstype}/{mname}_{dstype}"
print(saveDIR)
# %%


PyTorch version: 2.4.0+cu121
CUDA available: True
Number of GPUs: 1
/home/bhairavi/om/om5/dbp/debertaV3_dbp


In [4]:
modelpath  = saveDIR

In [5]:


# %%
from datasets import load_dataset

dataset = load_dataset("DeveloperOats/DBPedia_Classes", name='default' )


# %%
dataset


  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'l1', 'l2', 'l3'],
        num_rows: 240942
    })
    validation: Dataset({
        features: ['text', 'l1', 'l2', 'l3'],
        num_rows: 36003
    })
    test: Dataset({
        features: ['text', 'l1', 'l2', 'l3'],
        num_rows: 60794
    })
})

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'l1', 'l2', 'l3'],
        num_rows: 240942
    })
    validation: Dataset({
        features: ['text', 'l1', 'l2', 'l3'],
        num_rows: 36003
    })
    test: Dataset({
        features: ['text', 'l1', 'l2', 'l3'],
        num_rows: 60794
    })
})

In [7]:
train_df = pd.DataFrame(dataset['train'])
test_df = pd.DataFrame(dataset['test'])
validation_df = pd.DataFrame(dataset['validation'])

In [8]:
train_df['split'] = 'train'

test_df['split'] = 'test'

validation_df['split'] = 'validation'

In [9]:
df = pd.concat([train_df, test_df, validation_df], ignore_index=True)

In [10]:
df.shape

(337739, 5)

In [11]:
df.columns

Index(['text', 'l1', 'l2', 'l3', 'split'], dtype='object')

In [12]:
 

# %%
df['l3'].nunique()

# %%


219

In [13]:

# %%
df['label'] = df['l3']


In [14]:

# %%
from sklearn.preprocessing import LabelEncoder

# %%
le = LabelEncoder()
df['target'] = le.fit_transform(df['label'])


In [15]:

# %%

# %%
df.columns

# %%


Index(['text', 'l1', 'l2', 'l3', 'split', 'label', 'target'], dtype='object')

In [16]:
df.shape

# %%


(337739, 7)

In [17]:

  
 
import os
import torch 
os.environ["CUDA_VISIBLE_DEVICES"] = "2" 

torch.cuda.empty_cache() 

import pandas as pd
from datasets import Dataset, DatasetDict, load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
 

# %%
numlabel = df['target'].nunique()
numlabel


219

In [18]:


# %%

# %%
from transformers import AutoTokenizer, AutoModelForSequenceClassification

device = "cuda"  # the device to load the model onto

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(modelpath)
model = AutoModelForSequenceClassification.from_pretrained(modelpath, num_labels=numlabel)

# Move the model to the specified device
model.to(device)


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine

In [19]:


# %%
# df['token_length'] = df['text'].apply(lambda x: len(tokenizer.tokenize(x)))

# # Calculate the maximum token length
# max_length = df['token_length'].max()
# Calculate the token length using the split method
df['token_length'] = df['text'].apply(lambda x: len(x.split()))

# Calculate the maximum token length
max_length = df['token_length'].max()



In [20]:
max_length

499

In [21]:

# Calculate the next maximum token length
next_max_token_length = df['token_length'].nlargest(2).iloc[1] 


# Calculate the average token length
average_token_length = df['token_length'].mean()

# Display the results
print(f"Maximum token length: {max_length}")
print(f"Next maximum token length: {next_max_token_length}") 
print(f"Average token length: {average_token_length:.2f}")

# %%
min(df['token_length'])


Maximum token length: 499
Next maximum token length: 499
Average token length: 102.76


11

In [22]:

# %%
fdf = df[df['token_length'] == 5]

# %%
fdf

# %%
df = df[df['token_length'] >= 5]

# %%
df.shape
 


(337739, 8)

In [23]:
df.columns

Index(['text', 'l1', 'l2', 'l3', 'split', 'label', 'target', 'token_length'], dtype='object')

In [24]:
train_df = df[df['split'] == 'train'].drop(columns=['split'])

test_df = df[df['split'] == 'test'].drop(columns=['split'])

val_df = df[df['split'] == 'validation'].drop(columns=['split'])

In [25]:
train_df.shape, test_df.shape, val_df.shape

((240942, 7), (60794, 7), (36003, 7))

In [26]:

def tokenize_and_format(examples):
    # Tokenize the texts
    tokenized_inputs = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=max_length)
    tokenized_inputs['label'] = list(map(int, examples['target']))
    return tokenized_inputs





# %%
# Convert pandas DataFrame to Hugging Face's Dataset
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(val_df) 
test_dataset = Dataset.from_pandas(test_df)

# Map the tokenization function across the datasets
train_dataset = train_dataset.map(tokenize_and_format, batched=True,batch_size=16)
eval_dataset = eval_dataset.map(tokenize_and_format, batched=True,batch_size=16) 
test_dataset = test_dataset.map(tokenize_and_format, batched=True,batch_size=16)


# %%
 

# %%
from sklearn.metrics import f1_score, precision_score, recall_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='weighted')
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    return {
        'eval_f1': f1,
        'eval_precision': precision,
        'eval_recall': recall,
    }



Map: 100%|██████████| 240942/240942 [02:29<00:00, 1616.33 examples/s]
Map: 100%|██████████| 36003/36003 [00:22<00:00, 1598.12 examples/s]
Map: 100%|██████████| 60794/60794 [00:38<00:00, 1596.06 examples/s]


In [27]:
test_df = test_df.reset_index(drop=True)

In [28]:
df

Unnamed: 0,text,l1,l2,l3,split,label,target,token_length
0,"William Alexander Massey (October 7, 1856 – Ma...",Agent,Politician,Senator,train,Senator,185,251
1,Lions is the sixth studio album by American ro...,Work,MusicalWork,Album,train,Album,4,204
2,"Pirqa (Aymara and Quechua for wall, hispaniciz...",Place,NaturalPlace,Mountain,train,Mountain,132,56
3,Cancer Prevention Research is a biweekly peer-...,Work,PeriodicalLiterature,AcademicJournal,train,AcademicJournal,0,52
4,The Princeton University Chapel is located on ...,Place,Building,HistoricBuilding,train,HistoricBuilding,98,252
...,...,...,...,...,...,...,...,...
337734,The Great Pershing Balloon Derby is a hot air ...,Event,SocietalEvent,Convention,validation,Convention,56,90
337735,Microsystems was a personal computing magazine...,Work,PeriodicalLiterature,Magazine,validation,Magazine,118,118
337736,The 1899 Open Championship was the 39th Open C...,Event,Tournament,GolfTournament,validation,GolfTournament,89,411
337737,Kristina Repelewska (born 7 January 1981) is a...,Agent,Athlete,HandballPlayer,validation,HandballPlayer,95,37


In [29]:
 
# import pandas as pd
# import numpy as np
# import submodlib
# from datasets import Dataset 


# def select_representative_samples(tokenized_data, num_samples_per_class):
#     selected_indices = []
     
#     for class_label in set(tokenized_data['label']):
#         print(f"Processing class: {class_label}")
#         class_indices = [i for i, label in enumerate(tokenized_data['label']) if label == class_label]
#         X_class = np.array(tokenized_data['input_ids'])[class_indices]
 
#         similarity_kernel = np.dot(X_class, X_class.T)
 
#         facility_location_function = submodlib.FacilityLocationFunction(n=len(X_class), mode="dense", sijs=similarity_kernel)

         
#         selected = facility_location_function.maximize(budget=num_samples_per_class, optimizer='NaiveGreedy')
 
#         selected_indices.extend([class_indices[i] for i in selected])

#     return selected_indices
 
 
# num_samples_per_class = 1000 // len(set(test_dataset['label']))   
# selected_indices = select_representative_samples(test_dataset, num_samples_per_class)

 
# test_dataset_subset = test_dataset.select(selected_indices)

# print(test_dataset_subset)


In [30]:
print("""Data Preparation:

The text data is first extracted and then transformed into numerical features using TF-IDF vectorization. This process converts the text into a matrix of features where each feature represents the importance of a word in the document relative to the entire corpus.
Clustering for Representative Sample Selection:

The goal is to select a representative subset of samples for each class. To achieve this, the data is first clustered using the K-Means algorithm, which groups the data into a specified number of clusters.
Selecting Closest Points to Cluster Centroids:

For each cluster, the data points closest to the centroid (center) of the cluster are identified. These points are considered the most representative samples of the data within that cluster.""")

Data Preparation:

The text data is first extracted and then transformed into numerical features using TF-IDF vectorization. This process converts the text into a matrix of features where each feature represents the importance of a word in the document relative to the entire corpus.
Clustering for Representative Sample Selection:

The goal is to select a representative subset of samples for each class. To achieve this, the data is first clustered using the K-Means algorithm, which groups the data into a specified number of clusters.
Selecting Closest Points to Cluster Centroids:

For each cluster, the data points closest to the centroid (center) of the cluster are identified. These points are considered the most representative samples of the data within that cluster.


submodelib fro each class

In [31]:
# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
from apricot import FacilityLocationSelection
from datasets import Dataset

 

# Step 3: Define a function to select representative samples for each class using apricot
def select_representative_samples(tokenized_data, num_samples_per_class):
    selected_indices = []
    
    # Iterate over each unique class label
    for class_label in set(tokenized_data['label']):
        print(f"Processing class: {class_label}")
        class_indices = [i for i, label in enumerate(tokenized_data['label']) if label == class_label]
        X_class = np.array(tokenized_data['input_ids'])[class_indices]

        # Apply apricot's Facility Location Selection
        selector = FacilityLocationSelection(n_samples=num_samples_per_class, metric='euclidean', verbose=True)
        selector.fit(X_class)
        selected = selector.ranking

        # Collect the selected indices
        selected_indices.extend([class_indices[i] for i in selected])

    return selected_indices

 
 
num_samples_per_class = 1000 // len(set(test_dataset['label']))   
selected_indices = select_representative_samples(test_dataset, num_samples_per_class)

 
test_dataset_subset = test_dataset.select(selected_indices)

print(test_dataset_subset)



Processing class: 0


100%|██████████| 4.00/4.00 [00:00<00:00, 680it/s]


Processing class: 1


100%|██████████| 4.00/4.00 [00:00<00:00, 1.32kit/s]


Processing class: 2


100%|██████████| 4.00/4.00 [00:00<00:00, 677it/s]


Processing class: 3


100%|██████████| 4.00/4.00 [00:00<00:00, 716it/s]


Processing class: 4


100%|██████████| 4.00/4.00 [00:00<00:00, 727it/s]


Processing class: 5


100%|██████████| 4.00/4.00 [00:00<00:00, 3.46kit/s]


Processing class: 6


100%|██████████| 4.00/4.00 [00:00<00:00, 2.94kit/s]


Processing class: 7


100%|██████████| 4.00/4.00 [00:00<00:00, 683it/s]


Processing class: 8


100%|██████████| 4.00/4.00 [00:00<00:00, 672it/s]


Processing class: 9


100%|██████████| 4.00/4.00 [00:00<00:00, 5.08kit/s]


Processing class: 10


100%|██████████| 4.00/4.00 [00:00<00:00, 1.43kit/s]


Processing class: 11


100%|██████████| 4.00/4.00 [00:00<00:00, 876it/s]


Processing class: 12


100%|██████████| 4.00/4.00 [00:00<00:00, 754it/s]


Processing class: 13


100%|██████████| 4.00/4.00 [00:00<00:00, 861it/s]


Processing class: 14


100%|██████████| 4.00/4.00 [00:00<00:00, 535it/s]


Processing class: 15


100%|██████████| 4.00/4.00 [00:00<00:00, 2.87kit/s]


Processing class: 16


100%|██████████| 4.00/4.00 [00:00<00:00, 3.44kit/s]


Processing class: 17


100%|██████████| 4.00/4.00 [00:00<00:00, 707it/s]


Processing class: 18


100%|██████████| 4.00/4.00 [00:00<00:00, 3.83kit/s]


Processing class: 19


100%|██████████| 4.00/4.00 [00:00<00:00, 1.46kit/s]


Processing class: 20


100%|██████████| 4.00/4.00 [00:00<00:00, 723it/s]


Processing class: 21


100%|██████████| 4.00/4.00 [00:00<00:00, 590it/s]


Processing class: 22


100%|██████████| 4.00/4.00 [00:00<00:00, 2.48kit/s]


Processing class: 23


100%|██████████| 4.00/4.00 [00:00<00:00, 4.53kit/s]


Processing class: 24


100%|██████████| 4.00/4.00 [00:00<00:00, 701it/s]


Processing class: 25


100%|██████████| 4.00/4.00 [00:00<00:00, 5.38kit/s]


Processing class: 26


100%|██████████| 4.00/4.00 [00:00<00:00, 3.17kit/s]


Processing class: 27


100%|██████████| 4.00/4.00 [00:00<00:00, 625it/s]


Processing class: 28


100%|██████████| 4.00/4.00 [00:00<00:00, 1.17kit/s]


Processing class: 29


100%|██████████| 4.00/4.00 [00:00<00:00, 5.71kit/s]


Processing class: 30


100%|██████████| 4.00/4.00 [00:00<00:00, 809it/s]


Processing class: 31


100%|██████████| 4.00/4.00 [00:00<00:00, 5.20kit/s]


Processing class: 32


100%|██████████| 4.00/4.00 [00:00<00:00, 699it/s]


Processing class: 33


100%|██████████| 4.00/4.00 [00:00<00:00, 5.21kit/s]


Processing class: 34


100%|██████████| 4.00/4.00 [00:00<00:00, 5.03kit/s]


Processing class: 35


100%|██████████| 4.00/4.00 [00:00<00:00, 678it/s]


Processing class: 36


100%|██████████| 4.00/4.00 [00:00<00:00, 1.61kit/s]


Processing class: 37


100%|██████████| 4.00/4.00 [00:00<00:00, 1.44kit/s]


Processing class: 38


100%|██████████| 4.00/4.00 [00:00<00:00, 2.57kit/s]


Processing class: 39


100%|██████████| 4.00/4.00 [00:00<00:00, 5.79kit/s]


Processing class: 40


100%|██████████| 4.00/4.00 [00:00<00:00, 3.89kit/s]


Processing class: 41


100%|██████████| 4.00/4.00 [00:00<00:00, 3.14kit/s]


Processing class: 42


100%|██████████| 4.00/4.00 [00:00<00:00, 2.49kit/s]


Processing class: 43


100%|██████████| 4.00/4.00 [00:00<00:00, 1.57kit/s]


Processing class: 44


100%|██████████| 4.00/4.00 [00:00<00:00, 3.37kit/s]


Processing class: 45


100%|██████████| 4.00/4.00 [00:00<00:00, 3.12kit/s]


Processing class: 46


100%|██████████| 4.00/4.00 [00:00<00:00, 1.35kit/s]


Processing class: 47


100%|██████████| 4.00/4.00 [00:00<00:00, 722it/s]


Processing class: 48


100%|██████████| 4.00/4.00 [00:00<00:00, 4.77kit/s]


Processing class: 49


100%|██████████| 4.00/4.00 [00:00<00:00, 2.78kit/s]


Processing class: 50


100%|██████████| 4.00/4.00 [00:00<00:00, 717it/s]


Processing class: 51


100%|██████████| 4.00/4.00 [00:00<00:00, 1.61kit/s]


Processing class: 52


100%|██████████| 4.00/4.00 [00:00<00:00, 4.18kit/s]


Processing class: 53


100%|██████████| 4.00/4.00 [00:00<00:00, 761it/s]


Processing class: 54


100%|██████████| 4.00/4.00 [00:00<00:00, 639it/s]


Processing class: 55


100%|██████████| 4.00/4.00 [00:00<00:00, 2.51kit/s]


Processing class: 56


100%|██████████| 4.00/4.00 [00:00<00:00, 941it/s]


Processing class: 57


100%|██████████| 4.00/4.00 [00:00<00:00, 5.36kit/s]


Processing class: 58


100%|██████████| 4.00/4.00 [00:00<00:00, 2.64kit/s]


Processing class: 59


100%|██████████| 4.00/4.00 [00:00<00:00, 755it/s]


Processing class: 60


100%|██████████| 4.00/4.00 [00:00<00:00, 782it/s]


Processing class: 61


100%|██████████| 4.00/4.00 [00:00<00:00, 1.03kit/s]


Processing class: 62


100%|██████████| 4.00/4.00 [00:00<00:00, 2.44kit/s]


Processing class: 63


100%|██████████| 4.00/4.00 [00:00<00:00, 5.66kit/s]


Processing class: 64


100%|██████████| 4.00/4.00 [00:00<00:00, 2.18kit/s]


Processing class: 65


100%|██████████| 4.00/4.00 [00:00<00:00, 3.64kit/s]


Processing class: 66


100%|██████████| 4.00/4.00 [00:00<00:00, 601it/s]


Processing class: 67


100%|██████████| 4.00/4.00 [00:00<00:00, 704it/s]


Processing class: 68


100%|██████████| 4.00/4.00 [00:00<00:00, 3.28kit/s]


Processing class: 69


100%|██████████| 4.00/4.00 [00:00<00:00, 705it/s]


Processing class: 70


100%|██████████| 4.00/4.00 [00:00<00:00, 2.35kit/s]


Processing class: 71


100%|██████████| 4.00/4.00 [00:00<00:00, 1.76kit/s]


Processing class: 72


100%|██████████| 4.00/4.00 [00:00<00:00, 737it/s]


Processing class: 73


100%|██████████| 4.00/4.00 [00:00<00:00, 2.45kit/s]


Processing class: 74


100%|██████████| 4.00/4.00 [00:00<00:00, 3.87kit/s]


Processing class: 75


100%|██████████| 4.00/4.00 [00:00<00:00, 1.63kit/s]


Processing class: 76


100%|██████████| 4.00/4.00 [00:00<00:00, 2.60kit/s]


Processing class: 77


100%|██████████| 4.00/4.00 [00:00<00:00, 1.86kit/s]


Processing class: 78


100%|██████████| 4.00/4.00 [00:00<00:00, 622it/s]


Processing class: 79


100%|██████████| 4.00/4.00 [00:00<00:00, 1.80kit/s]


Processing class: 80


100%|██████████| 4.00/4.00 [00:00<00:00, 704it/s]


Processing class: 81


100%|██████████| 4.00/4.00 [00:00<00:00, 691it/s]


Processing class: 82


100%|██████████| 4.00/4.00 [00:00<00:00, 2.53kit/s]


Processing class: 83


100%|██████████| 4.00/4.00 [00:00<00:00, 726it/s]


Processing class: 84


100%|██████████| 4.00/4.00 [00:00<00:00, 684it/s]


Processing class: 85


100%|██████████| 4.00/4.00 [00:00<00:00, 1.94kit/s]


Processing class: 86


100%|██████████| 4.00/4.00 [00:00<00:00, 2.75kit/s]


Processing class: 87


100%|██████████| 4.00/4.00 [00:00<00:00, 4.36kit/s]


Processing class: 88


100%|██████████| 4.00/4.00 [00:00<00:00, 684it/s]


Processing class: 89


100%|██████████| 4.00/4.00 [00:00<00:00, 1.12kit/s]


Processing class: 90


100%|██████████| 4.00/4.00 [00:00<00:00, 760it/s]


Processing class: 91


100%|██████████| 4.00/4.00 [00:00<00:00, 1.55kit/s]


Processing class: 92


100%|██████████| 4.00/4.00 [00:00<00:00, 4.21kit/s]


Processing class: 93


100%|██████████| 4.00/4.00 [00:00<00:00, 4.19kit/s]


Processing class: 94


100%|██████████| 4.00/4.00 [00:00<00:00, 631it/s]


Processing class: 95


100%|██████████| 4.00/4.00 [00:00<00:00, 833it/s]


Processing class: 96


100%|██████████| 4.00/4.00 [00:00<00:00, 3.98kit/s]


Processing class: 97


100%|██████████| 4.00/4.00 [00:00<00:00, 2.69kit/s]


Processing class: 98


100%|██████████| 4.00/4.00 [00:00<00:00, 677it/s]


Processing class: 99


100%|██████████| 4.00/4.00 [00:00<00:00, 833it/s]


Processing class: 100


100%|██████████| 4.00/4.00 [00:00<00:00, 1.24kit/s]


Processing class: 101


100%|██████████| 4.00/4.00 [00:00<00:00, 817it/s]


Processing class: 102


100%|██████████| 4.00/4.00 [00:00<00:00, 2.86kit/s]


Processing class: 103


100%|██████████| 4.00/4.00 [00:00<00:00, 4.90kit/s]


Processing class: 104


100%|██████████| 4.00/4.00 [00:00<00:00, 629it/s]


Processing class: 105


100%|██████████| 4.00/4.00 [00:00<00:00, 1.60kit/s]


Processing class: 106


100%|██████████| 4.00/4.00 [00:00<00:00, 4.37kit/s]


Processing class: 107


100%|██████████| 4.00/4.00 [00:00<00:00, 694it/s]


Processing class: 108


100%|██████████| 4.00/4.00 [00:00<00:00, 749it/s]


Processing class: 109


100%|██████████| 4.00/4.00 [00:00<00:00, 3.15kit/s]


Processing class: 110


100%|██████████| 4.00/4.00 [00:00<00:00, 1.32kit/s]


Processing class: 111


100%|██████████| 4.00/4.00 [00:00<00:00, 722it/s]


Processing class: 112


100%|██████████| 4.00/4.00 [00:00<00:00, 3.83kit/s]


Processing class: 113


100%|██████████| 4.00/4.00 [00:00<00:00, 698it/s]


Processing class: 114


100%|██████████| 4.00/4.00 [00:00<00:00, 3.42kit/s]


Processing class: 115


100%|██████████| 4.00/4.00 [00:00<00:00, 1.08kit/s]


Processing class: 116


100%|██████████| 4.00/4.00 [00:00<00:00, 1.97kit/s]


Processing class: 117


100%|██████████| 4.00/4.00 [00:00<00:00, 993it/s]


Processing class: 118


100%|██████████| 4.00/4.00 [00:00<00:00, 725it/s]


Processing class: 119


100%|██████████| 4.00/4.00 [00:00<00:00, 637it/s]


Processing class: 120


100%|██████████| 4.00/4.00 [00:00<00:00, 646it/s]


Processing class: 121


100%|██████████| 4.00/4.00 [00:00<00:00, 1.18kit/s]


Processing class: 122


100%|██████████| 4.00/4.00 [00:00<00:00, 3.33kit/s]


Processing class: 123


100%|██████████| 4.00/4.00 [00:00<00:00, 716it/s]


Processing class: 124


100%|██████████| 4.00/4.00 [00:00<00:00, 753it/s]


Processing class: 125


100%|██████████| 4.00/4.00 [00:00<00:00, 711it/s]


Processing class: 126


100%|██████████| 4.00/4.00 [00:00<00:00, 706it/s]


Processing class: 127


100%|██████████| 4.00/4.00 [00:00<00:00, 2.38kit/s]


Processing class: 128


100%|██████████| 4.00/4.00 [00:00<00:00, 1.30kit/s]


Processing class: 129


100%|██████████| 4.00/4.00 [00:00<00:00, 573it/s]


Processing class: 130


100%|██████████| 4.00/4.00 [00:00<00:00, 745it/s]


Processing class: 131


100%|██████████| 4.00/4.00 [00:00<00:00, 3.64kit/s]


Processing class: 132


100%|██████████| 4.00/4.00 [00:00<00:00, 703it/s]


Processing class: 133


100%|██████████| 4.00/4.00 [00:00<00:00, 1.81kit/s]


Processing class: 134


100%|██████████| 4.00/4.00 [00:00<00:00, 807it/s]


Processing class: 135


100%|██████████| 4.00/4.00 [00:00<00:00, 721it/s]


Processing class: 136


100%|██████████| 4.00/4.00 [00:00<00:00, 4.04kit/s]


Processing class: 137


100%|██████████| 4.00/4.00 [00:00<00:00, 1.70kit/s]


Processing class: 138


100%|██████████| 4.00/4.00 [00:00<00:00, 1.51kit/s]


Processing class: 139


100%|██████████| 4.00/4.00 [00:00<00:00, 2.48kit/s]


Processing class: 140


100%|██████████| 4.00/4.00 [00:00<00:00, 698it/s]


Processing class: 141


100%|██████████| 4.00/4.00 [00:00<00:00, 2.00kit/s]


Processing class: 142


100%|██████████| 4.00/4.00 [00:00<00:00, 552it/s]


Processing class: 143


100%|██████████| 4.00/4.00 [00:00<00:00, 4.67kit/s]


Processing class: 144


100%|██████████| 4.00/4.00 [00:00<00:00, 709it/s]


Processing class: 145


100%|██████████| 4.00/4.00 [00:00<00:00, 707it/s]


Processing class: 146


100%|██████████| 4.00/4.00 [00:00<00:00, 746it/s]


Processing class: 147


100%|██████████| 4.00/4.00 [00:00<00:00, 699it/s]


Processing class: 148


100%|██████████| 4.00/4.00 [00:00<00:00, 853it/s]


Processing class: 149


100%|██████████| 4.00/4.00 [00:00<00:00, 1.12kit/s]


Processing class: 150


100%|██████████| 4.00/4.00 [00:00<00:00, 3.87kit/s]


Processing class: 151


100%|██████████| 4.00/4.00 [00:00<00:00, 674it/s]


Processing class: 152


100%|██████████| 4.00/4.00 [00:00<00:00, 1.04kit/s]


Processing class: 153


100%|██████████| 4.00/4.00 [00:00<00:00, 4.25kit/s]


Processing class: 154


100%|██████████| 4.00/4.00 [00:00<00:00, 4.38kit/s]


Processing class: 155


100%|██████████| 4.00/4.00 [00:00<00:00, 4.87kit/s]


Processing class: 156


100%|██████████| 4.00/4.00 [00:00<00:00, 2.76kit/s]


Processing class: 157


100%|██████████| 4.00/4.00 [00:00<00:00, 731it/s]


Processing class: 158


100%|██████████| 4.00/4.00 [00:00<00:00, 4.13kit/s]


Processing class: 159


100%|██████████| 4.00/4.00 [00:00<00:00, 919it/s]


Processing class: 160


100%|██████████| 4.00/4.00 [00:00<00:00, 1.40kit/s]


Processing class: 161


100%|██████████| 4.00/4.00 [00:00<00:00, 1.53kit/s]


Processing class: 162


100%|██████████| 4.00/4.00 [00:00<00:00, 1.11kit/s]


Processing class: 163


100%|██████████| 4.00/4.00 [00:00<00:00, 1.44kit/s]


Processing class: 164


100%|██████████| 4.00/4.00 [00:00<00:00, 668it/s]


Processing class: 165


100%|██████████| 4.00/4.00 [00:00<00:00, 5.16kit/s]


Processing class: 166


100%|██████████| 4.00/4.00 [00:00<00:00, 4.02kit/s]


Processing class: 167


100%|██████████| 4.00/4.00 [00:00<00:00, 719it/s]


Processing class: 168


100%|██████████| 4.00/4.00 [00:00<00:00, 690it/s]


Processing class: 169


100%|██████████| 4.00/4.00 [00:00<00:00, 880it/s]


Processing class: 170


100%|██████████| 4.00/4.00 [00:00<00:00, 708it/s]


Processing class: 171


100%|██████████| 4.00/4.00 [00:00<00:00, 1.87kit/s]


Processing class: 172


100%|██████████| 4.00/4.00 [00:00<00:00, 693it/s]


Processing class: 173


100%|██████████| 4.00/4.00 [00:00<00:00, 1.80kit/s]


Processing class: 174


100%|██████████| 4.00/4.00 [00:00<00:00, 700it/s]


Processing class: 175


100%|██████████| 4.00/4.00 [00:00<00:00, 716it/s]


Processing class: 176


100%|██████████| 4.00/4.00 [00:00<00:00, 4.33kit/s]


Processing class: 177


100%|██████████| 4.00/4.00 [00:00<00:00, 2.48kit/s]


Processing class: 178


100%|██████████| 4.00/4.00 [00:00<00:00, 4.62kit/s]


Processing class: 179


100%|██████████| 4.00/4.00 [00:00<00:00, 944it/s]


Processing class: 180


100%|██████████| 4.00/4.00 [00:00<00:00, 2.89kit/s]


Processing class: 181


100%|██████████| 4.00/4.00 [00:00<00:00, 659it/s]


Processing class: 182


100%|██████████| 4.00/4.00 [00:00<00:00, 710it/s]


Processing class: 183


100%|██████████| 4.00/4.00 [00:00<00:00, 722it/s]


Processing class: 184


100%|██████████| 4.00/4.00 [00:00<00:00, 2.69kit/s]


Processing class: 185


100%|██████████| 4.00/4.00 [00:00<00:00, 2.39kit/s]


Processing class: 186


100%|██████████| 4.00/4.00 [00:00<00:00, 550it/s]


Processing class: 187


100%|██████████| 4.00/4.00 [00:00<00:00, 666it/s]


Processing class: 188


100%|██████████| 4.00/4.00 [00:00<00:00, 3.10kit/s]


Processing class: 189


100%|██████████| 4.00/4.00 [00:00<00:00, 769it/s]


Processing class: 190


100%|██████████| 4.00/4.00 [00:00<00:00, 809it/s]


Processing class: 191


100%|██████████| 4.00/4.00 [00:00<00:00, 696it/s]


Processing class: 192


100%|██████████| 4.00/4.00 [00:00<00:00, 1.03kit/s]


Processing class: 193


100%|██████████| 4.00/4.00 [00:00<00:00, 637it/s]


Processing class: 194


100%|██████████| 4.00/4.00 [00:00<00:00, 616it/s]


Processing class: 195


100%|██████████| 4.00/4.00 [00:00<00:00, 716it/s]


Processing class: 196


100%|██████████| 4.00/4.00 [00:00<00:00, 3.24kit/s]


Processing class: 197


100%|██████████| 4.00/4.00 [00:00<00:00, 2.63kit/s]


Processing class: 198


100%|██████████| 4.00/4.00 [00:00<00:00, 2.94kit/s]


Processing class: 199


100%|██████████| 4.00/4.00 [00:00<00:00, 3.62kit/s]


Processing class: 200


100%|██████████| 4.00/4.00 [00:00<00:00, 706it/s]


Processing class: 201


100%|██████████| 4.00/4.00 [00:00<00:00, 3.08kit/s]


Processing class: 202


100%|██████████| 4.00/4.00 [00:00<00:00, 738it/s]


Processing class: 203


100%|██████████| 4.00/4.00 [00:00<00:00, 629it/s]


Processing class: 204


100%|██████████| 4.00/4.00 [00:00<00:00, 3.40kit/s]


Processing class: 205


100%|██████████| 4.00/4.00 [00:00<00:00, 717it/s]


Processing class: 206


100%|██████████| 4.00/4.00 [00:00<00:00, 658it/s]


Processing class: 207


100%|██████████| 4.00/4.00 [00:00<00:00, 1.73kit/s]


Processing class: 208


100%|██████████| 4.00/4.00 [00:00<00:00, 2.38kit/s]


Processing class: 209


100%|██████████| 4.00/4.00 [00:00<00:00, 698it/s]


Processing class: 210


100%|██████████| 4.00/4.00 [00:00<00:00, 1.25kit/s]


Processing class: 211


100%|██████████| 4.00/4.00 [00:00<00:00, 502it/s]


Processing class: 212


100%|██████████| 4.00/4.00 [00:00<00:00, 709it/s]


Processing class: 213


100%|██████████| 4.00/4.00 [00:00<00:00, 646it/s]


Processing class: 214


100%|██████████| 4.00/4.00 [00:00<00:00, 2.72kit/s]


Processing class: 215


100%|██████████| 4.00/4.00 [00:00<00:00, 2.43kit/s]


Processing class: 216


100%|██████████| 4.00/4.00 [00:00<00:00, 4.13kit/s]


Processing class: 217


100%|██████████| 4.00/4.00 [00:00<00:00, 3.04kit/s]


Processing class: 218


100%|██████████| 4.00/4.00 [00:00<00:00, 1.68kit/s]

Dataset({
    features: ['text', 'l1', 'l2', 'l3', 'label', 'target', 'token_length', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 876
})





In [32]:
# Convert to a pandas DataFrame first if not already one
df_subset = pd.DataFrame(test_dataset_subset)
df_subset.to_csv('/home/bhairavi/om/om5/dbp/test_dataset_subset.csv', index=False)


In [31]:
# Step 1: Import Necessary Libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
import numpy as np

# Step 2: Vectorize the Text Data
X = test_df['text']
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)
X_dense = X_vectorized.toarray()
y = test_df['target']

# Step 3: Define a function to select representative samples for each class
def select_representative_samples(X, y, num_samples_per_class):
    selected_indices = []
    for class_label in y.unique():
        print(class_label)
        class_indices = y[y == class_label].index
        X_class = X[class_indices]
        
        # Cluster the data
        num_clusters = num_samples_per_class
        kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(X_class)
        
        # Find the closest samples to the cluster centroids
        closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X_class)
        
        # Collect the selected indices
        selected_indices.extend(class_indices[closest])
    
    return selected_indices

# Step 4: Select samples for each class
num_samples_per_class = 1000 // len(y.unique())  # Adjust the number of samples per class as needed
selected_indices = select_representative_samples(X_dense, y, num_samples_per_class)

# Step 5: Create the subset DataFrame
df_subset = test_df.iloc[selected_indices]

# Display the subset DataFrame
print(df_subset)

160
1
67
64
120
100
98
140
174
200
218
85
83
34
31
138
90
7
72
191
108
105
163
53
11
17
127
195
88
205
52
59
167
47
166
28
81
152
126
180
169
179
24
21
43
35
145
123
148
146
77
192
186
12
54
13
128
151
84
106
125
194
18
135
149
175
147
189
212
181
134
129
207
185
23
19
190
211
102
214
75
89
56
37
155
110
178
193
99
187
172
42
101
164
95
61
170
113
80
168
3
144
0
198
26
133
46
32
111
14
183
141
40
142
60
173
130
121
124
117
36
45
188
79
153
38
8
10
44
94
137
132
209
157
29
116
161
118
210
217
66
48
109
182
4
20
78
16
176
150
104
27
159
119
50
162
115
203
51
184
165
91
69
86
15
30
71
82
107
206
139
22
202
2
213
49
6
87
58
70
131
136
143
215
68
199
73
114
96
171
9
5
76
39
112
154
55
197
196
122
156
33
63
41
216
62
25
57
208
92
97
201
177
74
158
65
103
93
204
                                                    text       l1          l2  \
48492  (In this name, the family name is Law, not Bon...    Agent  Politician   
7209   Ilir Rexhep Meta (born March 24, 1969) is an A...    Agent  Polit

In [32]:
df_subset.shape

(876, 7)

In [33]:
df_subset['label'].nunique()

219

In [35]:
for i in df_subset['target'].value_counts():
    print(i)

4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4


In [36]:
saveDIR

'~/om5/dbp/debertaV3_dbp'

In [38]:

# %%

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Use 'epoch' to evaluate at the end of each epoch
    save_strategy="epoch",  # Also use 'epoch' to save at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,  # Load the best model at the end of training based on metric
    metric_for_best_model='f1',  # Define the metric for evaluating the best model
    logging_dir='./logs',
    logging_steps=10,
)


 

trainer = Trainer(
    model=model,
    args=training_args ,  # Here you will need to make sure that the Trainer is set up correctly
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

 


trainer.train()


# %%


# %%

save_directory = saveDIR
 

# Save the model
model.save_pretrained(save_directory)

# Save the tokenizer (optional, but recommended)
tokenizer.save_pretrained(save_directory)

 


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f16a2383d30>>
Traceback (most recent call last):
  File "/home/bhairavi/om/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.1375,0.17291,0.958429,0.958623,0.958948


('/home/bhairavi/om/om5/dbp/debertaV3_dbp/tokenizer_config.json',
 '/home/bhairavi/om/om5/dbp/debertaV3_dbp/special_tokens_map.json',
 '/home/bhairavi/om/om5/dbp/debertaV3_dbp/spm.model',
 '/home/bhairavi/om/om5/dbp/debertaV3_dbp/added_tokens.json',
 '/home/bhairavi/om/om5/dbp/debertaV3_dbp/tokenizer.json')

In [39]:


# %%


# %% [markdown]
# eval dataset performance so that keywords_classes can be fixed

# %%
results = trainer.evaluate()

# Predict using the trained model to get labels and predictions
predictions, labels, _ = trainer.predict(eval_dataset)
predictions = np.argmax(predictions, axis=1)


# %%
from sklearn.metrics import classification_report
# Generate the classification report
report = classification_report(
    labels,
    predictions,
    target_names=df['label'].unique() , # Adjust this line as per your dataset
    digits=4
)

print(report)


# %% [markdown]
# skyline

# %%
from colorama import Fore, Style

# %%
print(Fore.RED +"TEST DATA IS OUR SKYLINE RESULT")
 
results = trainer.evaluate()

# Predict using the trained model to get labels and predictions
predictions, labels, _ = trainer.predict(test_dataset)
predictions = np.argmax(predictions, axis=1)


# %%
from sklearn.metrics import classification_report
# Generate the classification report
report = classification_report(
    labels,
    predictions,
    target_names=df['label'].unique() , # Adjust this line as per your dataset
    digits=4
)

print(report)


# %%
 










                                   precision    recall  f1-score   support

                          Senator     0.9824    0.9721    0.9772       287
                            Album     0.9381    0.8480    0.8908       125
                         Mountain     0.9965    0.9965    0.9965       285
                  AcademicJournal     1.0000    1.0000    1.0000       286
                 HistoricBuilding     0.9860    0.9860    0.9860       285
                          Reptile     0.8958    1.0000    0.9451        43
                     MilitaryUnit     0.8413    0.8833    0.8618        60
                            Judge     0.9722    0.9790    0.9756       286
                      ChessPlayer     0.9964    0.9684    0.9822       285
                       TradeUnion     1.0000    1.0000    1.0000        22
                          Musical     0.9134    0.8992    0.9062       129
                           Insect     0.9873    0.9873    0.9873       236
                        

                                   precision    recall  f1-score   support

                          Senator     0.9688    0.9608    0.9648       485
                            Album     0.9275    0.8483    0.8861       211
                         Mountain     0.9979    0.9979    0.9979       481
                  AcademicJournal     0.9979    0.9959    0.9969       483
                 HistoricBuilding     0.9938    1.0000    0.9969       480
                          Reptile     0.9600    1.0000    0.9796        72
                     MilitaryUnit     0.7838    0.8529    0.8169       102
                            Judge     0.9726    0.9565    0.9645       483
                      ChessPlayer     0.9979    0.9667    0.9820       481
                       TradeUnion     1.0000    0.9730    0.9863        37
                          Musical     0.9327    0.8899    0.9108       218
                           Insect     0.9565    0.9925    0.9742       399
                        