In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

In [None]:
# Location of the combined, tokenized research dataset on the Kaggle input mount.
DATASET_CSV = "/kaggle/input/final-dataset/combined_tokenized_research_dataset (1).csv"

# Read the CSV into the working DataFrame and display it as the cell output.
df = pd.read_csv(DATASET_CSV)
df

In [None]:
# Display the loaded DataFrame as the cell output.
df

In [None]:
# List the column names of the loaded DataFrame.
df.columns

In [None]:
# Remove the "Unnamed: 0" column from the working frame.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell re-runnable: a second execution no longer raises KeyError because the
# column is already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit an encoder on the raw string labels, then store the integer ids
# in a new "labels_encoded2" column.
lbe = LabelEncoder()
lbe.fit(df["labels"])
df["labels_encoded2"] = lbe.transform(df["labels"])

In [None]:
# Preview the first rows, now including the "labels_encoded2" column.
df.head()

In [None]:
# Show the distinct encoded label ids present in the data.
df["labels_encoded2"].unique()

In [None]:
# !pip install wandb
import os

import wandb

# Never hard-code the Weights & Biases API key in the notebook: it ends up in
# version control and in every shared copy. The key is read from the
# WANDB_API_KEY environment variable instead (on Kaggle, populate it from the
# notebook's secrets before running this cell).
# NOTE(review): the key that was previously committed here should be treated as
# compromised and rotated in the W&B account settings.
# When the variable is unset, key=None is passed and wandb falls back to its own
# credential lookup / interactive prompt.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:
# Count the distinct encoded label ids.
df["labels_encoded2"].nunique()

In [None]:
# Per-class sample counts, computed once and then filtered down to the
# classes that have at most 31 samples.
label_counts = df["labels_encoded2"].value_counts()
label_counts[label_counts <= 31]

In [None]:
# Encoded ids whose string label is replaced by the catch-all "UNK" label.
# NOTE(review): these ids refer to the encoding currently stored in
# "labels_encoded2"; once that column is re-encoded, re-running this cell would
# relabel different classes. Presumably the list was copied from the rare-class
# counts shown earlier - confirm.
unknowns = [5, 36, 40, 9, 37, 27, 12, 31, 30, 39, 34, 32, 35, 19, 42]

# Boolean row mask first, then a single .loc assignment on the "labels" column.
is_unknown = df["labels_encoded2"].isin(unknowns)
df.loc[is_unknown, "labels"] = "UNK"

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit a fresh encoder on the current "labels" column and overwrite
# "labels_encoded2" with the resulting integer ids.
lbe = LabelEncoder()
lbe.fit(df["labels"])
df["labels_encoded2"] = lbe.transform(df["labels"])

In [None]:
# Count the distinct encoded label ids after re-encoding.
df["labels_encoded2"].nunique()

In [None]:
# Show the distinct string labels (pandas returns them in order of appearance,
# which is not the sorted order LabelEncoder uses for its ids).
df["labels"].unique()

In [None]:
# Display the encoded label column.
df["labels_encoded2"]

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, classification_report, precision_score, recall_score
import numpy as np
import pandas as pd

class RobertaModel:
    """Helper bundling the pieces needed to fine-tune ``roberta-base`` for
    sequence classification: tokenizer, dataset preparation, ``Trainer`` setup,
    metric computation and reporting.
    """

    # Label names exactly as they were hard-coded in the original report call
    # (order of appearance in the data). They are sorted before use because the
    # integer ids come from a LabelEncoder, which numbers classes in sorted order.
    DEFAULT_TARGET_NAMES = (
        'B-PARAMETER PROPN', 'B-COMPONENT NOUN', 'O NOUN', 'O AUX', 'O X',
        'O PUNCT', 'O VERB', 'O ADV', 'O ADJ', 'O ADP', 'O NUM',
        'B-PARAMETER NOUN', 'B-ACTION VERB', 'O CCONJ', 'O PRON',
        'B-STATE NOUN', 'B-FUNCTION VERB', 'B-ISSUE VERB', 'O PART',
        'B-COMPONENT ADJ', 'I-ACTION VERB', 'B-FUNCTION NOUN',
        'B-ACTION NOUN', 'O INTJ', 'B-ISSUE NOUN', 'I-STATE VERB',
        'O SCONJ', 'O PROPN', 'I-ACTION ADP', 'B-STATE PROPN',
        'B-ACTION PROPN', 'B-STATE ADJ', 'UNK', 'I-STATE NOUN',
        'B-ISSUE PROPN', 'B-PARAMETER ADJ', 'B-FUNCTION ADJ',
        'B-ISSUE ADJ', 'I-COMPONENT NOUN', 'B-STATE ADV',
        'B-COMPONENT VERB', 'I-FUNCTION VERB', 'B-STATE VERB',
        'I-ACTION NOUN', 'I-PARAMETER NOUN',
    )

    def __init__(self):
        """Load the pretrained ``roberta-base`` tokenizer."""
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    def dataset_setting(self, dataset_text_att, dataset_label_att):
        """Build a ``Dataset`` with ``text``/``labels`` columns and split it.

        Args:
            dataset_text_att: object exposing ``.tolist()`` that yields the texts.
            dataset_label_att: object exposing ``.tolist()`` that yields the labels.

        Returns:
            The result of ``train_test_split`` with a 20% test share and seed 1.
        """
        text = dataset_text_att.tolist()
        labels = dataset_label_att.tolist()
        data = {"text": text, "labels": labels}
        dataset = Dataset.from_dict(data)
        dataset = dataset.train_test_split(test_size=0.20, seed=1)
        return dataset

    def tokenize_function(self, examples):
        """Tokenize ``examples['text']`` with truncation and max-length padding."""
        return self.tokenizer(examples['text'], padding='max_length', truncation=True)

    def mapping(self, dataset):
        """Apply :meth:`tokenize_function` to ``dataset`` in batched mode."""
        tokenized_dataset = dataset.map(self.tokenize_function, batched=True)
        return tokenized_dataset

    def setting_format(self, columns, tokenized_dataset):
        """Switch ``tokenized_dataset`` to torch format for ``columns``.

        The dataset is modified in place via ``set_format`` and also returned.
        """
        tokenized_dataset.set_format(type='torch', columns=columns)
        return tokenized_dataset

    def init_model(self, num_labels=2):
        """Load ``roberta-base`` with a classification head of ``num_labels`` outputs."""
        model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)
        return model

    def training_args(self, output_dir, evaluation_strategy, learning_rate,
                      per_device_train_batch_size, per_device_eval_batch_size,
                      num_train_epochs, weight_decay):
        """Create the ``TrainingArguments`` used by :meth:`trainer_setup`.

        All parameters are forwarded unchanged under the same keyword names;
        checkpoints are saved with ``save_strategy="epoch"``.
        """
        # NOTE(review): save_steps=0 is kept from the original; presumably it has
        # no effect while save_strategy is "epoch" - confirm.
        # NOTE(review): newer transformers releases may expect `eval_strategy`
        # instead of `evaluation_strategy` - confirm against the installed version.
        training_args = TrainingArguments(
            output_dir=output_dir,
            evaluation_strategy=evaluation_strategy,
            learning_rate=learning_rate,
            per_device_train_batch_size=per_device_train_batch_size,
            per_device_eval_batch_size=per_device_eval_batch_size,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            save_steps=0,
            save_strategy="epoch"
        )
        return training_args

    def compute_metrics(self, eval_pred):
        """Compute accuracy and weighted precision/recall/F1.

        Args:
            eval_pred: pair ``(logits, labels)``; predictions are the argmax of
                ``logits`` along axis 1.

        Returns:
            Dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        """
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=1)
        accuracy = accuracy_score(labels, predictions)
        # zero_division=0 yields the same scores as the default but without a
        # warning for every class that is never predicted.
        precision = precision_score(labels, predictions, average='weighted', zero_division=0)
        recall = recall_score(labels, predictions, average='weighted', zero_division=0)
        f1 = f1_score(labels, predictions, average='weighted', zero_division=0)
        return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

    def trainer_setup(self, model, training_args, train_dataset, eval_dataset):
        """Create a ``Trainer`` wired to this tokenizer and :meth:`compute_metrics`."""
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=self.tokenizer,
            compute_metrics=self.compute_metrics
        )
        return trainer

    def train(self, trainer):
        """Run ``trainer.train()`` and print start/finish messages."""
        print("Training has been initiated")
        trainer.train()
        print("Training Completed")

    def result(self, trainer):
        """Print the dictionary returned by ``trainer.evaluate()``."""
        print(trainer.evaluate())

    def classification_report(self, trainer, eval_dataset, target_names=None):
        """Predict on ``eval_dataset`` and return a per-class text report.

        Args:
            trainer: object whose ``predict(eval_dataset)`` result exposes
                ``predictions`` (logits) and ``label_ids``.
            eval_dataset: dataset handed to ``trainer.predict``.
            target_names: optional class names where position ``i`` names
                class id ``i`` (e.g. ``list(label_encoder.classes_)``). When
                omitted, ``DEFAULT_TARGET_NAMES`` is used in sorted order so
                that it lines up with LabelEncoder-style ids.

        Returns:
            The report produced by scikit-learn's ``classification_report``.
        """
        # Make predictions
        predictions = trainer.predict(eval_dataset)
        logits, labels = predictions.predictions, predictions.label_ids

        # Get the predicted classes
        preds = np.argmax(logits, axis=1)

        if target_names is None:
            # The hard-coded names were in order of appearance; sorting restores
            # the id -> name correspondence of the encoder.
            target_names = sorted(self.DEFAULT_TARGET_NAMES)
        else:
            target_names = list(target_names)

        # Inside a method body the bare name below resolves to the module-level
        # scikit-learn function, not to this method.
        # Passing `labels` explicitly keeps the report from raising when a class
        # is missing from both the true labels and the predictions.
        report = classification_report(
            labels,
            preds,
            labels=list(range(len(target_names))),
            target_names=target_names,
            zero_division=0,
        )
        return report

In [None]:
# Dataset setup
roberta = RobertaModel()

# Split dataset into train and test
dataset = roberta.dataset_setting(df['words'], df['labels_encoded2'])

from imblearn.over_sampling import RandomOverSampler
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np

# Step 1: Extract Training Data
# Oversampling only needs the raw "text" and "labels" columns, so the
# un-tokenized split is used directly. The previous version tokenized (and
# padded) the whole dataset here only to discard the result; tokenization
# happens once, after oversampling, in the next cell.
train_df = dataset['train'].to_pandas()

# Step 2: Apply Oversampling
# NOTE(review): sampling_strategy="minority" appears to resample only the single
# rarest class - confirm this is intended rather than e.g. "not majority".
ros = RandomOverSampler(random_state=42, sampling_strategy = "minority")
x_resampled, y_resampled = ros.fit_resample(
    np.array(train_df['text']).reshape(-1, 1),  # sampler expects a 2-D feature array
    train_df['labels']
)

# Convert to pandas DataFrame
oversampled_train_df = pd.DataFrame({
    "text": x_resampled.ravel(),  # back to one text per row
    "labels": y_resampled
})

In [None]:




# Step 3: Convert Back to Dataset
train_dataset = Dataset.from_pandas(oversampled_train_df)  # oversampled_train_df -> train_df

# Step 4: Replace Training Split in DatasetDict
# Only the training split is oversampled; the test split is the untouched
# hold-out produced by the original train/test split.
tokenized_dataset = DatasetDict({
    "train": train_dataset,
    "test": dataset['test']
})

# Step 5: Tokenize the Dataset
tokenized_dataset = tokenized_dataset.map(roberta.tokenize_function, batched=True)

# Explicitly select 'train' and 'test' splits for further processing

# Format tokenized datasets
formatted_train = roberta.setting_format(['input_ids', 'attention_mask', 'labels'], tokenized_dataset['train'])
formatted_test = roberta.setting_format(['input_ids', 'attention_mask', 'labels'], tokenized_dataset['test'])

# Initialize model
# Size the classification head from the data instead of a hard-coded 45 so it
# always matches the number of encoded classes.
num_labels = int(df["labels_encoded2"].nunique())
model = roberta.init_model(num_labels=num_labels)
# NOTE(review): assumes a CUDA device is available - confirm for the target runtime.
model.to("cuda")

# Define training arguments
training_args = roberta.training_args(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
)

# Setup trainer
trainer = roberta.trainer_setup(
    model=model,
    training_args=training_args,
    train_dataset=formatted_train,
    eval_dataset=formatted_test
)

# Train the model
roberta.train(trainer)

# Evaluate and generate classification report
roberta.result(trainer)
# Report on the held-out test split: the previous version reported on the
# oversampled training data, which does not measure generalization.
report = roberta.classification_report(trainer, formatted_test)
print(report)