# TRAINING PART

In [1]:
!pip install transformers



In [2]:
!pip install datasets



In [3]:
!pip install evaluate



In [4]:
import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments , Trainer
import evaluate
import numpy as np

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
# Names of the text and label columns, and the location of the training CSV.
text_column_name , label_column_name = 'Content' , 'Label'
data_path = '/content/C_TRAIN.csv'

In [6]:
# Number of target classes, fraction of rows held out for evaluation, and the
# pretrained checkpoint that gets fine-tuned.
num_labels = 9
test_size = 0.3
model_name = "bert-base-uncased"

In [7]:
# Read the raw training table from the configured CSV path.
df = pd.read_csv(filepath_or_buffer = data_path)

In [8]:
# Remove, in place, every row that has a missing value in any column.
df.dropna(axis = 0 , how = "any" , inplace = True)
# Per-column count of remaining nulls; as the last expression it is the cell output.
df.isna().sum()

Unnamed: 0    0
Label         0
Content       0
dtype: int64

In [9]:
# Encoder that maps each distinct label value to an integer class id.
label_encoder = LabelEncoder()

# Read the label column through the notebook-level setting rather than a
# hard-coded name, so changing `label_column_name` is enough to retarget it.
labels = df[label_column_name]

# Learn the label -> id mapping and apply it in a single step.
encoded_labels = label_encoder.fit_transform(labels)

# Overwrite the label column with the integer ids; the original values stay
# recoverable through label_encoder.classes_ / label_encoder.inverse_transform.
df[label_column_name] = encoded_labels

In [10]:
# Print the first five rows to eyeball the encoded labels.
print(df.head(5))

   Unnamed: 0  Label                                            Content
0           0      0  tissue present invention obtain suitable sourc...
1           1      6  according aspect INvENTioN centrifuge include ...
2           2      2  staurosporine derivative anti cancer activity ...
3           3      1  various implementation different blade razor d...
4           4      0  purpose promote understanding principle invent...


In [11]:
# Hold out `test_size` of the rows for evaluation. The fixed random_state makes
# the shuffle, and therefore every downstream metric, reproducible across runs.
df_train , df_test = train_test_split(df , test_size = test_size , random_state = 42)

In [12]:
# Wrap both pandas splits as Hugging Face datasets.
train_dataset , test_dataset = (
    Dataset.from_pandas(split_df) for split_df in (df_train , df_test)
)

In [13]:
# Display the training dataset summary (feature names and row count).
train_dataset

Dataset({
    features: ['Unnamed: 0', 'Label', 'Content', '__index_level_0__'],
    num_rows: 39532
})

In [14]:
# Display the held-out dataset summary (feature names and row count).
test_dataset

Dataset({
    features: ['Unnamed: 0', 'Label', 'Content', '__index_level_0__'],
    num_rows: 16943
})

In [15]:
# Tokenizer matching the checkpoint that will be fine-tuned.
tok = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenize the text column of a batch of examples.

    Args:
        examples: A batch from the dataset, indexable by column name.

    Returns:
        The tokenizer output for the batch with truncation enabled. No padding
        is applied here.
    """
    # Use the configured column name instead of a hard-coded "Content" so the
    # notebook settings stay the single source of truth.
    return tok(examples[text_column_name], truncation = True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [16]:
# Tokenize the training split, feeding the tokenizer whole batches at a time.
tokenized_train = train_dataset.map(function = preprocess_function , batched = True)

Map:   0%|          | 0/39532 [00:00<?, ? examples/s]

In [17]:
# Tokenize the held-out split, feeding the tokenizer whole batches at a time.
tokenized_test = test_dataset.map(function = preprocess_function , batched = True)

Map:   0%|          | 0/16943 [00:00<?, ? examples/s]

In [18]:
# Load the pretrained checkpoint with a classification head sized for
# `num_labels` classes.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels = num_labels,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Collator that pads the examples of each batch using the training tokenizer.
data_collator = DataCollatorWithPadding(tok)

In [20]:
# Accuracy metric used to score every evaluation pass.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Turn a (logits, labels) evaluation pair into an accuracy result.

    Args:
        eval_pred: Pair unpackable as ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions.
    """
    scores , gold = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_ids = np.argmax(scores , axis = -1)
    return metric.compute(references = gold , predictions = predicted_ids)

In [21]:
# Rename the target column from 'Label' to 'label' on both tokenized splits.
tokenized_train , tokenized_test = (
    split.rename_column('Label', 'label')
    for split in (tokenized_train , tokenized_test)
)

In [22]:
# Rename the raw text column from 'Content' to 'text' on both tokenized splits.
tokenized_train , tokenized_test = (
    split.rename_column('Content', 'text')
    for split in (tokenized_train , tokenized_test)
)

In [23]:
# Upgrade transformers together with its torch extra.
# NOTE(review): transformers was already imported in an earlier cell, so the
# upgraded version presumably only takes effect after a runtime restart — confirm.
!pip install transformers[torch] -U



In [24]:
# Fine-tuning hyperparameters plus the evaluation / logging / checkpoint cadence.
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir = "/kaggle/working/",
    num_train_epochs = 3,
    learning_rate = 1e-5,
    weight_decay = 0.01,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    evaluation_strategy = "steps",  # evaluate on a step schedule ...
    eval_steps = 500,               # ... every 500 steps
    logging_strategy = "steps",     # log on a step schedule ...
    logging_steps = 500,            # ... every 500 steps
    save_steps = 1500,              # write a checkpoint every 1500 steps
    report_to = "none",
)

# Bundle the model, both tokenized splits, the padding collator and the metric
# callback into a single Trainer.
trainer = Trainer(
    args = training_args,
    model = model,
    tokenizer = tok,
    data_collator = data_collator,
    train_dataset = tokenized_train,
    eval_dataset = tokenized_test,
    compute_metrics = compute_metrics,
)



In [None]:
# Run fine-tuning with the settings held in training_args.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
500,1.7137,1.436821,0.5139
1000,1.3855,1.342583,0.529068
1500,1.3017,1.226523,0.588208
2000,1.2364,1.197754,0.600248


# INFERENCE PART

In [None]:
# Read the preprocessed test set from the Kaggle input directory.
test_df = pd.read_csv(filepath_or_buffer = "/kaggle/input/cleaned2/new_cleaned/Test_preprocessed.csv")

In [None]:
# Rename "Content" to "text" in place, matching the column name used after
# tokenization in the training part.
test_df.rename({"Content": "text"}, axis="columns", inplace=True)

In [None]:
# Directory holding the fine-tuned checkpoint.
model_name = "/kaggle/input/distilll2"  # Replace with your model name
# Load the classifier and its matching tokenizer from that checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def inference(test_df):
    """Run the loaded classifier over the ``text`` column of ``test_df``.

    Args:
        test_df: DataFrame with a ``text`` column holding the inputs.

    Returns:
        The object returned by ``Trainer.predict`` for the tokenized texts.
    """
    test_texts = test_df["text"].tolist()
    # Tokenize with the tokenizer loaded alongside the inference checkpoint,
    # not the training-time `tok`: that one belongs to a different checkpoint
    # and does not exist at all in an inference-only session.
    test_inputs = tokenizer(test_texts, truncation=True, padding=True)
    test_dataset = Dataset.from_dict(test_inputs)

    trainer = Trainer(model=model)
    predictions = trainer.predict(test_dataset)

    return predictions

In [None]:
# Display the test frame to check the renamed column before running inference.
test_df

In [None]:
# Score the preprocessed test frame with the loaded model.
test_predictions = inference(test_df = test_df)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Take, for each row of the prediction scores, the index of the largest value.
predicted_labels = np.argmax(test_predictions.predictions, axis=1)

In [None]:
# Pair every input text with its predicted class id.
submission_df = pd.DataFrame(
    data = {"text": test_df["text"], "predicted_label": predicted_labels}
)
# Write the table to CSV without the index column.
submission_df.to_csv(path_or_buf = "submissionn.csv", index = False)

In [None]:
# Read the saved submission back from disk.
submission_df = pd.read_csv(filepath_or_buffer = "submissionn.csv")

# Shift the predicted ids up by one, from the 0-8 range to 1-9.
# NOTE(review): assumes every original label equals its encoded id + 1 —
# confirm against the classes learned by the label encoder.
submission_df["predicted_label"] = submission_df["predicted_label"] + 1

# Write the shifted predictions to a separate file.
submission_df.to_csv(path_or_buf = "submission_inverted_Label.csv", index = False)
