In [1]:
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

# Load the JSON data
def load_json_data(json_path):
    """Read and parse the JSON file at ``json_path``.

    Args:
        json_path: Path to a JSON file.

    Returns:
        The decoded JSON content, exactly as ``json.load`` produces it.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which is not UTF-8 everywhere and would corrupt (or fail on)
    # non-ASCII text.
    with open(json_path, 'r', encoding='utf-8') as file:
        return json.load(file)


# Convert JSON data to pandas DataFrame
def json_to_dataframe(data):
    """Build a DataFrame with string ``text`` and numeric ``label`` columns.

    Args:
        data: Anything ``pd.DataFrame`` accepts, e.g. a list of
            ``{"text": ..., "label": ...}`` records.

    Returns:
        A DataFrame whose ``text`` column is cast to ``str`` and whose
        ``label`` column is numeric: non-numeric labels are replaced by their
        categorical codes, numeric labels are left untouched.

    Raises:
        KeyError: If the ``text`` or ``label`` column is missing.
    """
    df = pd.DataFrame(data)

    missing = [column for column in ('text', 'label') if column not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    df['text'] = df['text'].astype(str)

    # Encode every non-numeric label column, not only ``object`` dtype: pandas
    # may also hold text labels as ``string``/``str`` or ``category`` dtype,
    # and those would otherwise reach the trainer as raw strings.
    if not pd.api.types.is_numeric_dtype(df['label']):
        df['label'] = df['label'].astype('category').cat.codes
    return df


# Save the DataFrame to Excel (optional)
def save_to_excel(df, excel_path):
    """Write ``df`` to an Excel file at ``excel_path``, leaving out the index."""
    export_options = {"index": False}
    df.to_excel(excel_path, **export_options)


# Load and preprocess the dataset
# `data` keeps the parsed JSON exactly as read; `df` is the cleaned frame
# (string text, encoded labels) that the rest of this cell works from.
json_path = 'dataset_1.json'  # Path to your JSON dataset
data = load_json_data(json_path)
df = json_to_dataframe(data)

# Convert the DataFrame to Hugging Face Dataset
# (the `dataset` object is what gets split into train/eval further down)
dataset = Dataset.from_pandas(df)

# Load the tokenizer
# Uses the same "bert-base-uncased" checkpoint name as the model loaded below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenization function for the dataset
def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with max-length padding and truncation."""
    tokenizer_options = {"padding": "max_length", "truncation": True}
    return tokenizer(examples["text"], **tokenizer_options)

# Split dataset into training and evaluation sets (80% train, 20% test).
# The split is seeded so that the same rows land in train/eval on every run;
# without it the reported eval loss changes from one kernel restart to the next.
SPLIT_SEED = 42
split_datasets = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = split_datasets["train"]
eval_dataset = split_datasets["test"]

# Tokenize both train and eval datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

print("Training dataset features:", tokenized_train_dataset.features)
print("Evaluation dataset features:", tokenized_eval_dataset.features)

# Rebuild the class-index -> label-name mapping from the raw records.
# `df` only holds the encoded labels, so the names must come from `data`.
raw_labels = pd.DataFrame(data)['label']
if pd.api.types.is_numeric_dtype(raw_labels):
    # Numeric labels are used as class indices directly: map each to itself.
    label_mapping = {
        str(int(value)): str(value)
        for value in sorted(raw_labels.dropna().unique())
    }
else:
    # Categorical codes follow the order of the categories: code i <-> i-th category.
    label_mapping = {
        str(code): str(name)
        for code, name in enumerate(raw_labels.astype('category').cat.categories)
    }

# Size the classification head from the largest class index rather than the
# number of distinct values, so numeric labels that skip values (or start at 1)
# still fit. For contiguous codes 0..k-1 this is the same number, k.
num_labels = int(df['label'].max()) + 1

# Load the pre-trained BERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels
)

# Define training arguments
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; switch the keyword if TrainingArguments rejects it.
training_args = TrainingArguments(
    output_dir="./results",            # Directory to save model checkpoints
    evaluation_strategy="epoch",       # Evaluate every epoch
    learning_rate=2e-5,                # Learning rate
    per_device_train_batch_size=4,     # Batch size per device during training
    per_device_eval_batch_size=4,      # Batch size per device during evaluation
    num_train_epochs=3,                # Number of training epochs
    weight_decay=0.01,                 # Weight decay for regularization
    logging_dir='./logs',              # Directory to save logs
    logging_steps=10,                  # Log every 10 steps
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset
)

# Train the model
trainer.train()

# Save the trained model and tokenizer
model.save_pretrained("./trained_1model")
tokenizer.save_pretrained("./trained_1model")

# Save the label mapping as well: the inference cells read
# './label_mapping.json' to turn a predicted class index back into a label
# name, and nothing else in this notebook writes that file.
with open('./label_mapping.json', 'w', encoding='utf-8') as mapping_file:
    json.dump(label_mapping, mapping_file, indent=2)

print("Model training complete and saved successfully.")







Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Training dataset features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int8', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}
Evaluation dataset features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int8', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.1987485885620117, 'eval_runtime': 13.0718, 'eval_samples_per_second': 0.306, 'eval_steps_per_second': 0.077, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.296785593032837, 'eval_runtime': 12.8609, 'eval_samples_per_second': 0.311, 'eval_steps_per_second': 0.078, 'epoch': 2.0}
{'loss': 2.8628, 'grad_norm': 10.108929634094238, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.5}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.3200063705444336, 'eval_runtime': 13.1256, 'eval_samples_per_second': 0.305, 'eval_steps_per_second': 0.076, 'epoch': 3.0}
{'train_runtime': 384.5533, 'train_samples_per_second': 0.125, 'train_steps_per_second': 0.031, 'train_loss': 2.866519053777059, 'epoch': 3.0}
Model training complete and saved successfully.


In [2]:
# Save the trained model and tokenizer
# This repeats the save performed at the end of the training cell: it writes
# the in-memory `model` and `tokenizer` to the same "./trained_1model"
# directory. No training happens in this cell, despite the message below.
model.save_pretrained("./trained_1model")
tokenizer.save_pretrained("./trained_1model")

print("Model training complete and saved successfully.")


Model training complete and saved successfully.


In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Restore the fine-tuned classifier and its tokenizer from the save directory
model_path = "./trained_1model"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


In [4]:
# Function to tokenize a single input sentence
def tokenize_input(text):
    """Tokenize ``text`` into PyTorch tensors that can be unpacked into the model call.

    Args:
        text: The input to classify (a single string, or a list of strings).

    Returns:
        The tokenizer's encoding, produced with ``return_tensors="pt"``.
    """
    # padding=True pads only up to the longest sequence in this call (so not
    # at all for one sentence). padding="max_length" padded every input to the
    # model's maximum length, making each prediction process mostly padding.
    return tokenizer(text, padding=True, truncation=True, return_tensors="pt")

# Example text to test
test_text = "Hello! How are you?"

# Tokenize the test text
# `inputs` stays a notebook-level variable: the next cell unpacks it into the
# model call.
inputs = tokenize_input(test_text)


In [5]:
# Forward pass with autograd switched off: this is inference, not training
with torch.no_grad():
    outputs = model(**inputs)

# Raw scores returned by the model
logits = outputs.logits

# Index of the largest logit along dim 1, as a plain Python number
predicted_label = logits.argmax(dim=1).item()

print(f"Predicted label: {predicted_label}")


Predicted label: 3


In [9]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 1. Load the trained model and tokenizer
# The training cell saves its artifacts to "./trained_1model". Loading
# "./trained_model" only worked when an earlier run had left that directory
# behind, and it would silently use that older checkpoint.
model_path = "./trained_1model"  # Path to the directory where the model is saved
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# 2. Load the label mapping from the JSON file
# Keys are class indices as strings, values are the original label names.
label_mapping_path = './label_mapping.json'  # Path to the label mapping file
with open(label_mapping_path, 'r', encoding='utf-8') as f:
    label_mapping = json.load(f)

# 3. Define a function to tokenize the input text
def tokenize_input(text):
    """Tokenize ``text`` into PyTorch tensors that can be unpacked into the model call.

    Args:
        text: The input to classify (a single string, or a list of strings).

    Returns:
        The tokenizer's encoding, produced with ``return_tensors="pt"``.
    """
    # padding=True pads only up to the longest sequence in this call (so not
    # at all for one sentence). padding="max_length" padded every input to the
    # model's maximum length, making each prediction process mostly padding.
    return tokenizer(text, padding=True, truncation=True, return_tensors="pt")

# 4. Function to get the predicted label for a single input text
def get_prediction(text):
    """Classify ``text`` and return the label name of the highest-scoring class.

    The text is tokenized with ``tokenize_input``, scored by ``model`` without
    gradient tracking, and the winning class index is looked up in
    ``label_mapping`` (whose keys are the indices as strings).
    """
    encoded = tokenize_input(text)

    # Inference only: no gradients needed
    with torch.no_grad():
        scores = model(**encoded).logits

    best_index = scores.argmax(dim=1).item()

    # The mapping was loaded from JSON, so its keys are strings
    return label_mapping[str(best_index)]

# 5. Sample inputs: one standalone sentence first...
test_text = "Hello! How are you?"

# 6. ...then a short list of further examples
test_texts = ["Hello!", "What is your name?"]

# Predict and report each sample in that order, one shared loop for both
for text in [test_text, *test_texts]:
    predicted_label = get_prediction(text)
    print(f"Input text: {text}")
    print(f"Predicted label: {predicted_label}")

Input text: Hello! How are you?
Predicted label: request_joke
Input text: Hello!
Predicted label: request_joke
Input text: What is your name?
Predicted label: request_joke


In [12]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd  # To handle Excel format

# 1. Load the trained model and tokenizer
# The training cell saves its artifacts to "./trained_1model". Loading
# "./trained_model" only worked when an earlier run had left that directory
# behind, and it would silently use that older checkpoint.
model_path = "./trained_1model"  # Path to the directory where the model is saved
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# 2. Load the label mapping from the JSON file
# Keys are class indices as strings, values are the original label names.
label_mapping_path = './label_mapping.json'  # Path to the label mapping file
with open(label_mapping_path, 'r', encoding='utf-8') as f:
    label_mapping = json.load(f)

# 3. Define a function to tokenize the input text
def tokenize_input(text):
    """Tokenize ``text`` into PyTorch tensors that can be unpacked into the model call.

    Args:
        text: The input to classify (a single string, or a list of strings).

    Returns:
        The tokenizer's encoding, produced with ``return_tensors="pt"``.
    """
    # padding=True pads only up to the longest sequence in this call (so not
    # at all for one sentence). padding="max_length" padded every input to the
    # model's maximum length, making each prediction process mostly padding.
    return tokenizer(text, padding=True, truncation=True, return_tensors="pt")

# 4. Function to get the predicted label for a single input text
def get_prediction(text):
    """Return the label name predicted for ``text``.

    Steps: tokenize with ``tokenize_input``, run ``model`` with gradients
    disabled, take the index of the largest logit, and translate that index
    through ``label_mapping`` (string keys).
    """
    model_inputs = tokenize_input(text)

    with torch.no_grad():
        result = model(**model_inputs)
        class_scores = result.logits

    winner = torch.argmax(class_scores, dim=1).item()
    mapping_key = str(winner)  # JSON object keys are always strings
    return label_mapping[mapping_key]

# 5. Test the model with multiple inputs
test_texts = ["Hello!", "What is your name?", "Tell me a joke.", "What's the weather like?"]

# Collect one record per input, in input order, ready to become table rows
results = [
    {"Input text": text, "Predicted label": get_prediction(text)}
    for text in test_texts
]

# 6. Convert the results to a pandas DataFrame
# A dedicated name is used so the training DataFrame `df` from the first cell
# is not overwritten; re-running earlier cells after this one would otherwise
# read the predictions table instead of the dataset.
predictions_df = pd.DataFrame(results)

# 7. Save the DataFrame to an Excel file
output_path = "./predictions.xlsx"
predictions_df.to_excel(output_path, index=False)

print(f"Predictions saved to {output_path}")


Predictions saved to ./predictions.xlsx


In [11]:
pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
   ---------------------------------------- 0.0/250.9 kB ? eta -:--:--
   ---------------------------------------- 0.0/250.9 kB ? eta -:--:--
   - -------------------------------------- 10.2/250.9 kB ? eta -:--:--
   ---- ---------------------------------- 30.7/250.9 kB 262.6 kB/s eta 0:00:01
   --------- ----------------------------- 61.4/250.9 kB 365.7 kB/s eta 0:00:01
   ---------------------------------------  245.8/250.9 kB 1.3 MB/s eta 0:00:01
   ---------------------------------------  245.8/250.9 kB 1.3 MB/s eta 0:00:01
   ---------------------------------------  245.8/250.9 kB 1.3 MB/s eta 0:00:01
   -------------------------------------- 250.9/250.9 kB 810.3 kB/s eta 0:00:00
Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
In



In [None]:
import pdfplumber
import json

def extract_annotations_from_pdf(pdf_path):
    """Extract ``text,label`` annotation pairs from every page of a PDF.

    Each extracted line is parsed as one CSV record, so a quoted text may
    itself contain commas (``"Hello, there","greeting"``). Lines that do not
    yield exactly two non-empty fields are skipped.

    Args:
        pdf_path: Path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        A list of ``{"text": ..., "label": ...}`` dicts in page/line order.
    """
    import csv  # local import: only this function needs it

    annotations = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if not page_text:
                # Nothing extractable on this page
                continue

            for line in page_text.split('\n'):
                # A plain split(',') counted commas inside quoted text and
                # silently dropped such lines; csv honours the quotes.
                try:
                    fields = next(csv.reader([line], skipinitialspace=True), [])
                except csv.Error:
                    continue
                if len(fields) != 2:
                    continue

                text, label = (field.strip().strip('"') for field in fields)
                # A pair with an empty side is not a usable training example
                if text and label:
                    annotations.append({"text": text, "label": label})

    return annotations

# Source PDF holding the annotated examples
pdf_path = 'dataset_annotations.pdf'
annotations = extract_annotations_from_pdf(pdf_path)

# Serialise the extracted records as pretty-printed JSON
with open('annotations.json', 'w') as json_file:
    json_file.write(json.dumps(annotations, indent=4))

print("Annotations extracted and saved to annotations.json")
