In [6]:
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from transformers import logging

# Raise the transformers library's log level to INFO so that model loading,
# checkpointing and evaluation messages are shown in the cell output.
logging.set_verbosity(logging.INFO)

# Load the JSON data
def load_json_data(json_path):
    """Read a JSON file and return its parsed contents.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8: without an encoding argument, open() falls
    # back to the platform default (e.g. cp1252 on Windows), which corrupts or
    # rejects non-ASCII text in an otherwise valid JSON file.
    with open(json_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Convert JSON data to a pandas DataFrame
def json_to_dataframe(data):
    """Build the training DataFrame and the label -> class-index mapping.

    Args:
        data: Records accepted by ``pd.DataFrame`` that provide ``question``
            and ``label`` fields.

    Returns:
        A ``(df, label_mapping)`` tuple. ``df`` gains a ``text`` column (the
        ``question`` values cast to ``str``) and its ``label`` column is
        replaced by integer class indices. ``label_mapping`` maps each
        original label to its index, numbered in order of first appearance.

    Raises:
        KeyError: If the ``question`` or ``label`` column is missing.
    """
    df = pd.DataFrame(data)

    # Fail early with a message that names every missing column instead of a
    # bare KeyError on the first failed lookup.
    missing = [col for col in ('question', 'label') if col not in df.columns]
    if missing:
        raise KeyError(f"dataset is missing required column(s): {missing}")

    # Create 'text' column from 'question'
    df['text'] = df['question'].astype(str)

    # .tolist() turns NumPy scalars (produced for numeric labels) into native
    # Python values, so the mapping's keys stay serialisable with json.dump.
    unique_labels = df['label'].unique().tolist()
    label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
    df['label'] = df['label'].map(label_mapping)

    return df, label_mapping

# Read the raw records and turn them into a DataFrame with numeric labels
json_path = 'dataset.json'  # Path to your JSON file
data = load_json_data(json_path)
df, label_mapping = json_to_dataframe(data)

# Persist the label -> index mapping so that predicted class indices can be
# translated back into the original labels later on
label_mapping_path = "./label_mappings.json"
with open(label_mapping_path, 'w') as f:
    f.write(json.dumps(label_mapping))

# Wrap the DataFrame in a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Tokenizer for the same "bert-base-uncased" checkpoint that the model is loaded from
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples``.

    The tokenizer is called with truncation enabled and ``padding="max_length"``;
    its output is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Split the dataset into train and eval datasets (80% training, 20% evaluation).
# The seed is fixed so that "Restart Kernel -> Run All" reproduces the same
# split; without it the shuffle differs on every run and the evaluation
# metrics are not comparable between runs.
split_datasets = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_datasets["train"]
eval_dataset = split_datasets["test"]

# Tokenize the datasets (batched=True hands tokenize_function batches of rows)
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# Model and training arguments.
# The classification head gets one output per distinct label in the dataset.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_mapping))

training_args = TrainingArguments(
    output_dir="./results",             # Directory to store results (was misspelled "./resultss")
    eval_strategy="epoch",              # Evaluate at the end of each epoch; replaces the deprecated
                                        # `evaluation_strategy` name (requires transformers >= 4.41)
    logging_dir="./logs",               # Directory to store logs
    logging_steps=10,                   # Log every 10 steps
    learning_rate=2e-5,                 # Learning rate
    per_device_train_batch_size=4,      # Batch size per device
    num_train_epochs=3,                 # Number of training epochs
    weight_decay=0.01,                  # Weight decay
    save_steps=500,                     # Save every 500 steps
    save_total_limit=2,                 # Only save the 2 most recent checkpoints
    report_to="none"                    # Disable reporting to WandB/MLflow
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset
)

# Train the model (left as the last expression so the TrainOutput is displayed)
trainer.train()



loading configuration file config.json from cache at C:\Users\Test\.cache\huggingface\hub\models--bert-base-uncased\snapshots\86b5e0934494bd15c9632b12f734a8a67f723594\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.45.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at C:\Users\Test\.cache\huggingface\hub\models--bert-base-uncased\snapshots\86b5e0934494bd15c9632b12f734a8a67f723594\vocab.txt

{'eval_loss': 3.0858678817749023, 'eval_runtime': 7.6094, 'eval_samples_per_second': 0.526, 'eval_steps_per_second': 0.131, 'epoch': 1.0}


 67%|██████▋   | 8/12 [04:35<02:16, 34.03s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, question. If text, question are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 4
  Batch size = 8

 67%|██████▋   | 8/12 [04:44<02:16, 34.03s/it]

{'eval_loss': 3.1486618518829346, 'eval_runtime': 9.1379, 'eval_samples_per_second': 0.438, 'eval_steps_per_second': 0.109, 'epoch': 2.0}


 83%|████████▎ | 10/12 [05:47<01:08, 34.40s/it]

{'loss': 2.9693, 'grad_norm': 17.780179977416992, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.5}


100%|██████████| 12/12 [06:55<00:00, 34.03s/it]Saving model checkpoint to ./results\checkpoint-12
Configuration saved in ./results\checkpoint-12\config.json
Model weights saved in ./results\checkpoint-12\model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, question. If text, question are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 4
  Batch size = 8

100%|██████████| 12/12 [07:10<00:00, 34.03s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 12/12 [07:10<00:00, 35.84s/it]

{'eval_loss': 3.2011044025421143, 'eval_runtime': 8.8401, 'eval_samples_per_second': 0.452, 'eval_steps_per_second': 0.113, 'epoch': 3.0}
{'train_runtime': 430.0653, 'train_samples_per_second': 0.112, 'train_steps_per_second': 0.028, 'train_loss': 2.9797221024831138, 'epoch': 3.0}





TrainOutput(global_step=12, training_loss=2.9797221024831138, metrics={'train_runtime': 430.0653, 'train_samples_per_second': 0.112, 'train_steps_per_second': 0.028, 'total_flos': 12631371743232.0, 'train_loss': 2.9797221024831138, 'epoch': 3.0})

In [7]:
# Destination directory for the fine-tuned model and its tokenizer
model_save_path = "./trained_model320"

# Write the model through the Trainer, then the tokenizer files into the same
# directory
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

# Run evaluation on the Trainer's eval dataset and report the metrics
eval_results = trainer.evaluate()
message = f"Evaluation results: {eval_results}"
print(message)


Saving model checkpoint to ./trained_model320
Configuration saved in ./trained_model320\config.json
Model weights saved in ./trained_model320\model.safetensors
tokenizer config file saved in ./trained_model320\tokenizer_config.json
Special tokens file saved in ./trained_model320\special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, question. If text, question are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 4
  Batch size = 8
100%|██████████| 1/1 [00:00<00:00, 1002.22it/s]

Evaluation results: {'eval_loss': 3.2011044025421143, 'eval_runtime': 9.5738, 'eval_samples_per_second': 0.418, 'eval_steps_per_second': 0.104, 'epoch': 3.0}





In [14]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pyttsx3

# Load the label mapping.
# The training cell writes this file as "./label_mappings.json"; the path here
# previously read "./label_mapping.json", which on a fresh run either does not
# exist or refers to a stale file from an earlier experiment.
label_mapping_path = "./label_mappings.json"
with open(label_mapping_path, 'r') as f:
    label_mapping = json.load(f)

# Reverse the label mapping (label -> index) to get labels from indices
reversed_label_mapping = {v: k for k, v in label_mapping.items()}

# Load the trained model and tokenizer
model_save_path = "./trained_model320"  # Path to the trained model
tokenizer = AutoTokenizer.from_pretrained(model_save_path)
model = AutoModelForSequenceClassification.from_pretrained(model_save_path)

# Set the model to evaluation mode
model.eval()

# Initialize the text-to-speech engine
engine = pyttsx3.init()

# Prediction helper
def predict(texts):
    """Return the predicted class index for each entry in ``texts``.

    The texts are tokenized as one padded, truncated batch of PyTorch tensors,
    passed through the model without gradient tracking, and the arg-max over
    the last dimension of the logits is returned as a tensor.
    """
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

    # Inference only: skip building the autograd graph
    with torch.no_grad():
        logits = model(**encoded).logits

    return logits.argmax(dim=-1)

# Text-to-speech helper
def speak(text):
    """Queue ``text`` on the text-to-speech engine, then call ``runAndWait``."""
    tts_engine = engine
    tts_engine.say(text)
    tts_engine.runAndWait()

# Questions to classify
test_data = ["What is your greatest strength?"]

# Predicted class index for every question
predictions = predict(test_data)

# Translate each class index back into its original label
predicted_labels = [reversed_label_mapping[class_idx] for class_idx in predictions.tolist()]

# Show each result, then read it aloud
for text, prediction in zip(test_data, predicted_labels):
    print(f"Question: {text}\nPredicted Answer: {prediction}\n")

    speech_output = f"For the question '{text}', the predicted answer is: {prediction}. Is there anything else you would like to know?"
    speak(speech_output)


loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./trained_model320\config.json
Model config BertConfig {
  "_name_or_path": "./trained_model320",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19"
  },
  "initializer_range": 0.02,
  "interme

Question: Tell me about yourself.
Predicted Answer: I admire the company's commitment to innovation and values, and I see this as an opportunity to contribute to meaningful projects.



In [12]:
# Install the text-to-speech dependency into the running kernel's environment; pinned to the version this notebook was run with.
%pip install pyttsx3==2.98

Collecting pyttsx3Note: you may need to restart the kernel to use updated packages.





  Downloading pyttsx3-2.98-py3-none-any.whl.metadata (3.8 kB)
Collecting comtypes (from pyttsx3)
  Downloading comtypes-1.4.7-py3-none-any.whl.metadata (6.5 kB)
Collecting pypiwin32 (from pyttsx3)
  Downloading pypiwin32-223-py3-none-any.whl.metadata (236 bytes)
Downloading pyttsx3-2.98-py3-none-any.whl (34 kB)
Downloading comtypes-1.4.7-py3-none-any.whl (226 kB)
Downloading pypiwin32-223-py3-none-any.whl (1.7 kB)
Installing collected packages: pypiwin32, comtypes, pyttsx3
Successfully installed comtypes-1.4.7 pypiwin32-223 pyttsx3-2.98
