In [1]:
!pip install torch transformers flask beautifulsoup4 requests nltk



In [2]:
from transformers import BertForSequenceClassification, BertTokenizer

# One checkpoint name drives both the tokenizer and the classifier so the
# vocabulary and the weights always come from the same pretrained model.
model_name = "bert-base-uncased"
tokenizer, model = (
    BertTokenizer.from_pretrained(model_name),
    BertForSequenceClassification.from_pretrained(model_name),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
import pandas as pd
import re
import torch
import unicodedata
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder

# Clean text function
def clean_text(text):
    """Normalize a raw string to single-spaced ASCII text.

    Steps, in order:
      1. Map curly apostrophes and curly double quotes to ' and ".
      2. NFKD-decompose the text and drop every character that is still
         non-ASCII (accented letters keep their base letter).
      3. Replace runs of carriage returns, newlines, tabs and backslashes
         with a single space.
      4. Collapse whitespace runs and strip leading/trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned ASCII string (may be empty).
    """
    # Quote normalization must happen BEFORE the ASCII fold below: NFKD does
    # not decompose curly quotes, so encode(..., 'ignore') would delete them
    # outright and contractions would lose their apostrophe.
    text = re.sub(r"[\u2019\u2018]", "'", text)  # Normalize apostrophes
    text = re.sub(r"[\u201c\u201d]", '"', text)  # Normalize quotation marks
    # After this line the text is pure ASCII, so no further non-ASCII
    # filtering is needed.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    text = re.sub(r"[\r\n\t\\]+", " ", text)  # Replace newlines, tabs, and backslashes
    return ' '.join(text.split())  # Remove extra spaces

# Read the raw prompt/response pairs
df = pd.read_json('Psychology-10K.json')

# Clean both text columns; any value that is not a string becomes None
df = df.assign(
    input=df['input'].apply(lambda x: clean_text(x) if isinstance(x, str) else None),
    output=df['output'].apply(lambda x: clean_text(x) if isinstance(x, str) else None),
)
# Drop every row where either column ended up as None
df.dropna(subset=['input', 'output'], inplace=True)

# Turn each distinct response string into an integer class id
label_encoder = LabelEncoder().fit(df['output'])
df['output'] = label_encoder.transform(df['output'])

# Tokenizer used to encode the inputs
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class SimpleTextDataset(Dataset):
    """Map-style dataset pairing input texts with their class labels.

    Items are tokenized lazily, on access, with the tokenizer passed to the
    constructor.

    Args:
        tokenizer: Object exposing ``encode_plus(text, padding=..., truncation=...,
            max_length=..., return_tensors=...)`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        input_texts: Indexable sequence of input strings.
        output_texts: Indexable sequence of labels, aligned with ``input_texts``.
        max_length: Length every example is padded/truncated to.
    """

    def __init__(self, tokenizer, input_texts, output_texts, max_length=512):
        self.input_texts = input_texts
        self.output_texts = output_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples (length of ``input_texts``)."""
        return len(self.input_texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return it with its label.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (batch axis
            removed) and ``'labels'`` (the label wrapped in a tensor).
        """
        source = self.input_texts[idx]
        target = self.output_texts[idx]  # These are now integer labels

        # Use the tokenizer given to this dataset, not whatever global named
        # `tokenizer` happens to exist in the notebook.
        source_encodings = self.tokenizer.encode_plus(
            source,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        return {
            # squeeze(0) removes only the batch axis added by return_tensors="pt";
            # a bare squeeze() would also drop the sequence axis when it has size 1.
            'input_ids': source_encodings['input_ids'].squeeze(0),
            'attention_mask': source_encodings['attention_mask'].squeeze(0),
            'labels': torch.tensor(target)  # Ensure labels are tensors
        }

# Prepare dataset: plain Python lists of cleaned inputs and integer labels
input_data = df['input'].tolist()
output_data = df['output'].tolist()
text_dataset = SimpleTextDataset(tokenizer, input_data, output_data)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=5,
    logging_steps=10,
    save_steps=10_000,
    save_total_limit=2,
)

# Seed torch BEFORE creating the model. The classification head is randomly
# initialized by from_pretrained (see the "newly initialized" warning), and the
# Trainer applies `training_args.seed` only once it is constructed, which is
# after the head already exists. Without this, every run starts from a
# different head.
torch.manual_seed(training_args.seed)

# Initialize model with one output class per distinct encoded label
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=text_dataset,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [4]:
# Train the model: run the training loop of `trainer` (built from `model`,
# `training_args` and `text_dataset`). As the cell's last expression, the value
# returned by `train()` is displayed as the cell result.
trainer.train()

  0%|          | 0/3940 [00:00<?, ?it/s]

{'loss': 9.2038, 'grad_norm': 9.095703125, 'learning_rate': 4.9987309644670054e-05, 'epoch': 0.0}
{'loss': 9.0968, 'grad_norm': 8.71916389465332, 'learning_rate': 4.9974619289340105e-05, 'epoch': 0.0}
{'loss': 9.4081, 'grad_norm': 9.669864654541016, 'learning_rate': 4.9961928934010156e-05, 'epoch': 0.0}
{'loss': 9.1252, 'grad_norm': 10.478925704956055, 'learning_rate': 4.994923857868021e-05, 'epoch': 0.0}
{'loss': 9.2658, 'grad_norm': 7.879977226257324, 'learning_rate': 4.993654822335025e-05, 'epoch': 0.0}
{'loss': 9.2638, 'grad_norm': 7.147434234619141, 'learning_rate': 4.992385786802031e-05, 'epoch': 0.0}
{'loss': 9.1345, 'grad_norm': 8.272334098815918, 'learning_rate': 4.991116751269036e-05, 'epoch': 0.0}
{'loss': 9.1591, 'grad_norm': 7.074320316314697, 'learning_rate': 4.9898477157360406e-05, 'epoch': 0.0}
{'loss': 9.3999, 'grad_norm': 7.220443248748779, 'learning_rate': 4.988578680203046e-05, 'epoch': 0.0}
{'loss': 9.0425, 'grad_norm': 6.652811527252197, 'learning_rate': 4.9873096

TrainOutput(global_step=3940, training_loss=9.209251903882487, metrics={'train_runtime': 983.142, 'train_samples_per_second': 20.03, 'train_steps_per_second': 4.008, 'train_loss': 9.209251903882487, 'epoch': 2.0})

In [12]:
def predict_response(input_text, model, tokenizer, encoder=None):
    """Classify ``input_text`` and return the decoded label of the top class.

    Args:
        input_text: Text to classify.
        model: Sequence-classification model; must expose ``eval()``,
            ``device`` and return an object with ``.logits`` when called.
        tokenizer: Tokenizer exposing ``encode_plus`` (same settings as used
            for training: pad/truncate to 512 tokens, PyTorch tensors).
        encoder: Object with ``inverse_transform`` mapping class indices back
            to labels. Defaults to the notebook-level ``label_encoder`` so
            existing three-argument calls keep working.

    Returns:
        The label whose class index has the highest logit.
    """
    if encoder is None:
        # Fall back to the encoder fitted during preprocessing.
        encoder = label_encoder

    model.eval()  # Set model to evaluation mode
    # Preprocess and tokenize the input text
    encoded_input = tokenizer.encode_plus(
        input_text,
        max_length=512,  # Same maximum length as used during training
        truncation=True,
        padding='max_length',
        return_tensors='pt'  # Return PyTorch tensors
    )

    # Move the tensors to the same device as the model
    encoded_input = {key: tensor.to(model.device) for key, tensor in encoded_input.items()}

    # Make prediction without tracking gradients
    with torch.no_grad():
        output = model(**encoded_input)

    # Logits are unnormalized scores (not probabilities); argmax over the
    # class axis picks the same class a softmax would.
    logits = output.logits
    predicted_class_index = logits.argmax(-1).item()

    # Reverse the label encoding to get the original label back
    predicted_label = encoder.inverse_transform([predicted_class_index])[0]

    return predicted_label

# Example usage: classify a one-word prompt
input_text = "lonely"

# Put the model on the GPU when one is available, otherwise on the CPU
if torch.cuda.is_available():
    model = model.to('cuda')
else:
    model = model.to('cpu')

response = predict_response(input_text, model, tokenizer)
print(f"Predicted Response: {response}")

Predicted Response: It's important to cultivate a positive and realistic self-image. Let's work together to explore your beliefs and attitudes about your body, and develop strategies for improving your self-esteem. This might include practicing self-compassion, engaging in physical activity you enjoy, and challenging negative self-talk.
