In [None]:
!pip install transformers datasets torch pandas

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [4]:
import pandas as pd
import torch
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
import numpy as np

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def prepare_datasets(train_data, val_data, test_data):
    """
    Prepare datasets for training

    Args:
        train_data (pd.DataFrame): Training data
        val_data (pd.DataFrame): Validation data
        test_data (pd.DataFrame): Test data

    Returns:
        Tuple of prepared datasets
    """
    def combine_input(row):
        return (
            f"Emotion: {row['emotion']} | "
            f"Context: {row['context']} | "
            f"Trait: {row['trait']} | "
            f"Topic: {row['topic']} | "
            f"Advice: {row['advice']}"
        )

    # Prepare datasets
    train_data['full_input'] = train_data.apply(combine_input, axis=1)
    val_data['full_input'] = val_data.apply(combine_input, axis=1)
    test_data['full_input'] = test_data.apply(combine_input, axis=1)

    # Convert to Hugging Face datasets
    train_dataset = Dataset.from_pandas(train_data[['full_input']])
    val_dataset = Dataset.from_pandas(val_data[['full_input']])
    test_dataset = Dataset.from_pandas(test_data[['full_input']])

    return train_dataset, val_dataset, test_dataset

def tokenize_function(tokenizer, examples):
    """
    Tokenize input texts

    Args:
        tokenizer (GPT2Tokenizer): Tokenizer to use
        examples (dict): Dictionary of input texts

    Returns:
        Tokenized inputs
    """
    return tokenizer(
        examples['full_input'],
        padding='max_length',
        truncation=True,
        max_length=256
    )

def train_advice_model(train_dataset, val_dataset, model_name='gpt2', learning_rate=5e-5):
    """
    Train the advice generation model

    Args:
        train_dataset (Dataset): Training dataset
        val_dataset (Dataset): Validation dataset
        model_name (str): Base model to use
        learning_rate (float): Learning rate for training

    Returns:
        Tuple of trained model and tokenizer
    """
    # Load tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Configure tokenizer
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'left'

    # Move model to device
    model.to(device)

    # Tokenize datasets
    tokenize_func = lambda examples: tokenize_function(tokenizer, examples)
    tokenized_train = train_dataset.map(
        tokenize_func,
        batched=True,
        remove_columns=['full_input']
    )
    tokenized_val = val_dataset.map(
        tokenize_func,
        batched=True,
        remove_columns=['full_input']
    )

    # Data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir='./advice_model',
        evaluation_strategy='epoch',
        learning_rate=learning_rate,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=5,
        weight_decay=0.01,
        push_to_hub=False,
        logging_dir='./logs',
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        data_collator=data_collator,
    )

    # Train model
    trainer.train()

    # Save model
    model.save_pretrained('./advice_model')
    tokenizer.save_pretrained('./advice_model')

    return model, tokenizer

def generate_advice(model, tokenizer, emotion, context, trait, topic, max_length=50):
    """
    Generate concise and accurate advice based on input parameters.

    Args:
        model (GPT2LMHeadModel): Trained model
        tokenizer (GPT2Tokenizer): Tokenizer
        emotion (str): Emotion of the scenario
        context (str): Context of the situation
        trait (str): Personality trait
        topic (str): Topic of advice
        max_length (int): Maximum length of generated advice

    Returns:
        str: Generated advice
    """
    # Construct input text
    input_text = (
        f"Emotion: {emotion} | "
        f"Context: {context} | "
        f"Trait: {trait} | "
        f"Topic: {topic} | "
        f"Advice: "
    )

    # Tokenize input
    inputs = tokenizer(
        input_text,
        return_tensors='pt',
        padding=True,
        truncation=True
    ).to(device)

    # Generate advice with adjusted parameters for accuracy and brevity
    outputs = model.generate(
        inputs.input_ids,
        max_length=100,  # Increase the upper bound
        min_length=50,   # Ensure longer, meaningful responses
        num_return_sequences=1,
        no_repeat_ngram_size=3,  # Prevent repetitive phrases
        top_k=50,  # Consider a slightly broader range of options for creativity
        top_p=0.95,  # Increase flexibility while focusing on high-probability tokens
        temperature=0.7,  # Add slight randomness for natural and nuanced advice
        do_sample=True
    )


    # Decode and extract advice
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract advice part (after "Advice: ")
    advice_start = generated_text.find("Advice: ") + len("Advice: ")
    advice = generated_text[advice_start:].strip()

    return advice

def main():
    """
    Main function to demonstrate model training and inference
    """
    # Load your datasets (replace with your actual data loading)
    try:
        train_data = pd.read_csv("/content/sample_data/train.csv")
        val_data = pd.read_csv("/content/sample_data/val.csv")
        test_data = pd.read_csv("/content/sample_data/test.csv")
    except FileNotFoundError:
        print("Error: CSV files not found. Please ensure train.csv, val.csv, and test.csv exist.")
        return

    # Prepare datasets
    train_dataset, val_dataset, _ = prepare_datasets(train_data, val_data, test_data)

    # Train model
    print("Training advice generation model...")
    model, tokenizer = train_advice_model(train_dataset, val_dataset)

    # Example inference
    print("\nGenerating sample advice...")
    sample_advice = generate_advice(
        model,
        tokenizer,
        emotion="sadness",
        context="I failed my test and feel like I'm disappointing everyone",
        trait="INFJ",
        topic="academics"
    )
    print("Generated Advice:", sample_advice)

if __name__ == "__main__":
    main()

Training advice generation model...


Map:   0%|          | 0/1208 [00:00<?, ? examples/s]

Map:   0%|          | 0/259 [00:00<?, ? examples/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,1.861086
2,2.417000,1.732322
3,2.417000,1.698344
4,1.525000,1.686752
5,1.317400,1.697709


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Generating sample advice...
Generated Advice: ____________Failed tests hurt, but you’re still learning. Focus on your goals and approach the test with curiosity. Remember, every effort counts, and you‘re capable of growing. Practice, practice, and trust that your hard work will pay off—you’ve got this.


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the trained model and tokenizer from the saved directory
model = GPT2LMHeadModel.from_pretrained('./advice_model')
tokenizer = GPT2Tokenizer.from_pretrained('./advice_model')

# Move the model to the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the interactive advice generator function
def interactive_advice_generator(model, tokenizer):
    """
    Interactive function to generate advice based on user inputs.

    Args:
        model: The trained GPT-2 model.
        tokenizer: The tokenizer for the model.

    Returns:
        None
    """
    print("Interactive Advice Generator")
    print("=============================")

    while True:
        # Collect user inputs
        emotion = input("Enter the emotion (e.g., sadness, happiness): ").strip()
        context = input("Enter the context of the situation: ").strip()
        trait = input("Enter the personality trait (e.g., INFJ, ENFP): ").strip()
        topic = input("Enter the topic for advice (e.g., academics, relationships): ").strip()

        # Generate advice
        advice = generate_advice(
            model=model,
            tokenizer=tokenizer,
            emotion=emotion,
            context=context,
            trait=trait,
            topic=topic,
            max_length=100  # Adjust as needed
        )

        # Display the generated advice
        print("\nGenerated Advice:")
        print(advice)

        # Ask if the user wants to continue
        continue_response = input("\nDo you want to generate another advice? (yes/no): ").strip().lower()
        if continue_response != 'yes':
            print("\nThank you for using the Advice Generator!")
            break

# Run the interactive generator
interactive_advice_generator(model, tokenizer)


Interactive Advice Generator
Enter the emotion (e.g., sadness, happiness): sadness
Enter the context of the situation: My TikTok flopped, and now I’m doubting my content.
Enter the personality trait (e.g., INFJ, ENFP): ENFJ
Enter the topic for advice (e.g., academics, relationships): social media


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Generated Advice:
Â Use a different social media channel to self-discovery and celebrate your growth. Embrace your inherent creativity and resilience. You’re on the right track—you’ve got this! Start small, but remember, no one expects perfection.’s perfection is just a small

Do you want to generate another advice? (yes/no): no

Thank you for using the Advice Generator!


In [None]:
from sentence_transformers import SentenceTransformer
from torch.nn.functional import cosine_similarity

# ======= 1. Save the Retrained Model =======
# Define the directory to save the model
output_model_path = "/content/retrained_sentencebert_model"

# Save the model
model.save(output_model_path)
print(f"Model saved at: {output_model_path}")

# ======= 2. Load the Retrained Model =======
# Load the retrained SentenceBERT model
retrained_model = SentenceTransformer(output_model_path)
print("Retrained model loaded successfully!")

# ======= 3. Input GPT-2 Outputs and Reference Sentences =======
# GPT-2 outputs (Generated sentences)
gpt2_outputs = [
    "Embrace this step toward greater fulfillment and growth. Cherish the moment of your journey and remember that love is a sanctuary of unconditional love.",
    "It's actually all about the possibilities. Focus on what'll come your way, and remember - you're capable of amazing things. Your journey is not a destination."
    "This concert is the perfect way to reconnect with loved ones. Music is a way to celebrate your connnection and express yourself. Keep the energy alive!"
    ""
]

# Reference sentences to compare against
reference_sentences = [
    "Regular exercise is important for maintaining good health.",
    "Include fruits and vegetables in your daily meals to stay healthy."
]

# ======= 4. Encode the Sentences =======
# Encode GPT-2 outputs and reference sentences
gpt2_embeddings = retrained_model.encode(gpt2_outputs, convert_to_tensor=True)
reference_embeddings = retrained_model.encode(reference_sentences, convert_to_tensor=True)

# ======= 5. Compute Semantic Similarity =======
# Compute cosine similarity for each pair
similarities = cosine_similarity(gpt2_embeddings, reference_embeddings)

# Print individual similarity scores
print("\nSimilarity Scores for GPT-2 Outputs vs Reference Sentences:")
for i, sim in enumerate(similarities):
    print(f"Generated Sentence {i+1} vs Reference: {sim.item():.4f}")

# Compute the average similarity across all pairs
average_similarity = similarities.mean().item()
print(f"\nAverage Semantic Similarity: {average_similarity:.4f}")

# # ======= 6. Save the Model for Future Use =======
# # Save the model to Google Drive (Optional)
# drive_model_path = "/content/drive/My Drive/retrained_sentencebert_model"
# model.save(drive_model_path)
# print(f"Model saved to Google Drive at: {drive_model_path}")
