In [1]:
import re
import os
import sys
import io
import json
from dotenv import load_dotenv
import gradio as gr
from pathlib import Path
from datetime import datetime
import requests
import subprocess
from IPython.display import Markdown, display, update_display

# Hugging Face imports
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer

# Initialize Hugging Face model
def initialize_model(model_name="meta-llama/Llama-3-8B-Instruct"):
    """Initialize the Hugging Face model and tokenizer"""
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    
    # For larger models, you might want to enable model offloading to manage memory
    model = AutoModelForCausalLM.from_pretrained(
        model_name, 
        device_map=device,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32
    )
    return model, tokenizer

# Load the model once at startup
model_name = "meta-llama/Llama-3-8B-Instruct"  # Change to your preferred model
try:
    model, tokenizer = initialize_model(model_name)
    print(f"Model {model_name} loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Using a fallback model: mistralai/Mistral-7B-Instruct-v0.2")
    try:
        model, tokenizer = initialize_model("mistralai/Mistral-7B-Instruct-v0.2")
    except Exception as e2:
        print(f"Error loading fallback model: {e2}")
        print("Please check your Hugging Face credentials and model availability")

# Prompts definition
system_message = """You are a helpful assistant whose main purpose is to generate datasets for a given business problem."""

def get_user_prompt_tabular(business_problem, dataset_format, file_format, num_samples):
    user_message = f"""
    The business problem is: {business_problem}. \n
    The dataset is expected to be in {dataset_format}. 
    For the dataset types such as tabular or time series implement python code for creating the dataset.
    If the generated dataset contains several entities, i.e. products, users, write the output for these entities into separate files. 
    The dependencies for python code should include only standard python libraries such as numpy, pandas and built-in libraries. 
    The output dataset is stored as a {file_format} file and contains {num_samples} samples. \n    
    """
    return user_message

def get_user_prompt_text(business_problem, dataset_format, file_format):
    user_message = f"""
    The business problem is: {business_problem}. \n
    The dataset is expected to be in {dataset_format}. 
    For the text type return the generated dataset and the python code to write the output to the files.
    If the generated dataset contains several entities, i.e. products, users, write the output for these entities into separate files. 
    The dependencies for python code should include only standard python libraries such as numpy, pandas and built-in libraries. 
    The output dataset is stored as a {file_format} file. \n    
    """
    return user_message

def select_user_prompt(business_problem, dataset_format, file_format, num_samples):
    user_prompt = ""
    if dataset_format == "Text":
        user_prompt = get_user_prompt_text(business_problem, dataset_format, file_format)
    elif dataset_format in ["Tabular", "Time-series"]:
        user_prompt = get_user_prompt_tabular(business_problem, dataset_format, file_format, num_samples)
    return user_prompt

# Function to stream from Hugging Face models
def stream_huggingface(business_problem, dataset_format, file_format, num_samples):
    user_prompt = select_user_prompt(
        business_problem, dataset_format, file_format, num_samples
    )
    
    # Prepare messages in chat format
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt}
    ]
    
    # Apply chat template based on model architecture
    try:
        inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)
    except:
        # Fallback for models that don't support chat templates
        combined_prompt = f"System: {system_message}\n\nUser: {user_prompt}\n\nAssistant:"
        inputs = tokenizer(combined_prompt, return_tensors="pt").to(model.device)
    
    # Set up a text streamer for incremental output
    streamer = TextStreamer(tokenizer)
    
    # Initialize response tracking
    response = ""
    
    # Create a generator for streaming
    def generate_stream():
        nonlocal response
        
        # Generate with streaming
        with torch.no_grad():
            output_ids = model.generate(
                inputs.input_ids,
                max_new_tokens=2000,
                do_sample=True,
                temperature=0.7,
                top_p=0.95,
                streamer=streamer
            )
            
            # Get the full response once
            full_response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
            
            # Extract just the assistant's response by removing the prompt
            try:
                # Different models have various formats, attempt to extract just the response
                if "Assistant:" in full_response:
                    response = full_response.split("Assistant:", 1)[1].strip()
                else:
                    # Get all content after the last occurrence of the user prompt
                    response = full_response[len(tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)):].strip()
            except:
                # Fallback - just use the full response
                response = full_response
            
            return response
    
    # Execute generation and return the complete result
    result = generate_stream()
    
    # Return the response for the UI
    yield result
    return result

def generate_dataset(business_problem, dataset_format, file_format, num_samples):
    result = stream_huggingface(business_problem, dataset_format, file_format, num_samples)
    for stream_so_far in result:
        yield stream_so_far
    return result

def extract_code(text):
    # Regular expression to find text between ```python and ```
    match = re.search(r"```python(.*?)```", text, re.DOTALL)

    if match:
        code = match.group(0).strip()  # Extract and strip extra spaces
    else:
        code = ""
        print("No matching substring found.")

    return code.replace("```python\n", "").replace("```", "")

def execute_code_in_virtualenv(text, python_interpreter=sys.executable):
    """
    Execute the given Python code string within the specified virtual environment.
    
    Args:
    - code_str: str, the Python code to execute.
    - venv_dir: str, the directory path to the virtual environment created by pipenv.
    """
    # Check if executing within the specified virtual environment interpreter
    if not python_interpreter:
        raise EnvironmentError("Python interpreter not found in the specified virtual environment.")

    # Prepare the command to execute the code
    code_str = extract_code(text)
    command = [python_interpreter, '-c', code_str]

    # Execute the command
    try:
        result = subprocess.run(command, check=True, capture_output=True, text=True)
        print("Output:", result.stdout)
        print("Errors:", result.stderr)
    except subprocess.CalledProcessError as e:
        print(f"An error occurred while executing the code: {e}")
        return f"Error: {e}\n\nStderr: {e.stderr}"
    return result.stdout

# Gradio interface
with gr.Blocks() as ui:
    gr.Markdown("## Create a dataset for a business problem using Hugging Face models")
    with gr.Column():
        business_problem = gr.Textbox(label="Business problem", lines=2)
        dataset_type = gr.Dropdown(
            ["Tabular", "Time-series", "Text"], label="Dataset modality"
        )
        dataset_format = gr.Dropdown(["JSON", "csv", "parquet", "Markdown"], label="Output format")
        num_samples = gr.Number(label="Number of samples (for tabular and time-series data)", value=10, precision=0)
    with gr.Row():
        dataset_run = gr.Button("Create a dataset")
        code_run = gr.Button("Execute code for a dataset")
    with gr.Row():
        dataset_out = gr.Textbox(label="Generated Dataset")
        code_out = gr.Textbox(label="Executed code")
    dataset_run.click(
        generate_dataset,
        inputs=[business_problem, dataset_type, dataset_format, num_samples],
        outputs=[dataset_out]
    )
    code_run.click(execute_code_in_virtualenv, inputs=[dataset_out], outputs=[code_out])

# Run the Gradio app
if __name__ == "__main__":
    ui.launch(inbrowser=True)

Loading model: meta-llama/Llama-3-8B-Instruct
Error loading model: meta-llama/Llama-3-8B-Instruct is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
Using a fallback model: mistralai/Mistral-7B-Instruct-v0.2
Loading model: mistralai/Mistral-7B-Instruct-v0.2


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Using device: cpu


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

KeyboardInterrupt: 