In [4]:
import requests
import pandas as pd
import numpy as np
from faker import Faker
import json


# Initialize Faker
fake = Faker()


# Function to interact with Ollama for schema generation
def get_schema_from_ollama(context):
    url = "http://localhost:11434/c api/completion"
    headers = {"Content-Type": "application/json"}
    payload = {
        "model": "llama3.2",  # Specify the Ollama model you are using
        "prompt": f"""
        You are a data generation assistant. Based on the following business context:
        {context}


        Generate a dataset schema with fields, data types, and example values. Provide the schema in JSON format.
        """
    }
    response = requests.post(url, headers=headers, json=payload)
    if response.status_code == 200:
        return response.json()['content']
    else:
        print("Error interacting with Ollama:", response.status_code, response.text)
        return None


# Function to generate synthetic data based on schema
def generate_dataset_from_schema(schema, num_rows):
    data = {}
    for field in schema:
        field_name = field['name']
        field_type = field['type']
        if field_type == 'string':
            data[field_name] = [fake.name() for _ in range(num_rows)]
        elif field_type == 'integer':
            data[field_name] = [np.random.randint(1, 100) for _ in range(num_rows)]
        elif field_type == 'float':
            data[field_name] = [round(np.random.uniform(1.0, 100.0), 2) for _ in range(num_rows)]
        elif field_type == 'date':
            data[field_name] = [fake.date_this_year() for _ in range(num_rows)]
    return pd.DataFrame(data)


# Main workflow in Jupyter Notebook
# Step 1: Get input from user
context = input("Describe the business context for the dataset: ")
num_rows = int(input("How many rows of data do you need? "))


# Step 2: Get schema from Ollama
schema_response = get_schema_from_ollama(context)


if schema_response:
    print("AI-Generated Schema:")
    print(schema_response)


    # Step 3: Parse the schema
    schema = json.loads(schema_response)


    # Step 4: Generate dataset
    dataset = generate_dataset_from_schema(schema, num_rows)
    print("Generated Dataset Preview:")
    print(dataset.head())


    # Step 5: Save the dataset
    dataset.to_csv("ollama_generated_dataset.csv", index=False)
    print("Dataset saved as 'ollama_generated_dataset.csv'")
else:
    print("Failed to generate schema from Ollama.")




Error interacting with Ollama: 404 404 page not found
Failed to generate schema from Ollama.


In [2]:
%pip install faker

Collecting faker
  Downloading Faker-33.1.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-33.1.0-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
    --------------------------------------- 0.0/1.9 MB 393.8 kB/s eta 0:00:05
   -- ------------------------------------- 0.1/1.9 MB 901.1 kB/s eta 0:00:02
   ------ --------------------------------- 0.3/1.9 MB 1.8 MB/s eta 0:00:01
   ------------- -------------------------- 0.7/1.9 MB 2.7 MB/s eta 0:00:01
   -------------------- ------------------- 1.0/1.9 MB 3.4 MB/s eta 0:00:01
   --------------------------- ------------ 1.3/1.9 MB 3.9 MB/s eta 0:00:01
   ------------------------------- -------- 1.5/1.9 MB 3.9 MB/s eta 0:00:01
   ----------------------------------- ---- 1.7/1.9 MB 3.8 MB/s eta 0:00:01
   ---------------------------------------  1.9/1.9 

