# Synthetic Dataset Generator

[![GitHub](https://img.shields.io/badge/GitHub-@voxdroid-181717?style=for-the-badge&logo=github)](https://github.com/voxdroid)
![Python](https://img.shields.io/badge/Python-3776AB?style=for-the-badge&logo=python&logoColor=white)
![Pandas](https://img.shields.io/badge/Pandas-150458?style=for-the-badge&logo=pandas&logoColor=white)
![NumPy](https://img.shields.io/badge/NumPy-013243?style=for-the-badge&logo=numpy&logoColor=white)
![Google Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)

This notebook generates synthetic datasets for various applications. It uses the Mostly AI DataLLM service (an API key is required) to create realistic and diverse data based on your specifications.

## Notes

- Adjust parameters as needed for your specific use case
- Verify the quality and relevance of the generated data for your application

For questions or improvements, contact: [@VoxDroid](https://github.com/voxdroid) or visit the repo: [Synthetic-Dataset-Generator-DataLLM](https://github.com/VoxDroid/Synthetic-Dataset-Generator-DataLLM)

In [None]:
# @title # **`Dataset Generation`**

class tc:
    """Namespace of ANSI SGR escape sequences for colored/styled console text.

    Every attribute is a plain string intended to be embedded in an f-string;
    emit ``tc.reset`` afterwards to return to the terminal's default style.
    """

    # Reset to default
    reset = "\033[0m"

    # Text styles
    bold = "\033[1m"
    italic = "\033[3m"
    underline = "\033[4m"

    # Standard text colors: SGR codes 30-37, in this order
    black, red, green, yellow, blue, magenta, cyan, white = (
        f"\033[{code}m" for code in range(30, 38)
    )

    # Background colors: SGR codes 40-47, same color order
    (
        bg_black,
        bg_red,
        bg_green,
        bg_yellow,
        bg_blue,
        bg_magenta,
        bg_cyan,
        bg_white,
    ) = (f"\033[{code}m" for code in range(40, 48))

    # Bright text colors: SGR codes 90-97 (bright black renders as gray)
    (
        bright_black,
        bright_red,
        bright_green,
        bright_yellow,
        bright_blue,
        bright_magenta,
        bright_cyan,
        bright_white,
    ) = (f"\033[{code}m" for code in range(90, 98))

import sys
import importlib
from IPython.display import clear_output
import subprocess

def check_and_install(package_name, import_name=None):
    """Ensure a package is importable, installing it with pip if necessary.

    Args:
        package_name: Distribution name handed to ``pip install``.
        import_name: Module name used for the import checks. Defaults to
            ``package_name``. Pass it when the two differ: pip matches
            project names case-insensitively, whereas module lookup is
            case-sensitive.

    Returns:
        True if the module was already importable, or was installed and then
        imported successfully; False if installation or the import failed.
    """
    # `import importlib` alone does not guarantee the `util` submodule is
    # loaded, so import it explicitly before calling `find_spec`.
    import importlib.util

    module_name = package_name if import_name is None else import_name

    try:
        if module_name in sys.modules:
            print(f"\n{tc.blue}{package_name} is already installed and accessible from sys.modules.\n{tc.reset}")
            return True

        if importlib.util.find_spec(module_name) is not None:
            print(f"\n{tc.blue}{package_name} is already installed and accessible.\n{tc.reset}")
            return True

    except Exception as e:
        # Best effort: a failed lookup falls through to the installation attempt.
        print(f"{tc.red}An unexpected error occurred during checks: {e}{tc.reset}")

    print(f"\n{tc.red}{package_name} is not installed. Attempting installation...\n{tc.reset}")

    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        clear_output()
        print(f"\n{tc.green}{package_name} installed successfully.{tc.reset}")

        # The import system caches directory listings; refresh them so a
        # package installed during this session can be found.
        importlib.invalidate_caches()
        importlib.import_module(module_name)

    except subprocess.CalledProcessError as install_error:
        print(f"{tc.red}Failed to install {package_name}. Installation error: {install_error}{tc.reset}")
        return False

    except ModuleNotFoundError as import_error:
        # Previously this returned False without telling the user why.
        print(f"{tc.red}{package_name} was installed but could not be imported as '{module_name}': {import_error}{tc.reset}")
        return False

    except Exception as e:
        print(f"{tc.red}An unexpected error occurred during installation or import: {e}{tc.reset}")
        return False

    print(f"\n{tc.green}{package_name} has been successfully loaded after installation.\n{tc.reset}")
    return True

# pip matches project names case-insensitively, but Python module lookup is
# case-sensitive. The importable module is `datallm` (it is imported below via
# `from datallm import DataLLM`), so that exact name must be used here;
# otherwise the "already installed" check never succeeds.
package_name = "datallm"

if check_and_install(package_name):
    datallm = importlib.import_module(package_name)
    print(f"\n{tc.green}Setup Finished.\n{tc.reset}")
else:
    # Report the failure explicitly instead of printing a success-style message.
    print(f"\n{tc.red}Setup failed: {package_name} could not be installed or imported.\n{tc.reset}")

from IPython.display import clear_output
from datetime import datetime
import pandas as pd
import re
import time
from datallm import DataLLM
from tqdm import tqdm
import os

# NOTE: the `# @param` / `# @markdown` comments in this cell are Colab form
# annotations; each `# @param` must stay on the same line as the variable it
# configures, so these assignments should not be reformatted.
# @markdown # **`Dataset Configuration`**
DataLLM_apikey = "zpka_XXXXX"  # @param {type:"string"}
#@markdown - <small style="color: gray; font-size: 0.85em;"> **Note**: *Enter your Mostly.AI DataLLM API Key here to access the synthetic data generation service.*</small>

# Initialize DataLLM
# This rebinds the name `datallm`: if the setup step above imported the package
# successfully, the name referred to the module until this line; from here on
# it is the API client used by the generation code below.
datallm = DataLLM(api_key=DataLLM_apikey, base_url='https://data.mostly.ai')

# Free-text description of the dataset; it is passed as `data_description` to
# both the `datallm.mock` and `datallm.enrich` calls further down.
# @markdown # **`Dataset Description`**
data_description = "Sample Dataset Description"  # @param {type:"string"}
#@markdown - <small style="color: gray; font-size: 0.85em;"> **Note**: *Provide a description of the dataset you wish to generate. This guides the data model in producing relevant synthetic data.*</small>

# @markdown ---

# @markdown # **`User Input Column Configuration`**
user_input_prompt = "Sample User Input Prompt"  # @param {type:"string"}
user_input_data_type = "string"  # @param ["string", "integer", "float", "boolean"]
#@markdown - <small style="color: gray; font-size: 0.85em;"> **Note**: *Define the prompt that describes the type of user input you'd like to generate.*</small>
user_input_max_tokens = 64  # @param {type:"slider", min:1, max:64}
#@markdown - <small style="color: gray; font-size: 0.85em;"> **Note**: *Set the maximum token limit for the user input, controlling the length of the generated text.*</small>

# @markdown # **`Chatbot Output Column Configuration`**
chatbot_output_prompt = "Sample Chatbot Output Prompt"  # @param {type:"string"}
chatbot_output_data_type = "string"  # @param ["string", "integer", "float", "boolean"]
#@markdown - <small style="color: gray; font-size: 0.85em;"> **Note**: *Define the chatbot's response prompt to guide the model's output.*</small>
chatbot_output_max_tokens = 64  # @param {type:"slider", min:1, max:64}
#@markdown - <small style="color: gray; font-size: 0.85em;"> **Note**: *Set the maximum token limit for the chatbot output.*</small>

# NOTE(review): the `*_data_type` fields of the three columns below are
# free-text form fields, whereas the input/output columns above use a dropdown
# of fixed choices -- confirm this difference is intended.
# @markdown # **`Intent Column Configuration`**
intent = "Sample Intent" # @param {type:"string"}
intent_data_type = "string" # @param {type:"string"}
# @markdown - <small style="color: gray; font-size: 0.85em;"> **Note**: *Intent represents the core purpose of the user's query.*</small>
intent_max_tokens = 8  # @param {type:"slider", min:1, max:32}
# @markdown - <small style="color: gray; font-size: 0.85em;"> **Note**: *Set a concise length for intent categorization.*</small>

# @markdown # **`Sentiment Column Configuration`**
sentiment = "Sample Sentiment" # @param {type:"string"}
sentiment_data_type = "string" # @param {type:"string"}
# @markdown - <small style="color: gray; font-size: 0.85em;"> **Note**: *Sentiment reflects the user's emotional tone.*</small>
sentiment_max_tokens = 8 # @param {type:"slider", min:1, max:32}
# @markdown - <small style="color: gray; font-size: 0.85em;"> **Note**: *Set a concise length for sentiment categorization.*</small>

# @markdown # **`Difficulty Level Column Configuration`**
diff_level = "Sample Difficulty Level" # @param {type:"string"}
diff_level_data_type = "string" # @param {type:"string"}
# @markdown - <small style="color: gray; font-size: 0.85em;"> **Note**: *Difficulty level corresponds to the complexity of the query.*</small>
diff_level_max_tokens = 8  # @param {type:"slider", min:1, max:32}
# @markdown - <small style="color: gray; font-size: 0.85em;"> **Note**: *Set a concise length for difficulty categorization.*</small>

# @markdown ---

# Define columns for data generation
# Each key is a column name of the generated dataset and maps to the prompt,
# dtype and token limit collected from the form fields above. The dict is
# passed as `columns` to `datallm.mock`. The two `*_part1` columns hold the
# initial text; the generation loop extends them into `input` / `output` and
# then drops them.
columns = {
    "input_part1": {
        "prompt": user_input_prompt,
        "dtype": user_input_data_type,
        "max_tokens": user_input_max_tokens
    },
    "output_part1": {
        "prompt": chatbot_output_prompt,
        "dtype": chatbot_output_data_type,
        "max_tokens": chatbot_output_max_tokens
    },
    "intent": {
        "prompt": intent,
        "dtype": intent_data_type,
        "max_tokens": intent_max_tokens
    },
    "sentiment": {
        "prompt": sentiment,
        "dtype": sentiment_data_type,
        "max_tokens": sentiment_max_tokens
    },
    "difficulty_level": {
        "prompt": diff_level,
        "dtype": diff_level_data_type,
        "max_tokens": diff_level_max_tokens
    }
}

# @markdown # **`Dataset Structure`**
use_custom_rows = True  # @param {type:"boolean"}
data_rows_option = "10"  # @param ["10", "100", "1000", "10000"]
data_rows_custom = 50  # @param {type:"integer"}
#@markdown - <small style="color: gray; font-size: 0.85em;"> **Note**: *Set the number of rows to generate. Choose predefined options (`10`, `100`, `1,000`, `10,000`) or enable custom rows.*</small>

# Determine the number of rows
# The dropdown value is a string and is converted to int; the custom value is
# used as entered when `use_custom_rows` is enabled.
# NOTE(review): `data_rows_custom` is not validated here (e.g. zero or negative).
data_rows = int(data_rows_option) if not use_custom_rows else data_rows_custom

# The three settings below feed `generate_complete_text`: `maximum_iterations`
# becomes its default `max_iterations`, and `ap_data_type` / `ap_max_tokens`
# are passed to each `datallm.enrich` call it makes.
# @markdown # **`Additional Parts Until the Response is Complete`**
maximum_iterations = 2  # @param {type:"slider", min:1, max:5}
ap_data_type = "string"  # @param ["string", "integer", "float", "boolean"]
ap_max_tokens = 64  # @param {type:"slider", min:1, max:64}
#@markdown - <small style="color: gray; font-size: 0.85em;"> **Note**: *Set the number of iterations to complete responses if the model does not return a full sentence initially.*</small>

# @markdown # **`Dataset Filename (saved as .csv file)`**
csv_filename = "Synthetic_Dataset"  # @param {type:"string"}
#@markdown - <small style="color: gray; font-size: 0.85em;"> **Note**: *Enter a name for the CSV file where the dataset will be saved.*</small>

# @markdown # **`Include Timestamp in Filename`**
include_timestamp = True  # @param {type:"boolean"}
#@markdown - <small style="color: gray; font-size: 0.85em;"> **Note**: *Enable to add a timestamp to the filename for easier tracking of versions.*</small>

# @markdown # **`Save to Google Drive`**
save_to_gdrive = True  # @param {type:"boolean"}
# `date` is computed once, so every cycle of this run is saved under the same
# dated folder. `gdrive_folder` is a relative path; the save step prefixes it
# with the mounted Drive root.
date = datetime.now().strftime("%Y%m%d")
gdrive_folder = f"Dataset-Generator/Dataset-{date}"
#@markdown - <small style="color: gray; font-size: 0.85em;"> **Note**: *Check this box to save the dataset to your Google Drive.*</small>

# @markdown # **`Loop (Cycle) Generation Option`**
number_of_cycles = 1  # @param {type:"integer"}
#@markdown - <small style="color: gray; font-size: 0.85em;"> **Note**: *Specify the number of cycles to generate datasets with the same configuration.*</small>

# Mount Google Drive if needed
# `google.colab` is imported lazily, only when saving to Drive is enabled, so
# the module is not required when `save_to_gdrive` is turned off.
if save_to_gdrive:
    from google.colab import drive
    print(f"{tc.blue}Mounting Google Drive...\n{tc.reset}")
    drive.mount('/content/drive')
    print(f"{tc.green}\nSuccessfully mounted to Google Drive.{tc.reset}")

# Matches when the text ends (optionally followed by whitespace) right after
# '.', '!' or '?', unless that ending looks like a dotted abbreviation
# ("e.g.") or a capitalized two-letter token followed by a dot ("Mr.").
_SENTENCE_END = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\!|\?)\s*$')


def is_complete_sentence(text):
    """Return True when `text` ends with sentence-final punctuation."""
    found = _SENTENCE_END.search(text)
    return found is not None

# Function to generate additional parts until the response is complete
def generate_complete_text(initial_text, prompt, data_description, max_iterations=maximum_iterations):
    """Extend `initial_text` with generated continuations until it ends a sentence.

    While the accumulated text does not pass `is_complete_sentence`, a
    single-row DataFrame holding the text is sent to `datallm.enrich` and the
    first element of the result is appended after a space.

    Args:
        initial_text: Starting value, normally a string cell of the dataset.
        prompt: Instruction passed to `datallm.enrich` for the continuation.
        data_description: Dataset description passed to `datallm.enrich`.
        max_iterations: Upper bound on the number of enrich calls.

    Returns:
        The (possibly extended) text. Non-string values are returned unchanged.
        If an enrich call fails, the error is printed and the text gathered so
        far is returned.
    """
    # Cells are only strings when the column dtype is "string"; numeric or
    # boolean cells and missing values cannot be checked by the sentence regex
    # (it raises TypeError) and cannot be "continued", so pass them through.
    if not isinstance(initial_text, str):
        return initial_text

    complete_text = initial_text
    for _ in range(max_iterations):
        if is_complete_sentence(complete_text):
            break
        try:
            additional_part = datallm.enrich(
                data=pd.DataFrame({'text': [complete_text]}),
                prompt=prompt,
                data_description=data_description,
                dtype=ap_data_type,
                max_tokens=ap_max_tokens,
                progress_bar=False
            )
            # A non-string continuation raises TypeError here and is reported
            # by the handler below, like any other enrich failure.
            complete_text += " " + additional_part.iloc[0]
        except Exception as e:
            print(f"{tc.red}Error while enriching text: {e}{tc.reset}")
            break
    return complete_text

print(f"\n{tc.blue}{tc.bold}Preparing to generate datasets...\n{tc.reset}")

# Generate `number_of_cycles` datasets with the same configuration. A cycle
# whose generation request fails is reported and skipped.
for cycle in range(1, number_of_cycles + 1):
    print(f"\n{tc.cyan}Generating dataset cycle {cycle} of {number_of_cycles}...\n{tc.reset}")

    try:
        time.sleep(1)
        synthetic_data = datallm.mock(
            n=data_rows,
            data_description=data_description,
            columns=columns,
            progress_bar=True
        )
    except Exception as e:
        print(f"{tc.red}Error while generating synthetic data: {e}{tc.reset}")
        continue

    # Generate complete input and output
    print(f"\n{tc.blue}Proceeding to process datasets...\n{tc.reset}")

    # Iterate over the cell values rather than positional labels, so the
    # result does not depend on the frame having a 0..n-1 index.
    complete_inputs = []
    complete_outputs = []
    part_pairs = zip(synthetic_data['input_part1'], synthetic_data['output_part1'])
    # The description deliberately carries no color reset (as before), so the
    # progress bar keeps the cyan color -- presumably intentional.
    with tqdm(total=len(synthetic_data), desc=f"{tc.cyan}Processing dataset") as pbar:
        for input_part, output_part in part_pairs:
            complete_inputs.append(
                generate_complete_text(input_part, "Continue the student query.", data_description)
            )
            complete_outputs.append(
                generate_complete_text(output_part, "Continue the chatbot response.", data_description)
            )
            pbar.update(1)
    synthetic_data['input'] = complete_inputs
    synthetic_data['output'] = complete_outputs

    # Drop the part columns
    synthetic_data = synthetic_data.drop(columns=['input_part1', 'output_part1'])

    # Add an ID column to the dataset
    synthetic_data.insert(0, 'id', range(1, len(synthetic_data) + 1))

    column_order = ['id', 'input', 'output', 'intent', 'sentiment', 'difficulty_level']
    synthetic_data = synthetic_data[column_order]

    # Build the file name once (with optional timestamp) and reuse it for the
    # local copy and the Google Drive copy.
    timestamp_suffix = f'_{pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")}' if include_timestamp else ""
    file_name = f"{csv_filename}_{cycle}{timestamp_suffix}.csv"
    # The local copy uses the same dated relative folder as the Drive copy.
    csv_path = f"{gdrive_folder}/{file_name}"

    try:
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        synthetic_data.to_csv(csv_path, index=False)
        print(f"\n\n{tc.green}Dataset cycle {cycle} saved successfully at: {csv_path}{tc.reset}")

        # Optionally save to Google Drive
        if save_to_gdrive:
            gdrive_path = f"/content/drive/My Drive/{gdrive_folder}/{file_name}"
            os.makedirs(os.path.dirname(gdrive_path), exist_ok=True)
            synthetic_data.to_csv(gdrive_path, index=False)
            print(f"\n{tc.green}Dataset cycle {cycle} also saved to Google Drive at: {gdrive_path}{tc.reset}")
    except Exception as e:
        print(f"{tc.red}Error while saving dataset: {e}{tc.reset}")

# End with a reset so the color/bold style does not leak into later output.
print(f"\n{tc.bright_green}{tc.bold}Dataset generation finished.{tc.reset}")
