In [None]:
import json         #For JSON data
import os           # To read the API key from the environment
import re           #Regular expression

import pandas as pd # Dataframe
import requests     #To make HTTP request
from google.colab import files  # to download files in Google Collab



# Synthetic Dataset Generation Implementation
### The code below generates a synthetic dataset using the provided AWS-hosted LLM API (LLAMA 2 70B). It sends structured prompts to the model, which generates Python code or functions together with their corresponding instructions. The generated text is parsed and compiled into a dataset, ensuring each entry has both a non-empty instruction and a non-empty output field. The generated dataset is stored and downloaded in CSV and JSON formats. It was produced in four batches (three of 300 examples and one of 150), for a total of 1050 examples, which is 3 times the size of the training dataset. To add diversity, I changed some of the few-shot examples when generating the other batches.

In [None]:
# AWS API key.
# Read from the environment so the secret is never stored in the notebook or
# its version history. Set it before running this cell, for example with
# `%env AWS_API_KEY=<your key>` in a cell that is not saved or shared.
AWS_API_KEY = os.environ.get("AWS_API_KEY", "")
if not AWS_API_KEY:
    # Surface the problem here rather than as an opaque failure on the first API call.
    print("AWS_API_KEY is not set in the environment; requests will be sent with an empty token.")



### The function below sends a prompt to the AWS API and returns the generated text. It issues a POST request to the specified URL and extracts the generated text from the API's response.

In [None]:
def llama_generate(prompt, api_token, max_gen_len=512, temperature=0.5, top_p=0.9, timeout=60):
    """Send a prompt to the hosted LLM endpoint and return the generated text.

    Args:
        prompt: Prompt text, sent in the request body as ``prompt``.
        api_token: Token sent in the request body as ``api_token``.
        max_gen_len: Sent as ``max_gen_len`` (default 512).
        temperature: Sent as ``temperature`` (default 0.5).
        top_p: Sent as ``top_p`` (default 0.9).
        timeout: Passed to ``requests.post`` as ``timeout`` (seconds, default 60)
            so a stalled connection cannot block the caller indefinitely.

    Returns:
        The value found at ``["body"]["generation"]`` in the JSON response.

    Raises:
        requests.HTTPError: If the endpoint responds with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
        KeyError: If the response JSON has no ``body`` / ``generation`` entry.
    """
    url = 'https://6xtdhvodk2.execute-api.us-west-2.amazonaws.com/dsa_llm/generate'
    body = {
        "prompt": prompt,
        "max_gen_len": max_gen_len,
        "temperature": temperature,
        "top_p": top_p,
        "api_token": api_token
    }
    res = requests.post(url, json=body, timeout=timeout)
    # Fail with the real HTTP error instead of a confusing KeyError further down.
    res.raise_for_status()
    return res.json()["body"]["generation"]

In [None]:
# Smoke test: send one short question to confirm the AWS API responds
prompt = "What distinguishes Fairness from Bias in Machine Learning?"
smoke_test_reply = llama_generate(prompt, AWS_API_KEY)
print(smoke_test_reply)



Fairness and bias are two important concepts in machine learning that are often used interchangeably, but they have distinct meanings. Bias refers to the systematic error that a model makes when it consistently under- or over-predicts for a particular group. Fairness, on the other hand, refers to the absence of discrimination in the model's predictions.

In other words, bias is a measure of how well a model is performing overall, while fairness is a measure of how well the model is treating different groups. A model can be biased but fair, or unbiased but unfair.

Here are some key differences between fairness and bias in machine learning:

1. Definition: Bias refers to the systematic error in a model's predictions, while fairness refers to the absence of discrimination in the model's predictions.
2. Focus: Bias focuses on the accuracy of the model's predictions, while fairness focuses on the equity of the model's predictions.
3. Metrics: Bias is typically measured using metrics such

### The function below generates the desired number of synthetic examples by repeatedly calling llama_generate() with the constructed prompt until the target number of examples is reached. The prompt is built from a meta-instruction and a set of examples that guide the generation process. Each response is parsed for an instruction and an output, and an example is added to the list only when both are present.

In [None]:
def _parse_examples(response):
    """Extract the complete (instruction, output) pairs from one model response."""
    pairs = []
    pending_instruction = ""
    for part in response.split('###'):
        part = part.strip()  # Remove any leading/trailing whitespace
        if part.startswith('Instruction:'):
            # Strip only the leading marker; the text itself may mention "Instruction:".
            pending_instruction = part[len('Instruction:'):].strip()
        elif part.startswith('Output:'):
            output_example = part[len('Output:'):].strip()
            # An output is only paired with the instruction directly before it,
            # so a truncated response cannot mix parts of two different examples.
            if pending_instruction and output_example:
                pairs.append({
                    "instruction": pending_instruction,
                    "output": output_example,
                })
            pending_instruction = ""
    return pairs


def generate_synthetic_examples(api_token, num_examples=300, max_attempts=None):
    """Collect synthetic instruction/output examples from the LLM.

    Calls ``llama_generate`` with the prompt from
    ``construct_prompt_with_7_examples`` until ``num_examples`` complete examples
    have been gathered or ``max_attempts`` API calls have been made.

    Args:
        api_token: Token forwarded to ``llama_generate``.
        num_examples: Number of examples to collect (default 300).
        max_attempts: Upper bound on API calls. ``None`` (default) uses
            ``max(10, 5 * num_examples)`` so the loop always terminates.

    Returns:
        A list of at most ``num_examples`` dicts with non-empty
        ``"instruction"`` and ``"output"`` strings. The list is shorter than
        requested only when ``max_attempts`` was exhausted.
    """
    if max_attempts is None:
        max_attempts = max(10, 5 * num_examples)

    synthetic_examples = []
    attempts = 0
    while len(synthetic_examples) < num_examples and attempts < max_attempts:
        attempts += 1
        # Construct the prompt using examples and a meta-instruction
        prompt = construct_prompt_with_7_examples()
        response = llama_generate(prompt, api_token)

        # Keep every complete pair in the response, but never exceed the target size
        remaining = num_examples - len(synthetic_examples)
        synthetic_examples.extend(_parse_examples(response)[:remaining])

    if len(synthetic_examples) < num_examples:
        print(
            f"Stopped after {attempts} API calls with "
            f"{len(synthetic_examples)} of {num_examples} examples."
        )

    return synthetic_examples

### The meta_instruction variable contains instructions for the AWS API's language model, directing it to generate Python code examples that are unique and complete. The construct_prompt_with_7_examples function combines this meta-instruction with a set of example prompts to form a complete prompt for the API.


In [None]:
# Preamble that construct_prompt_with_7_examples() places ahead of the few-shot
# examples. The text is part of the prompt sent to the model, so its wording is
# runtime data and is left exactly as written.
meta_instruction = """\
You are tasked with generating Python code examples. For each example, provide a clear Instruction and the corresponding Python code Output as defined in below examples. Ensure that :
- Each example is unique and does not duplicate the examples given below.
- No example has empty fields; both Instruction and Output must be populated with relevant content.
- Do not leave Instructions and it's ouput Incomplete.
"""

def construct_prompt_with_7_examples():
    """Build the few-shot prompt: the meta-instruction followed by seven examples.

    Returns:
        str: ``meta_instruction``, a newline, then the seven example blocks joined
        by newlines. Each block is a "### Instruction:" line followed by a
        "### Output:" line.
    """
    few_shot_pairs = (
        (
            "Create a lambda expression in Python to filter a list of integer greater than 50.",
            "list1 = [45, 12, 52, 89, 33, 99] filtered_list = list(filter(lambda x: x > 50, list1)) print(filtered_list)",
        ),
        (
            "Write a Python program to convert ratings in a list of strings to a float.",
            "def str_to_float(ratings): return [float(x) for x in ratings]",
        ),
        (
            "Develop a function in Python to take as input two array of integers and swap their elements.",
            "def swap(arr1, arr2): assert len(arr1) == len(arr2) for i in range(len(arr1)): temp = arr1[i] arr1[i] = arr2[i] arr2[i] = temp",
        ),
        (
            "Write a Python script to sort a list of numbers.",
            "def sort_list(unsorted_list): return sorted(unsorted_list)",
        ),
        (
            "Using Python, create a function that calculates the objective function of a linear equation.",
            "def linear_eq(a, b, c): return a*x + b*y - c # For example: result = linear_eq(3, 5, 7) print(result)",
        ),
        (
            "Generate a program in Python to convert all characters of a string in lowercase.",
            "def to_lower(string): return string.lower()",
        ),
        (
            "Create a 3-layer artificial neural network using Python and print the outcome.",
            "import numpy as np # define the 3 layers # input_layer = np.array([2, 3]) hidden_layer = np.array([[0.1, 0.4], [0.8, 0.6], [0.3, 0.9]]) output_layer = np.array([0.3, 0.7]) # compute the output of the 3-layer network # hidden_layer_output = np.dot(input_layer, hidden_layer) output = np.dot(hidden_layer_output, output_layer) print(output)",
        ),
    )

    # Render each pair in the same "### Instruction / ### Output" layout the parser expects back
    rendered_blocks = [
        f"### Instruction: {task}\n### Output: {solution}\n"
        for task, solution in few_shot_pairs
    ]

    # Meta-instruction first, then the examples separated by newlines
    return meta_instruction + "\n" + "\n".join(rendered_blocks)



Saving the list of generated examples in CSV and JSON formats.

In [None]:
# Generate the synthetic dataset
synthetic_data_1 = generate_synthetic_examples(AWS_API_KEY, num_examples=300)
df = pd.DataFrame(synthetic_data_1)

# Save the synthetic dataset to a CSV file and download it to your local machine
csv_filename = 'synthetic_dataset_4.csv'
df.to_csv(csv_filename, index=False)
files.download(csv_filename)

# Save the synthetic dataset as JSON Lines (one compact record per line).
# `indent` is intentionally not passed: together with lines=True it spreads each
# record over several lines without an enclosing array, which is neither valid
# JSON Lines nor a valid JSON document.
json_filename = 'synthetic_dataset_4.json'
df.to_json(json_filename, orient='records', lines=True)
# Download the JSON file as well, so both formats end up on the local machine
files.download(json_filename)
print(f"Saved the dataset to {json_filename}")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved the dataset to synthetic_dataset_4.json


In [None]:
# Number of examples in the generated batch
df.shape[0]

300