In [3]:
from datasets import load_dataset

dataset = load_dataset("tatsu-lab/alpaca", streaming=True)

# Collect first 1000 samples
samples = []
for i, example in enumerate(dataset['train']):
    if i >= 1000:
        break
    samples.append(example)

print("Alpaca sample structure:")
print(samples[0])

# Format for training
def format_alpaca(example):
    if example['input']:
        text = f"### Instruction:\n{example['instruction']}\n### Input:\n{example['input']}\n### Response:\n{example['output']}"
    else:
        text = f"### Instruction:\n{example['instruction']}\n### Response:\n{example['output']}"
    return {'text': text}

# Apply formatting
formatted_samples = [format_alpaca(sample) for sample in samples]

Alpaca sample structure:
{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}


In [4]:
import pprint

In [6]:
pprint.pprint(formatted_samples, indent=4)

[   {   'text': '### Instruction:\n'
                'Give three tips for staying healthy.\n'
                '### Response:\n'
                '1.Eat a balanced diet and make sure to include plenty of '
                'fruits and vegetables. \n'
                '2. Exercise regularly to keep your body active and strong. \n'
                '3. Get enough sleep and maintain a consistent sleep '
                'schedule.'},
    {   'text': '### Instruction:\n'
                'What are the three primary colors?\n'
                '### Response:\n'
                'The three primary colors are red, blue, and yellow.'},
    {   'text': '### Instruction:\n'
                'Describe the structure of an atom.\n'
                '### Response:\n'
                'An atom is made up of a nucleus, which contains protons and '
                'neutrons, surrounded by electrons that travel in orbits '
                'around the nucleus. The protons and neutrons have a positive '
             

In [7]:
# Load conversational dataset
dataset = load_dataset("daily_dialog", split="train[:1000]",trust_remote_code=True)

print("Daily Dialog sample structure:")
print(dataset[0])

def format_dialog(example):
    dialog = example['dialog']
    conversation = ""
    for i, turn in enumerate(dialog):
        speaker = "Human" if i % 2 == 0 else "Assistant"
        conversation += f"{speaker}: {turn}\n"
    return {'text': conversation.strip()}

formatted_dataset = [format_dialog(example) for example in dataset]

Daily Dialog sample structure:
{'dialog': ['Say , Jim , how about going for a few beers after dinner ? ', ' You know that is tempting but is really not good for our fitness . ', ' What do you mean ? It will help us to relax . ', " Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? ", " I guess you are right.But what shall we do ? I don't feel like sitting at home . ", ' I suggest a walk over to the gym where we can play singsong and meet some of our friends . ', " That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . ", ' Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . ', " Good.Let ' s go now . ", ' All right . '], 'act': [3, 4, 2, 2, 2, 3, 4, 1, 3, 4], 'emotion': [0, 0, 0, 0, 0, 0, 4, 4, 4, 4]}


In [8]:
pprint.pprint(formatted_dataset[:4], indent=4)

[   {   'text': 'Human: Say , Jim , how about going for a few beers after '
                'dinner ? \n'
                'Assistant:  You know that is tempting but is really not good '
                'for our fitness . \n'
                'Human:  What do you mean ? It will help us to relax . \n'
                "Assistant:  Do you really think so ? I don't . It will just "
                'make us fat and act silly . Remember last time ? \n'
                "Human:  I guess you are right.But what shall we do ? I don't "
                'feel like sitting at home . \n'
                'Assistant:  I suggest a walk over to the gym where we can '
                'play singsong and meet some of our friends . \n'
                "Human:  That's a good idea . I hear Mary and Sally often go "
                'there to play pingpong.Perhaps we can make a foursome with '
                'them . \n'
                'Assistant:  Sounds great to me ! If they are willing , we '
                '

In [9]:
# Medical domain dataset
dataset = load_dataset("medalpaca/medical_meadow_medical_flashcards", split="train[:1000]")

print("Medical Flashcards sample structure:")
print(dataset[0])

def format_medical(example):
    return {
        'text': f"Medical Question: {example['input']}\nAnswer: {example['output']}"
    }

formatted_dataset = [format_medical(example) for example in dataset]

Medical Flashcards sample structure:
{'input': 'What is the relationship between very low Mg2+ levels, PTH levels, and Ca2+ levels?', 'output': 'Very low Mg2+ levels correspond to low PTH levels which in turn results in low Ca2+ levels.', 'instruction': 'Answer this question truthfully'}


In [10]:
pprint.pprint(formatted_dataset, indent=4)

[   {   'text': 'Medical Question: What is the relationship between very low '
                'Mg2+ levels, PTH levels, and Ca2+ levels?\n'
                'Answer: Very low Mg2+ levels correspond to low PTH levels '
                'which in turn results in low Ca2+ levels.'},
    {   'text': 'Medical Question: What leads to genitourinary syndrome of '
                'menopause (atrophic vaginitis)?\n'
                'Answer: Low estradiol production leads to genitourinary '
                'syndrome of menopause (atrophic vaginitis).'},
    {   'text': 'Medical Question: What does low REM sleep latency and '
                'experiencing hallucinations/sleep paralysis suggest?\n'
                'Answer: Low REM sleep latency and experiencing '
                'hallucinations/sleep paralysis suggests narcolepsy.'},
    {   'text': 'Medical Question: What are some possible causes of low PTH '
                'and high calcium levels?\n'
                'Answer: PTH-independent hype

In [11]:
# An Example from local machine

from bs4 import BeautifulSoup
import json

def scrape_local_faq(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    soup = BeautifulSoup(content, 'html.parser')
    faq_data = []

    questions = soup.select('.faq-question')
    answers = soup.select('.faq-answer')

    min_len = min(len(questions), len(answers))
    if min_len == 0:
        print(f"No Q&A found in {file_path}")
        return []

    for i in range(min_len):
        q_text = questions[i].get_text().strip()
        a_text = answers[i].get_text().strip()
        faq_data.append({
            "instruction": q_text,
            "output": a_text,
            "source": file_path # Source is the local file path
        })
    return faq_data

# --- Run the local scraper ---
local_faq_file = 'C:\\Users\\amanr\\OneDrive\\Desktop\\AmanTest.html'
scraped_local_data = scrape_local_faq(local_faq_file)

print("\n--- Scraped Data from Local File ---")
print(json.dumps(scraped_local_data, indent=2))


--- Scraped Data from Local File ---
[
  {
    "instruction": "What is your return policy?",
    "output": "You can return items within 30 days of purchase, provided they are in their original condition and packaging.",
    "source": "C:\\Users\\amanr\\OneDrive\\Desktop\\AmanTest.html"
  },
  {
    "instruction": "How do I track my order?",
    "output": "Once your order ships, you will receive an email with a tracking number and a link to the carrier's website.",
    "source": "C:\\Users\\amanr\\OneDrive\\Desktop\\AmanTest.html"
  },
  {
    "instruction": "Do you offer international shipping?",
    "output": "Yes, we ship to over 100 countries worldwide. Shipping costs and delivery times vary by destination.",
    "source": "C:\\Users\\amanr\\OneDrive\\Desktop\\AmanTest.html"
  },
  {
    "instruction": "What payment methods do you accept?",
    "output": "We accept major credit cards (Visa, Mastercard, Amex), PayPal, and Apple Pay.",
    "source": "C:\\Users\\amanr\\OneDrive\\Deskt

In [12]:
import random
import os
import json # For pretty printing
from groq import Groq # Import the Groq client
from dotenv import load_dotenv
env_path = r'E:\YTReusable\.env'
load_dotenv(env_path)
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

class SyntheticDataGenerator:
    def __init__(self):
        # Initialize Groq client
        # It automatically picks up the API key from the GROQ_API_KEY environment variable
        self.client = Groq()
        self.groq_model = "llama3-8b-8192" 
        self.templates = {
            "order_inquiry": [
                "Where is my order #{order_id}?",
                "Can you track order #{order_id}?",
                "Order #{order_id} status please",
                "What's the update on order #{order_id}?",
                "When will order #{order_id} be delivered?"
            ],
            "return_request": [
                "I want to return {product}",
                "How do I return my {product}?",
                "Return policy for {product}",
                "Can I get a refund for {product}?",
                "What's the process for returning a defective {product}?"
            ],
            "technical_support": [
                "My {device} is not turning on.",
                "How to troubleshoot {software} error code {error_code}?",
                "The {feature} on my {device} is not working."
            ]
        }

        self.variables = {
            "order_id": ["12345", "67890", "ABC123", "XYZ987", "MNO456"],
            "product": ["laptop", "phone", "headphones", "smartwatch", "vacuum cleaner"],
            "device": ["laptop", "smart TV", "tablet", "router"],
            "software": ["Windows", "MacOS", "Android app", "website"],
            "error_code": ["E-001", "404", "SYS-BLCK"]
        }

    def generate_examples(self, category, count=10):
        examples = []
        templates = self.templates.get(category)
        if not templates:
            print(f"Error: Category '{category}' not found in templates.")
            return []

        print(f"Generating {count} examples for category '{category}' using Groq model '{self.groq_model}'...")

        for i in range(count):
            template = random.choice(templates)

            # Fill in variables
            filled_template = template
            for var, values in self.variables.items():
                if f"{{{var}}}" in filled_template:
                    filled_template = filled_template.replace(f"{{{var}}}", random.choice(values))

            # Generate appropriate response using Groq API
            response = self.generate_response(filled_template, category)

            examples.append({
                "instruction": filled_template, # Renamed 'input' to 'instruction'
                "output": response,
                "category": category
            })
            print(f"Generated example {i+1}/{count}")

        return examples

    def generate_response(self, question, category):
        # Determine specific instructions for the AI based on category
        if category == "order_inquiry":
            system_prompt = "You are a polite and efficient customer service agent. Provide order status information or guide the user on how to track their order."
        elif category == "return_request":
            system_prompt = "You are a customer service agent handling returns. Explain the return process clearly and professionally."
        elif category == "technical_support":
            system_prompt = "You are a technical support agent. Provide clear, concise troubleshooting steps or explain common solutions for device/software issues."
        else:
            system_prompt = "You are a helpful customer service assistant."

        prompt_text = f"""
        Customer query: "{question}"
        Please provide a professional and friendly customer service response.
        """

        try:
            chat_completion = self.client.chat.completions.create(
                messages=[
                    {
                        "role": "system",
                        "content": system_prompt,
                    },
                    {
                        "role": "user",
                        "content": prompt_text,
                    }
                ],
                model=self.groq_model,
                temperature=0.7, # Adjust for creativity (0.0 for deterministic, 1.0 for more creative)
                max_tokens=200, # Limit response length
            )
            return chat_completion.choices[0].message.content.strip()
        except Exception as e:
            print(f"Error calling Groq API: {e}")
            return "Apologies, I'm unable to generate a response at this moment."

# --- Live Demo ---
if __name__ == "__main__":
    # Ensure your GROQ_API_KEY environment variable is set before running this.
    # In Colab: os.environ["GROQ_API_KEY"] = "your_actual_groq_api_key_here"

    generator = SyntheticDataGenerator()

    # Generate data for order inquiries
    order_data = generator.generate_examples("order_inquiry", 3) # Generate 3 examples for demo
    print("\n--- Generated Order Inquiry Examples ---")
    print(json.dumps(order_data, indent=2))

    # Generate data for return requests
    return_data = generator.generate_examples("return_request", 2) # Generate 2 examples
    print("\n--- Generated Return Request Examples ---")
    print(json.dumps(return_data, indent=2))

    # Generate data for technical support
    tech_data = generator.generate_examples("technical_support", 2) # Generate 2 examples
    print("\n--- Generated Technical Support Examples ---")
    print(json.dumps(tech_data, indent=2))

Generating 3 examples for category 'order_inquiry' using Groq model 'llama3-8b-8192'...
Generated example 1/3
Generated example 2/3
Generated example 3/3

--- Generated Order Inquiry Examples ---
[
  {
    "instruction": "When will order #XYZ987 be delivered?",
    "output": "Thank you for reaching out to us about your order, #XYZ987! I'm happy to help you track the status of your delivery.\n\nTo get the most up-to-date information, I'd like to ask you to please check the tracking number provided in your order confirmation email. You can simply copy and paste the tracking number into our website's tracking tool, and it will give you the latest updates on the status of your shipment.\n\nIf you don't have the tracking number handy, I can try to look it up for you. Can you please confirm your order number, #XYZ987, and I'll do my best to assist you?\n\nAdditionally, you can also check the tracking information on our website by logging into your account and clicking on the \"Order History\

In [13]:
# #4- Synthetic Data (LLM-Assisted Generation ) 
# Requirements:
#     1. Maintain same task type and quality
#     2. Vary the language, context, and complexity
#     3. Include edge cases and error scenarios
#     4. Keep responses factually accurate
    
#     Output format: JSON with 'input' and 'output' fields
#     """

In [14]:
#5 - Data Marketplaces

# Scale AI: Professional annotation services
# Appen: Crowd-sourced data labeling
# Lionbridge: Enterprise data solutions

In [15]:
import random
import os
import json
import re
import numpy as np
from collections import Counter
from groq import Groq

from groq import Groq # Import the Groq client
from dotenv import load_dotenv
env_path = r'E:\YTReusable\.env'
load_dotenv(env_path)
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

class SyntheticDataGenerator:
    def __init__(self):
        self.client = Groq()
        self.groq_model = "llama3-8b-8192"
        self.templates = {
            "order_inquiry": ["Where is my order #{order_id}?", "Can you track order #{order_id}?", "Order #{order_id} status please"],
            "return_request": ["I want to return {product}", "How do I return my {product}?", "Return policy for {product}"]
        }
        self.variables = {
            "order_id": ["12345", "67890", "ABC123"],
            "product": ["laptop", "phone", "headphones"]
        }

    def generate_examples(self, category, count=10):
        examples = []
        templates = self.templates.get(category, [])
        if not templates: return []
        for _ in range(count):
            template = random.choice(templates)
            for var, values in self.variables.items():
                if f"{{{var}}}" in template:
                    template = template.replace(f"{{{var}}}", random.choice(values))
            response = self._get_groq_response(template, category)
            examples.append({"instruction": template, "output": response, "category": category})
        return examples

    def _get_groq_response(self, question, category):
        system_prompt_map = {
            "order_inquiry": "You are a polite customer service agent. Provide order status.",
            "return_request": "You are a customer service agent handling returns. Explain the process."
        }
        system_prompt = system_prompt_map.get(category, "You are a helpful assistant.")
        try:
            chat_completion = self.client.chat.completions.create(
                messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": f"Customer query: \"{question}\" Provide a professional response."}],
                model=self.groq_model, temperature=0.7, max_tokens=100
            )
            return chat_completion.choices[0].message.content.strip()
        except Exception:
            return "Apologies, I'm unable to generate a response."

class DataCleaner:
    def clean_dataset(self, raw_data):
        unique_data = self._remove_duplicates(raw_data)
        valid_data = self._filter_invalid(unique_data)
        cleaned_text_data = self._clean_text(valid_data)
        safe_data = self._remove_pii(cleaned_text_data)
        return safe_data

    def _remove_duplicates(self, data):
        seen_hashes = set()
        unique = []
        for item in data:
            content_tuple = (item.get('instruction', ''), item.get('output', ''))
            item_hash = hash(content_tuple)
            if item_hash not in seen_hashes:
                seen_hashes.add(item_hash)
                unique.append(item)
        return unique

    def _filter_invalid(self, data, min_instr_len=10, min_output_len=5):
        valid = []
        for item in data:
            instr, output = item.get('instruction', ''), item.get('output', '')
            if len(instr.strip()) < min_instr_len or len(output.strip()) < min_output_len: continue
            if self._is_nonsensical(item): continue
            valid.append(item)
        return valid

    def _is_nonsensical(self, item):
        instr, output = item.get('instruction', '').lower(), item.get('output', '').lower()
        if "apologies, i'm unable to generate" in output or len(output.split()) < 3: return True
        if re.search(r'\{[a-zA-Z_]+\}', instr): return True # Unreplaced placeholders
        from difflib import SequenceMatcher
        if len(instr) > 20 and len(output) > 20 and SequenceMatcher(None, instr, output).ratio() > 0.95: return True
        return False

    def _clean_text(self, data):
        cleaned_data = []
        for item in data:
            item_copy = item.copy()
            item_copy['instruction'] = ' '.join(item_copy['instruction'].split()).strip()
            item_copy['output'] = ' '.join(item_copy['output'].split()).strip()
            item_copy['instruction'] = self._clean_special_chars(item_copy['instruction'])
            item_copy['output'] = self._clean_special_chars(item_copy['output'])
            cleaned_data.append(item_copy)
        return cleaned_data

    def _clean_special_chars(self, text):
        import string
        return re.sub(r'[^\w\s' + re.escape(string.punctuation) + ']', '', text)

    def _remove_pii(self, data):
        pii_patterns = {'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
                        'phone': r'\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b'}
        safe_data = []
        for item in data:
            combined_text = item.get('instruction', '') + " " + item.get('output', '')
            if not any(re.search(pattern, combined_text, re.IGNORECASE) for pattern in pii_patterns.values()):
                safe_data.append(item)
        return safe_data

class QualityValidator:
    def __init__(self):
        self.quality_thresholds = {'min_input_length': 10, 'max_input_length': 2000,
                                   'min_output_length': 5, 'max_output_length': 1000,
                                   'max_duplicate_rate': 0.05, 'min_diversity_score': 0.1, 'min_dataset_size': 50}

    def validate_dataset(self, data):
        results = {'total_examples': len(data), 'validation_passed': True, 'issues': [], 'metrics': {}}
        if len(data) == 0:
            results['validation_passed'] = False; results['issues'].append("Dataset is empty.")
            self._generate_report(results); return results
        
        checks = [self._check_length_distribution, self._check_duplicates, self._check_diversity,
                  self._check_consistency, self._check_completeness]
        for check in checks:
            check_result = check(data)
            results['metrics'].update(check_result)
            if not check_result.get('passed', True):
                results['validation_passed'] = False; results['issues'].extend(check_result.get('issues', []))
        self._generate_report(results)
        return results

    def _check_length_distribution(self, data):
        il = [len(item['instruction']) for item in data]; ol = [len(item['output']) for item in data]
        res = {'avg_instruction_length': np.mean(il), 'min_instruction_length_actual': np.min(il),
               'avg_output_length': np.mean(ol), 'min_output_length_actual': np.min(ol), 'passed': True, 'issues': []}
        if res['min_instruction_length_actual'] < self.quality_thresholds['min_input_length']: res['passed'] = False; res['issues'].append('Avg input too short')
        if res['min_output_length_actual'] < self.quality_thresholds['min_output_length']: res['passed'] = False; res['issues'].append('Avg output too short')
        return res

    def _check_duplicates(self, data):
        unique_pairs = set(); duplicates = 0
        for item in data:
            pair = (item.get('instruction', ''), item.get('output', ''))
            if pair in unique_pairs: duplicates += 1
            else: unique_pairs.add(pair)
        rate = duplicates / len(data) if len(data) > 0 else 0
        return {'duplicate_rate': rate, 'passed': rate <= self.quality_thresholds['max_duplicate_rate'],
                'issues': ['High duplicate rate'] if rate > self.quality_thresholds['max_duplicate_rate'] else []}

    def _check_diversity(self, data):
        all_tokens = []; [all_tokens.extend((item.get('instruction','') + ' ' + item.get('output','')).lower().split()) for item in data]
        if not all_tokens: return {'diversity_score': 0.0, 'passed': False, 'issues': ['No tokens for diversity check']}
        score = len(set(all_tokens)) / len(all_tokens)
        return {'diversity_score': score, 'passed': score >= self.quality_thresholds['min_diversity_score'],
                'issues': ['Low diversity'] if score < self.quality_thresholds['min_diversity_score'] else []}

    def _check_consistency(self, data):
        issues = []; passed = True
        req_keys = ['instruction', 'output', 'category']; placeholder_regex = r'\{[a-zA-Z_]+\}'
        for i, item in enumerate(data):
            if not isinstance(item, dict): issues.append(f"Ex {i} not dict."); passed = False; continue
            for k in req_keys:
                if k not in item or not item[k] or not item[k].strip(): issues.append(f"Ex {i} empty '{k}'."); passed = False
            if re.search(placeholder_regex, item.get('instruction','')) or re.search(placeholder_regex, item.get('output','')):
                issues.append(f"Ex {i} unreplaced placeholders."); passed = False
        return {'consistency_issues_count': len(issues), 'passed': passed, 'issues': issues}

    def _check_completeness(self, data):
        passed = len(data) >= self.quality_thresholds['min_dataset_size']
        issues = [] if passed else [f"Dataset size ({len(data)}) below min ({self.quality_thresholds['min_dataset_size']})."]
        return {'dataset_size': len(data), 'passed': passed, 'issues': issues}

    def _generate_report(self, results):
        print("\n--- DATA QUALITY REPORT ---")
        print(f"Total Examples: {results['total_examples']}")
        print(f"Status: {'PASSED' if results['validation_passed'] else 'FAILED'}")
        if results['issues']: print("Issues:", "\n  ".join(results['issues']))
        for metric, value in results['metrics'].items():
            if isinstance(value, float): print(f"{metric}: {value:.3f}")
            else: print(f"{metric}: {value}")

class BiasDetector:
    def __init__(self):
        self.bias_categories = {
            'gender': {'male': ['he', 'his', 'man'], 'female': ['she', 'her', 'woman']},
            'race': {'black': ['black'], 'white': ['white']}
        }
        self.over_rep_threshold = 0.6
        self.under_rep_threshold = 0.05

    def detect_biases(self, data):
        bias_report = {}
        for category_name, sub_categories in self.bias_categories.items():
            individual_term_counts = {term: 0 for terms_list in sub_categories.values() for term in terms_list}
            total_category_mentions = 0
            for item in data:
                text = (item.get('instruction', '') + ' ' + item.get('output', '')).lower()
                for sub_cat_key, terms in sub_categories.items():
                    for term in terms:
                        matches = re.findall(r'\b' + re.escape(term) + r'\b', text)
                        individual_term_counts[term] += len(matches)
                        total_category_mentions += len(matches)

            sub_category_mentions = {sub_cat_key: sum(individual_term_counts[term] for term in terms)
                                     for sub_cat_key, terms in sub_categories.items()}
            distribution = {}
            bias_flags = []
            if total_category_mentions > 0:
                distribution = {sk: c / total_category_mentions for sk, c in sub_category_mentions.items()}
                for sk, perc in distribution.items():
                    if sub_category_mentions[sk] == 0: continue
                    if perc > self.over_rep_threshold: bias_flags.append(f"'{sk}' over-represented ({perc:.1%})")
                    elif perc < self.under_rep_threshold: bias_flags.append(f"'{sk}' under-represented ({perc:.1%})")
            else: bias_flags.append(f"No mentions in '{category_name}' category.")
            
            bias_report[category_name] = {'total_mentions': total_category_mentions, 'distribution': distribution, 'bias_flags': bias_flags}
        self._generate_report(bias_report)
        return bias_report

    def _generate_report(self, bias_report):
        print("\n--- BIAS DETECTION REPORT ---")
        overall_bias = False
        for category, results in bias_report.items():
            print(f"\n--- {category.upper()} BIAS ---")
            print(f"Total mentions: {results['total_mentions']}")
            if results['total_mentions'] > 0:
                print("Distribution:")
                for sub_cat, dist_pct in results['distribution'].items():
                    print(f"  - {sub_cat}: {dist_pct:.1%}")
                if results['bias_flags']:
                    overall_bias = True; print("BIAS FLAGS:", "\n  ".join(results['bias_flags']))
                else: print("No significant imbalance.")
            else: print("No terms found for this category.")
        print(f"\nOVERALL BIAS STATUS: {'POTENTIAL BIAS DETECTED' if overall_bias else 'NO SIGNIFICANT BIAS FLAGS'}")

class HumanValidationInterface:
    def __init__(self):
        self.approved = []
        self.rejected = []
        self.feedback = []
    
    def create_review_sample(self, data, sample_size=10): # Smaller default for quick demo
        return random.sample(data, min(sample_size, len(data)))
    
    def review_interface(self, sample_data):
        print("\n--- HUMAN REVIEW INTERFACE ---")
        print("Commands: 'a' = approve, 'r' = reject, 'f' = flag for revision, 'q' = quit")
        for i, item in enumerate(sample_data):
            print(f"\n--- Example {i+1}/{len(sample_data)} ---")
            print(f"Instruction: {item['instruction']}")
            print(f"Output: {item['output']}")
            while True:
                decision = input("Decision (a/r/f/q): ").lower()
                if decision == 'a': self.approved.append(item); break
                elif decision == 'r': reason = input("Rejection reason: "); self.rejected.append({**item, 'rejection_reason': reason}); break
                elif decision == 'f': feedback = input("Feedback for improvement: "); self.feedback.append({**item, 'feedback': feedback}); break
                elif decision == 'q': return self._get_review_summary()
                else: print("Invalid input.")
        return self._get_review_summary()
    
    def _get_review_summary(self):
        total_reviewed = len(self.approved) + len(self.rejected) + len(self.feedback)
        common_issues = Counter([item.get('rejection_reason') for item in self.rejected if 'rejection_reason' in item])
        return {
            'total_reviewed': total_reviewed,
            'approved': len(self.approved),
            'rejected': len(self.rejected),
            'flagged': len(self.feedback),
            'approval_rate': len(self.approved) / total_reviewed if total_reviewed > 0 else 0,
            'common_issues': common_issues.most_common(5) # Top 5 common issues
        }

# --- Main Pipeline Execution ---
if __name__ == "__main__":
    if not os.getenv("GROQ_API_KEY"):
        print("GROQ_API_KEY environment variable not set. Please set it to proceed.")
        # os.environ["GROQ_API_KEY"] = "YOUR_GROQ_API_KEY_HERE" # Uncomment for testing, not recommended for production
        exit()

    # 1. Generate Data
    gen = SyntheticDataGenerator()
    raw_data = gen.generate_examples("order_inquiry", 10) + gen.generate_examples("return_request", 10)
    # Add a few "bad" examples for demo
    raw_data.append({"instruction": "short", "output": "ok", "category": "bad"}); raw_data.append({"instruction": "test@example.com", "output": "test@example.com", "category": "pii"});
    raw_data.append({"instruction": "A man said hello.", "output": "He walked away.", "category": "bias_test"}) # Bias example

    print(f"\nGenerated {len(raw_data)} raw examples.")

    # 2. Clean Data
    cleaner = DataCleaner()
    cleaned_data = cleaner.clean_dataset(raw_data)
    print(f"Cleaned data size: {len(cleaned_data)}")

    # 3. Validate Quality
    validator = QualityValidator()
    validator.validate_dataset(cleaned_data)

    # 4. Detect Biases
    bias_detector = BiasDetector()
    bias_detector.detect_biases(cleaned_data)

    # 5. Human Validation (Interactive)
    human_val = HumanValidationInterface()
    sample_for_review = human_val.create_review_sample(cleaned_data, sample_size=5) # Review a small sample

    
    review_summary = human_val.review_interface(sample_for_review)
    print("\n--- HUMAN REVIEW SUMMARY ---")
    print(json.dumps(review_summary, indent=2))

    
    print("\n--- Human review step would be interactive here ---")
    print("Example: Reviewing 5 randomly selected data points.")
    print("If run interactively, results would show approval/rejection rates and common feedback.")

    #Final data ready for use (e.g., saving to file)
    with open("final_dataset.jsonl", "w") as f:
        for item in cleaned_data:
            f.write(json.dumps(item) + "\n")
    print(f"\nFinal cleaned data saved to final_dataset.jsonl ({len(cleaned_data)} examples)")


Generated 23 raw examples.
Cleaned data size: 21

--- DATA QUALITY REPORT ---
Total Examples: 21
Status: FAILED
Issues: Dataset size (21) below min (50).
avg_instruction_length: 25.286
min_instruction_length_actual: 17
avg_output_length: 455.048
min_output_length_actual: 15
passed: False
issues: ['Dataset size (21) below min (50).']
duplicate_rate: 0.000
diversity_score: 0.230
consistency_issues_count: 0
dataset_size: 21

--- BIAS DETECTION REPORT ---

--- GENDER BIAS ---
Total mentions: 2
Distribution:
  - male: 100.0%
  - female: 0.0%
BIAS FLAGS: 'male' over-represented (100.0%)

--- RACE BIAS ---
Total mentions: 0
No terms found for this category.

OVERALL BIAS STATUS: POTENTIAL BIAS DETECTED

--- HUMAN REVIEW INTERFACE ---
Commands: 'a' = approve, 'r' = reject, 'f' = flag for revision, 'q' = quit

--- Example 1/5 ---
Instruction: A man said hello.
Output: He walked away.

--- Example 2/5 ---
Instruction: How do I return my laptop?
Output: Thank you for reaching out to us about ret