In [16]:
import pandas as pd

# Load the collected form responses, discard the 'Timestamp' column, and
# rename the two question columns to the shorter "Query" / "Label" names.
df1 = (
    pd.read_csv("/content/sample_data/AI Query Classification Dataset Collection - Research Project  (Responses) - Form responses 1.csv")
    .drop(columns=["Timestamp"])
    .rename(
        columns={
            "Enter prompt / query": "Query",
            "Most suitable label": "Label",
        }
    )
)

In [17]:
import random

# Hand-written seed prompts, keyed by task label. Each label maps to a list of
# example queries of varying length.
categories = {
    "text classification": [
        "Does this comment contain offensive language?",
        "Categorize this news article: Politics, Sports, or Entertainment.",
        "Identify whether this review is positive or negative.",
        "Classify the sentiment of this social media post: Happy, Angry, or Neutral.",
        "Analyze this tweet and determine if it's promoting misinformation about COVID-19.",
        "Is this customer email a complaint or a general inquiry?",
        "Based on text content, determine if this email is spam or not."
    ],
    "text generation": [
        "Generate a short email apologizing for a delayed response.",
        "Write a two-paragraph explanation of how machine learning works.",
        "Create a bedtime story about a little robot who learns to dream.",
        "Generate a formal job acceptance letter with gratitude.",
        "Write a blog introduction for an article on the impact of AI on education.",
        "Compose a promotional email for a new fitness app.",
        "Generate a LinkedIn post about the future of remote work."
    ],
    "code generation": [
        "Write a Python function to check if a number is prime.",
        "Generate a JavaScript function to validate an email address.",
        "Create a basic HTML + CSS template for a personal portfolio website.",
        "Write a SQL query to find duplicate entries in a database table.",
        "Implement a recursive Fibonacci function in C++.",
        "Generate a Flask API endpoint that returns user data in JSON format.",
        "Write a Python script that scrapes trending GitHub repositories."
    ],
    "summarization": [
        "Summarize the impact of climate change in one sentence.",
        "Provide a brief summary of the latest iPhone release and its key features.",
        "Summarize this news article into 3 bullet points.",
        "Condense this 10-page report on cybersecurity threats into key takeaways.",
        "Explain the main ideas of the book '1984' in 50 words.",
        "Summarize a recent scientific study about artificial intelligence applications.",
        "Write a concise summary of a movie review about 'Inception'."
    ],
    "question answering": [
        "Who discovered penicillin and how?",
        "What are the main causes of inflation in modern economies?",
        "How do black holes form and why are they important in astrophysics?",
        "Explain how deep learning differs from traditional machine learning.",
        "What are the key benefits of learning a second language?",
        "Describe the process of DNA replication in simple terms.",
        "Why do some people experience jet lag while traveling?"
    ],
    "image generation": [
        "Create a photorealistic image of a futuristic city skyline at night.",
        "Generate a fantasy-themed artwork of a warrior standing on a battlefield.",
        "Design an abstract digital painting inspired by ocean waves.",
        "Create a 3D-rendered model of a cyberpunk-style sports car.",
        "Generate an artistic portrait of an astronaut in a neon-lit space station.",
        "Create a landscape painting of a waterfall in a dense rainforest.",
        "Design a surreal digital image where the sky is filled with floating islands."
    ],
    "image classification": [
        "Is this image a cat or a dog?",
        "Detect if this photo is of a healthy or diseased plant leaf.",
        "Classify these satellite images into Urban, Forest, or Desert categories.",
        "Determine if this security camera footage contains human movement.",
        "Identify the breed of this dog based on the given image.",
        "Is this an X-ray scan of a broken bone or a normal one?",
        "Classify this set of images into 'Daytime' or 'Nighttime' scenes."
    ]
}

# Fixed seed so that re-running the notebook regenerates the same synthetic rows.
SYNTHETIC_SEED = 42
# Number of randomly templated queries appended after the seed prompts.
N_EXTRA_QUERIES = 300


def _as_sentence(text):
    """Return ``text`` with terminal punctuation, appending '.' only if it has none."""
    return text if text.endswith((".", "?", "!")) else f"{text}."


def build_query_dataset(categories, n_extra=N_EXTRA_QUERIES, seed=SYNTHETIC_SEED):
    """Build ``(query, label)`` rows from seed prompts plus templated variations.

    Parameters
    ----------
    categories : dict[str, list[str]]
        Mapping of label -> list of seed queries.
    n_extra : int
        How many additional rows to draw. Each row picks a random label, a
        random seed query of that label, and a random phrasing template.
    seed : int
        Seed for a private ``random.Random`` instance, so the result is
        reproducible and the global ``random`` state is left untouched.

    Returns
    -------
    list[tuple[str, str]]
        All seed prompts in dictionary order, followed by ``n_extra`` sampled
        variations. If no label has any seed query, only the (empty) seed part
        is returned.

    Notes
    -----
    Sampling is with replacement and one template is the unmodified seed
    query, so the result can contain exact duplicate rows.
    """
    rng = random.Random(seed)

    # Every seed prompt appears once, verbatim.
    rows = [
        (query, label)
        for label, queries in categories.items()
        for query in queries
    ]

    # Hoisted out of the loop; labels without prompts cannot be sampled from.
    labels = [label for label, queries in categories.items() if queries]
    if not labels:
        return rows

    for _ in range(n_extra):
        cat = rng.choice(labels)
        base_query = rng.choice(categories[cat])
        # The seed prompts already end in '.' or '?', so the templates must not
        # add another full stop after them (that produced "?." / "..").
        sentence = _as_sentence(base_query)

        length_variations = [
            base_query,  # Original
            f"{base_query} Can you help with this?",  # Slightly longer
            f"Please provide an answer to this: {base_query}",  # Medium
            f"I need detailed information regarding the following topic: {sentence} Please explain thoroughly with supporting examples if possible.",  # Long
            f"{sentence} Keep the response short and precise.",  # Asks for a brief answer
        ]

        rows.append((rng.choice(length_variations), cat))

    return rows


# Seed prompts followed by the sampled variations.
dataset = build_query_dataset(categories)

# Creating DataFrame
df2 = pd.DataFrame(dataset, columns=["Query", "Label"])

In [19]:
# Stack the collected form responses on top of the synthetic rows before
# exporting. ignore_index=True gives the combined frame a fresh 0..n-1 index;
# without it each input keeps its own 0-based labels and `result` ends up with
# duplicate index values.
result = pd.concat([df1, df2], ignore_index=True)

# Saving to CSV (the index is not written to the file)
result.to_csv("query_classification_dataset.csv", index=False)

print("Dataset saved as query_classification_dataset.csv")

Dataset saved as query_classification_dataset.csv
