In [None]:
import csv
import random

# Default number of synthetic respondents (CSV data rows) to generate.
NUM_ROWS = 25_000

# School stream associated with each career cluster.
_STREAM_BY_CLUSTER = {
    "Engineering & Tech": "Science (PCM)",
    "Science & Medical": "Science (PCB)",
    "Healthcare Support": "Science (PCB)",
    "Arts & Humanities": "Arts",
    "Education & Social Work": "Arts",
    "Business & Finance": "Commerce",
    "Skilled Trades & Ops": "Vocational",
}

# "Expert logic": which career cluster each RIASEC code points to.
# Two-letter codes (top two types) come first; the single-letter entries
# are the fallbacks used when a two-letter combination is not listed.
_CLUSTER_BY_CODE = (
    ("RI", "Engineering & Tech"),
    ("IR", "Engineering & Tech"),
    ("IS", "Science & Medical"),
    ("SI", "Science & Medical"),
    ("RS", "Healthcare Support"),
    ("AS", "Arts & Humanities"),
    ("SA", "Education & Social Work"),
    ("EC", "Business & Finance"),
    ("CE", "Business & Finance"),
    ("RC", "Skilled Trades & Ops"),
    ("CR", "Skilled Trades & Ops"),
    # Fallbacks for single-letter codes
    ("R", "Skilled Trades & Ops"),
    ("I", "Engineering & Tech"),
    ("A", "Arts & Humanities"),
    ("S", "Education & Social Work"),
    ("E", "Business & Finance"),
    ("C", "Business & Finance"),
)

# RIASEC code -> {"cluster": ..., "stream": ...}; every code gets its own dict.
CAREER_MAP = {
    code: {"cluster": cluster, "stream": _STREAM_BY_CLUSTER[cluster]}
    for code, cluster in _CLUSTER_BY_CODE
}

def generate_dataset(filename="dataset.csv", num_rows=None, seed=None):
    """Generate a synthetic RIASEC questionnaire dataset and save it as CSV.

    Each data row holds 12 random answers (integers 1-5) followed by the
    career cluster derived from them.  Consecutive answer pairs are summed
    into one score per RIASEC type (Q1+Q2 -> R, Q3+Q4 -> I, ..., Q11+Q12 -> C).
    The two highest-scoring types form a two-letter code that is looked up
    in ``CAREER_MAP``; when that pair has no entry, the single top type is
    used instead.

    Ties between equal scores are resolved in R, I, A, S, E, C order, because
    the sort is stable and the scores are built in that order.

    Args:
        filename: Path of the CSV file to write (overwritten if it exists).
        num_rows: Number of data rows to generate.  ``None`` (the default)
            means the module-level ``NUM_ROWS``.
        seed: Optional seed for a private random generator, which makes the
            output reproducible.  With ``None`` (the default) the shared
            ``random`` module state is used.
    """
    riasec_types = "RIASEC"
    questions_per_type = 2
    num_questions = len(riasec_types) * questions_per_type

    if num_rows is None:
        num_rows = NUM_ROWS

    # A private generator keeps seeded runs from disturbing global state;
    # without a seed, fall back to the module-level functions.
    rng = random if seed is None else random.Random(seed)

    header = [f"Q{i + 1}" for i in range(num_questions)] + ["CareerCluster"]

    # newline='' is required by the csv module; an explicit encoding keeps
    # the file identical across platforms (cluster names contain '&' only,
    # but the default encoding is otherwise locale-dependent).
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)

        for _ in range(num_rows):
            # Random answers on a 1-5 scale, one per question
            answers = [rng.randint(1, 5) for _ in range(num_questions)]

            # Score of each RIASEC type = sum of its consecutive answers
            scores = {
                code: sum(
                    answers[idx * questions_per_type:
                            (idx + 1) * questions_per_type]
                )
                for idx, code in enumerate(riasec_types)
            }

            # Types ranked from highest to lowest score (stable on ties)
            ranked = sorted(scores, key=scores.get, reverse=True)
            primary_code = ranked[0]
            top_two_code = primary_code + ranked[1]

            # Prefer the two-letter mapping, else fall back to the top type
            entry = CAREER_MAP.get(top_two_code)
            if entry is None:
                entry = CAREER_MAP[primary_code]

            writer.writerow(answers + [entry["cluster"]])

    print(f"Successfully generated '{filename}' with {num_rows} rows.")


# Entry point: generate the dataset with the default filename when this code
# is executed directly; the guard is skipped when the module is imported.
if __name__ == "__main__":
    generate_dataset()

Successfully generated 'dataset.csv' with 5000 rows.
