# In [1]:
import pandas as pd
import numpy as np
import itertools

# Load the reference dataset. Its 'Expected_Response' column supplies, for
# each question, the answer that counts towards a section being fulfilled.
csv_file_path = 'autism_prediction_dataset.csv'
data = pd.read_csv(csv_file_path)

# Questionnaire layout: section name -> number of consecutive questions in
# that section. Insertion order matters: predict_autism walks the sections in
# this order to slice the flat sequence of responses.
sections = {
    'A1a': 8,
    'A1b': 4,
    'A1c': 3,
    'A2a': 7,
    'A2b': 1,
    'A2c': 1,
    'A2d': 4
}

# Expected response for each question, in the row order of the CSV.
# NOTE(review): assumes the CSV rows are ordered section by section, in the
# same order as `sections` -- confirm against the dataset.
expected_responses = data['Expected_Response'].tolist()

# Fail fast: with fewer expected responses than questions, predict_autism
# would only fail with an IndexError on its first call, i.e. after all the
# random samples have already been generated.
_required_responses = sum(sections.values())
if len(expected_responses) < _required_responses:
    raise ValueError(
        f"'{csv_file_path}' provides {len(expected_responses)} expected "
        f"responses, but the sections define {_required_responses} questions"
    )

# Function to check conditions and predict autism
def predict_autism(response_combination, section_sizes=None, expected=None):
    """Classify one set of questionnaire responses as 'Autism' or 'No Autism'.

    A section counts as fulfilled when at least one of its responses equals
    the expected response for the same question. The prediction is 'Autism'
    when all three social-communication sections (A1a, A1b, A1c) are
    fulfilled and at least two of the four restrictive-behaviour sections
    (A2a, A2b, A2c, A2d) are fulfilled; otherwise it is 'No Autism'.

    Args:
        response_combination: Sequence of responses, one per question, laid
            out section by section in the iteration order of
            ``section_sizes``. Responses past the last section are ignored.
        section_sizes: Optional mapping of section name -> number of
            questions in that section. Defaults to the module-level
            ``sections``.
        expected: Optional sequence of expected responses, aligned
            position by position with ``response_combination``. Defaults to
            the module-level ``expected_responses``.

    Returns:
        The string 'Autism' or 'No Autism'.

    Raises:
        IndexError: If there are fewer responses, or fewer expected
            responses, than the sections require.
        KeyError: If a section needed for the decision is missing from
            ``section_sizes``.
    """
    if section_sizes is None:
        section_sizes = sections
    if expected is None:
        expected = expected_responses

    # zip() below would silently truncate short input, so check the lengths
    # explicitly. IndexError is what plain indexing raises for short input,
    # so existing handlers for it keep working.
    total = sum(section_sizes.values())
    if len(response_combination) < total or len(expected) < total:
        raise IndexError(
            f'need at least {total} responses and expected responses, got '
            f'{len(response_combination)} and {len(expected)}'
        )

    section_fulfilled = {}
    start = 0
    for section, count in section_sizes.items():
        stop = start + count
        # Fulfilled as soon as one answer in the section matches.
        section_fulfilled[section] = any(
            given == wanted
            for given, wanted in zip(response_combination[start:stop],
                                     expected[start:stop])
        )
        start = stop

    social_communication_fulfilled = all(
        section_fulfilled[sec] for sec in ('A1a', 'A1b', 'A1c')
    )
    restrictive_sections_fulfilled = sum(
        section_fulfilled[sec] for sec in ('A2a', 'A2b', 'A2c', 'A2d')
    )

    if social_communication_fulfilled and restrictive_sections_fulfilled >= 2:
        return 'Autism'
    return 'No Autism'

# Number of samples to generate
num_samples = 2500000  # Adjust this as needed

# One yes/no answer per question. Derived from the section layout so the
# sample width cannot drift from what predict_autism slices.
num_questions = sum(sections.values())

# Randomly sample response combinations. Each sample is drawn independently,
# so duplicate combinations are possible.
np.random.seed(42)  # For reproducibility
# A single vectorized draw of shape (num_samples, num_questions) replaces one
# np.random.choice call per sample; iterating the result still yields one
# 1-D array of answers per sample.
# NOTE(review): if bit-identical samples to an earlier per-sample run are
# required, compare against a previously generated file before relying on it.
combinations = np.random.choice(['yes', 'no'], size=(num_samples, num_questions))

# Create the DataFrame: one column per question plus the predicted label
columns = [f'Q{i+1}' for i in range(num_questions)] + ['Prediction']
# NOTE: this rebinds `data`, which previously held the loaded reference
# DataFrame; the expected responses have already been extracted from it.
data = [list(comb) + [predict_autism(comb)] for comb in combinations]

df = pd.DataFrame(data, columns=columns)

# Save to CSV
output_file_path = 'autism_combinations_predictions_sampled1.csv'
df.to_csv(output_file_path, index=False)

print(f'Dataset saved to {output_file_path}')


# Output: Dataset saved to autism_combinations_predictions_sampled1.csv
