In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Size of the synthetic cohort (one row per student)
n_students = 1000

# Seed NumPy's global RNG so every run draws the same values
np.random.seed(42)

# Generate features
def generate_dataset(num_students=None):
    """Generate a synthetic student dataset with a performance score and grade.

    Draws per-student features from NumPy's global random state, adds three
    pairwise interaction columns, derives a ``performance`` score from the
    standardized numeric features, and bins that score into letter grades.

    Args:
        num_students: Number of rows to generate. When ``None`` (the default)
            the module-level ``n_students`` is used, so existing zero-argument
            calls behave as before.

    Returns:
        pandas.DataFrame with one row per student containing the raw feature
        columns, the three interaction columns, ``performance`` (clipped to
        the range [0, 100]) and ``performance_category`` (one of 'F', 'D',
        'C', 'B', 'A').
    """
    if num_students is None:
        # Fall back to the module-level default so existing calls are unchanged.
        num_students = n_students

    # NOTE: the draws below consume NumPy's global random state in this exact
    # order; reordering the entries changes the data produced for a given seed.
    data = {
        'student_id': range(1, num_students + 1),
        'age': np.random.randint(18, 30, num_students),  # upper bound is exclusive
        'gender': np.random.choice(['Male', 'Female'], num_students),
        'previous_education': np.random.choice(['High School', 'Associate', 'Bachelor'], num_students),
        'study_hours': np.random.uniform(1, 10, num_students),
        'sleep_hours': np.random.uniform(6, 9, num_students),
        'attendance_rate': np.random.uniform(0.7, 1, num_students),
        'family_income': np.random.choice(['Low', 'Medium', 'High'], num_students),
        'parent_education': np.random.choice(['Primary', 'Secondary', 'Higher'], num_students),
        'extracurricular_activities': np.random.choice(['Yes', 'No'], num_students),
        'study_group': np.random.choice(['Yes', 'No'], num_students),
        'stress_level': np.random.randint(1, 6, num_students),  # values 1..5
        'online_courses': np.random.randint(0, 5, num_students),  # values 0..4
        'internet_access': np.random.choice(['Yes', 'No'], num_students),
        'travel_time': np.random.uniform(0.1, 1, num_students),  # in hours
    }

    # Create DataFrame
    df = pd.DataFrame(data)

    # Introduce interaction terms
    df['study_sleep_interaction'] = df['study_hours'] * df['sleep_hours']
    df['attendance_study_interaction'] = df['attendance_rate'] * df['study_hours']
    df['stress_online_interaction'] = df['stress_level'] * df['online_courses']

    # Numeric columns (raw and interaction) that feed the performance score
    features_for_performance = [
        'age',
        'study_hours',
        'sleep_hours',
        'attendance_rate',
        'stress_level',
        'online_courses',
        'travel_time',
        'study_sleep_interaction',
        'attendance_study_interaction',
        'stress_online_interaction',
    ]
    X = df[features_for_performance]

    # Standardize the features so each column contributes on the same scale
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # The score is a deterministic function of the features: a baseline of 70
    # plus 20 times the row-wise sum of standardized values (no noise term).
    performance = 70 + 20 * X_scaled.sum(axis=1)

    # Clip performance to be between 0 and 100
    df['performance'] = np.clip(performance, 0, 100)

    # Categorize performance. pd.cut bins are right-closed and exclude the
    # lowest edge by default, so a score clipped to exactly 0 would fall
    # outside (0, 60] and be labelled NaN; include_lowest=True keeps it in 'F'.
    df['performance_category'] = pd.cut(
        df['performance'],
        bins=[0, 60, 70, 80, 90, 100],
        labels=['F', 'D', 'C', 'B', 'A'],
        include_lowest=True,
    )

    return df

# Generate the dataset
student_data = generate_dataset()

# Save to CSV (index=False keeps the row index out of the file)
student_data.to_csv('student_data.csv', index=False)

print("Dataset generated and saved as 'student_data.csv'")
print(student_data.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
student_data.info()
