In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Size of the synthetic cohort (one row per student)
n_students = 1_000

# Seed NumPy's global RNG so repeated runs draw the same values
np.random.seed(42)

# Generate features
def generate_dataset(num_students=None):
    """Build a synthetic student-performance DataFrame.

    Every feature column is drawn independently from NumPy's global RNG
    (``np.random.*``), so the result is only reproducible if the caller has
    seeded that RNG beforehand. The columns are drawn in the order they are
    listed in ``data`` below; changing that order changes the generated values.

    The target is ``70 + 10 * (sum of the standardized numeric features)``
    plus Gaussian noise (mean 0, std 5), clipped to ``[0, 100]``. All seven
    numeric features enter the sum with the same positive weight.

    Args:
        num_students: Number of rows to generate. When ``None`` (the default)
            the module-level ``n_students`` is used.

    Returns:
        A ``pandas.DataFrame`` with one row per student containing the raw
        feature columns, a float ``performance`` column and a categorical
        ``performance_category`` column with letter grades:
        F = [0, 60], D = (60, 70], C = (70, 80], B = (80, 90], A = (90, 100].

    Raises:
        ValueError: If the requested number of students is less than 1.
    """
    n_rows = n_students if num_students is None else num_students
    if n_rows < 1:
        raise ValueError("number of students must be at least 1")

    data = {
        'student_id': range(1, n_rows + 1),
        'age': np.random.randint(18, 30, n_rows),  # upper bound exclusive: 18..29
        'gender': np.random.choice(['Male', 'Female'], n_rows),
        'previous_education': np.random.choice(['High School', 'Associate', 'Bachelor'], n_rows),
        'study_hours': np.random.uniform(0, 10, n_rows),
        'sleep_hours': np.random.uniform(4, 10, n_rows),
        'attendance_rate': np.random.uniform(0.5, 1, n_rows),
        'family_income': np.random.choice(['Low', 'Medium', 'High'], n_rows),
        'parent_education': np.random.choice(['Primary', 'Secondary', 'Higher'], n_rows),
        'extracurricular_activities': np.random.choice(['Yes', 'No'], n_rows),
        'study_group': np.random.choice(['Yes', 'No'], n_rows),
        'stress_level': np.random.randint(1, 11, n_rows),  # 1..10
        'online_courses': np.random.randint(0, 5, n_rows),  # 0..4
        'internet_access': np.random.choice(['Yes', 'No'], n_rows),
        'travel_time': np.random.uniform(0.1, 2, n_rows),  # in hours
    }

    # Create DataFrame
    df = pd.DataFrame(data)

    # Only the numeric columns feed the performance score (target variable)
    features_for_performance = [
        'age', 'study_hours', 'sleep_hours', 'attendance_rate',
        'stress_level', 'online_courses', 'travel_time',
    ]
    X = df[features_for_performance]

    # Standardize so that each feature contributes on the same scale
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Generate performance with some randomness
    performance = 70 + 10 * X_scaled.sum(axis=1) + np.random.normal(0, 5, n_rows)

    # Clip performance to be between 0 and 100
    df['performance'] = np.clip(performance, 0, 100)

    # Categorize performance. Bins are right-closed, so without
    # include_lowest=True the first bin would be (0, 60] and every score that
    # was clipped to exactly 0 would end up with a NaN category instead of 'F'.
    df['performance_category'] = pd.cut(
        df['performance'],
        bins=[0, 60, 70, 80, 90, 100],
        labels=['F', 'D', 'C', 'B', 'A'],
        include_lowest=True,
    )

    return df

# Generate the dataset
student_data = generate_dataset()

# Save to CSV (index=False keeps the DataFrame's row index out of the file)
student_data.to_csv('student_data.csv', index=False)

print("Dataset generated and saved as 'student_data.csv'")
print(student_data.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
student_data.info()

Dataset generated and saved as 'student_data.csv'
   student_id  age  gender previous_education  study_hours  sleep_hours  \
0           1   24  Female        High School     4.394050     9.510559   
1           2   21  Female        High School     6.129396     9.705572   
2           3   28  Female           Bachelor     9.430758     7.462736   
3           4   25    Male          Associate     2.406927     6.142727   
4           5   22  Female        High School     1.215014     8.725295   

   attendance_rate family_income parent_education extracurricular_activities  \
0         0.737798        Medium          Primary                         No   
1         0.703874           Low           Higher                         No   
2         0.580609          High           Higher                        Yes   
3         0.828205           Low        Secondary                         No   
4         0.985575          High        Secondary                         No   

  study_group  str