In [6]:
# STEP 0: DATA GENERATION WITH CORRELATED REALISTIC DISTRIBUTIONS
import pandas as pd
import numpy as np
import random

# Seed BOTH random sources. Names and IDs below are drawn with the stdlib
# `random` module, which np.random.seed() does not affect; without seeding
# it too, those columns would change on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

n = 2000  # number of synthetic student rows

# Single source of truth for the export path so the file written and the
# confirmation message can never disagree.
output_path = "student_performance_2000.csv"

first_names = ['Aarav', 'Vivaan', 'Aditya', 'Sai', 'Arjun', 'Anaya', 'Diya', 'Myra', 'Ishita', 'Kiara']
last_names = ['Sharma', 'Verma', 'Singh', 'Gupta', 'Rao', 'Kumar', 'Jain', 'Mehta', 'Patel', 'Reddy']
dept_codes = ['DS', 'AI', 'ML', 'CS']

# Random "First Last" name per student (duplicates are possible).
student_names = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(n)]
# IDs look like "SDS0001": 'S' + random department code + 4-digit running number.
student_ids = [f"S{random.choice(dept_codes)}{i:04d}" for i in range(1, n + 1)]

# NOTE: the order of the np.random calls below determines the generated
# values for a given seed; reordering them changes the dataset.

# Gender: majority Male/Female, few Others
genders = np.random.choice(['Male', 'Female', 'Other'], size=n, p=[0.48, 0.48, 0.04])

# Base GPA from a normal distribution, clipped to the 2.0-10.0 range
mu_gpa = 6.5
sigma_gpa = 1.2
gpa_vals = np.clip(np.random.normal(mu_gpa, sigma_gpa, n), 2.0, 10.0)

# Attendance: positively correlated with GPA (with some noise), clipped to 40-100
attendance = np.clip(gpa_vals * 10 + np.random.normal(0, 10, n), 40, 100)

# Study Hours: positively correlated with GPA (more spread), clipped to 0-40
study_hours = np.clip(gpa_vals * 2.5 + np.random.normal(0, 5, n), 0, 40)

# Participation: correlated with GPA and attendance, clipped to 0-100
participation = np.clip((gpa_vals * 10 + attendance * 0.3 + np.random.normal(0, 10, n)) / 2, 0, 100)

# Scores: strongly correlated with GPA; noise grows from assignments to finals
assignment_scores = np.clip(gpa_vals * 10 + np.random.normal(0, 10, n), 0, 100)
midterm_scores = np.clip(gpa_vals * 10 + np.random.normal(0, 12, n), 0, 100)
final_scores = np.clip(gpa_vals * 10 + np.random.normal(0, 15, n), 0, 100)

# Internet access: 'Yes' when noisy study hours exceed 12
internet = np.where(study_hours + np.random.normal(0, 5, n) > 12, 'Yes', 'No')

# Part-time job: 'Yes' when noisy study hours fall below 10
part_time = np.where(study_hours + np.random.normal(0, 5, n) < 10, 'Yes', 'No')

# Dropout risk: flagged only when GPA, attendance AND study hours are all
# below their thresholds at the same time (conditions are combined with &).
risk_mask = (gpa_vals < 4.5) & (attendance < 60) & (study_hours < 5)
dropout_risk = np.where(risk_mask, 'Yes', 'No')

# Final DataFrame
df = pd.DataFrame({
    'Student_ID': student_ids,
    'Student_Name': student_names,
    'Gender': genders,
    'Age': np.random.randint(17, 26, n),  # upper bound exclusive: ages 17-25
    'Attendance (%)': attendance.round(),
    'GPA (last semester)': np.round(gpa_vals, 2),
    'Study_Hours/week': study_hours.round(1),
    'Participation (%)': participation.round(),
    'Assignments_Score': assignment_scores.round(),
    'Midterm_Score': midterm_scores.round(),
    'Final_Score': final_scores.round(),
    'Internet_Access': internet,
    'Part_time_Job': part_time,
    'Dropout_Risk': dropout_risk,
})

# Export to CSV (without the pandas index column)
df.to_csv(output_path, index=False)
print(f"Correlated and realistic dataset generated and saved as {output_path}")


Correlated and realistic dataset generated and saved as student_cleaned_for_excel.csv
