In [32]:
import pandas as pd
import numpy as np
from faker import Faker
import json

In [33]:
def create_expert_validated_stress_dataset(num_students=200):
    """
    Create synthetic student dataset with stress-related features (excluding stress level)
    """
    np.random.seed(42)
    fake = Faker()
    
    # Research-based parameter ranges (from academic literature)
    PARAMETER_RANGES = {
        'gpa': (2.0, 4.0, 3.2, 0.5),  # min, max, mean, std
        'weekly_study_hours': (10, 50, 25, 8),
        'sleep_hours': (4, 10, 6.5, 1.2),
        'social_events': (0, 7, 2, 1.5),
        'exercise_hours': (0, 15, 3, 2)
    }
    
    data = []
    
    for i in range(num_students):
        # Generate research-backed features
        gpa = np.random.normal(PARAMETER_RANGES['gpa'][2], PARAMETER_RANGES['gpa'][3])
        gpa = np.clip(gpa, PARAMETER_RANGES['gpa'][0], PARAMETER_RANGES['gpa'][1])
        
        study_hours = np.random.normal(PARAMETER_RANGES['weekly_study_hours'][2], 
                                     PARAMETER_RANGES['weekly_study_hours'][3])
        study_hours = np.clip(study_hours, PARAMETER_RANGES['weekly_study_hours'][0], 
                            PARAMETER_RANGES['weekly_study_hours'][1])
        
        sleep_hours = np.random.normal(PARAMETER_RANGES['sleep_hours'][2], 
                                     PARAMETER_RANGES['sleep_hours'][3])
        sleep_hours = np.clip(sleep_hours, PARAMETER_RANGES['sleep_hours'][0], 
                            PARAMETER_RANGES['sleep_hours'][1])
        
        # Generate realistic correlations
        # Students with high study hours tend to have less social time
        social_events = max(0, PARAMETER_RANGES['social_events'][2] - 
                          (study_hours - 25) / 10 + np.random.normal(0, 1))
        social_events = np.clip(social_events, PARAMETER_RANGES['social_events'][0], 
                              PARAMETER_RANGES['social_events'][1])
        
        # Financial stress (realistic distribution)
        financial_stress = np.random.beta(2, 5) * 4 + 1  # Skewed toward lower financial stress
        
        # Work-life balance factors
        part_time_job = np.random.choice([0, 1], p=[0.7, 0.3])
        commute_time = np.random.exponential(0.4)  # Realistic commute distribution
        
        data.append({
            'student_id': f"STU_{i+1:04d}",
            'name': fake.name(),
            'gpa': round(gpa, 2),
            'weekly_study_hours': round(study_hours, 1),
            'sleep_hours_per_night': round(sleep_hours, 1),
            'social_events_per_week': round(social_events, 1),
            'exercise_hours_per_week': round(np.random.exponential(3), 1),
            'part_time_job_hours': part_time_job * np.random.randint(10, 25),
            'commute_time_minutes': round(commute_time * 60),
            'financial_stress_level': round(financial_stress, 1),
            'assignments_due_this_week': np.random.poisson(2.5),
            'classes_per_week': np.random.randint(12, 25),
            'extracurricular_hours': round(np.random.exponential(2), 1)
        })
    
    return pd.DataFrame(data)

In [34]:
def validate_dataset_quality(df):
    """
    Comprehensive validation of synthetic dataset quality
    """
    validation_results = {}
    
    # 1. Range validation
    validation_results['ranges'] = {
        'gpa_valid': str(df['gpa'].min() >= 2.0 and df['gpa'].max() <= 4.0),
        'sleep_hours_valid': str(df['sleep_hours_per_night'].min() >= 4.0 and df['sleep_hours_per_night'].max() <= 10.0),
        'study_hours_valid': str(df['weekly_study_hours'].min() >= 10 and df['weekly_study_hours'].max() <= 50)
    }
    
    # 2. Distribution validation
    validation_results['distributions'] = {
        'gpa_normal': str(abs(df['gpa'].skew()) < 1.0),
        'realistic_study_hours': str(df['weekly_study_hours'].std() > 5.0),  # Good variability
    }
    
    # 3. Logical consistency checks
    validation_results['logic'] = {
        'study_sleep_tradeoff': str(df['weekly_study_hours'].corr(df['sleep_hours_per_night']) < 0.3),
        'realistic_correlations': str(abs(df['gpa'].corr(df['weekly_study_hours'])) < 0.6),  # Some but not perfect correlation
    }
    
    # 4. Missing data check
    validation_results['completeness'] = {
        'no_missing_values': str(not df.isnull().any().any()),
        'consistent_data_types': str(all(df.dtypes != object))  # All numerical except names/IDs
    }
    
    # Add numerical metrics for better analysis
    validation_results['metrics'] = {
        'gpa_skew': float(df['gpa'].skew()),
        'study_hours_std': float(df['weekly_study_hours'].std()),
        'study_sleep_correlation': float(df['weekly_study_hours'].corr(df['sleep_hours_per_night'])),
        'gpa_study_correlation': float(df['gpa'].corr(df['weekly_study_hours']))
    }
    
    return validation_results

In [35]:
def generate_dataset_documentation(df, validation_results):
    """
    Create comprehensive documentation for the synthetic dataset
    """
    documentation = {
        'purpose': 'Synthetic dataset of student lifestyle and academic factors for educational research',
        'generation_method': 'Research-informed synthetic data generation based on established educational studies',
        'ethical_considerations': [
            'No real student data used',
            'All identifiers are synthetic',
            'Based on aggregate research findings rather than individual data',
            'Intended for educational research and model development only'
        ],
        'data_characteristics': {
            'total_students': len(df),
            'feature_count': len(df.columns) - 2,  # Excluding identifiers
            'feature_summary': {
                'gpa': {
                    'min': float(df['gpa'].min()),
                    'max': float(df['gpa'].max()),
                    'mean': float(df['gpa'].mean())
                },
                'weekly_study_hours': {
                    'min': float(df['weekly_study_hours'].min()),
                    'max': float(df['weekly_study_hours'].max()),
                    'mean': float(df['weekly_study_hours'].mean())
                }
            },
            'data_quality_validation': validation_results
        },
        'recommended_use_cases': [
            'Educational research prototyping',
            'Student behavior analysis',
            'Academic performance studies',
            'Lifestyle factor correlations'
        ],
        'limitations': [
            'Synthetic data may not capture all real-world complexities',
            'Based on generalized research findings',
            'Should be validated with real data before production use'
        ],
        'feature_descriptions': {
            'student_id': 'Unique synthetic identifier for each student',
            'name': 'Synthetic name generated using Faker library',
            'gpa': 'Grade Point Average (2.0-4.0 scale)',
            'weekly_study_hours': 'Total hours spent studying per week',
            'sleep_hours_per_night': 'Average hours of sleep per night',
            'social_events_per_week': 'Number of social activities attended weekly',
            'exercise_hours_per_week': 'Hours of physical exercise per week',
            'part_time_job_hours': 'Hours worked at part-time job (0 if no job)',
            'commute_time_minutes': 'Daily commute time in minutes',
            'financial_stress_level': 'Self-reported financial stress (1-5 scale)',
            'assignments_due_this_week': 'Number of assignments due in current week',
            'classes_per_week': 'Total hours of classes per week',
            'extracurricular_hours': 'Hours spent on extracurricular activities'
        }
    }
    return documentation

# Create and validate the dataset
print("Creating synthetic student lifestyle dataset...")
df = create_expert_validated_stress_dataset(300)
validation_results = validate_dataset_quality(df)
documentation = generate_dataset_documentation(df, validation_results)

print("Dataset Validation Results:")
print("=" * 50)
for category, checks in validation_results.items():
    if category != 'metrics':  # Skip metrics for this summary
        print(f"\n{category.upper()}:")
        for check, passed in checks.items():
            status = "PASS" if passed.lower() == 'true' else "FAIL"
            print(f"  {check}: {status}")

print(f"\nDataset created with {len(df)} records")
print(f"Features included: {list(df.columns)}")
print(f"Average GPA: {df['gpa'].mean():.2f}")
print(f"Average study hours: {df['weekly_study_hours'].mean():.1f}")

# Save dataset
df.to_csv('student_lifestyle_dataset.csv', index=False)
print("\nDataset saved as 'student_lifestyle_dataset.csv'")

# Save documentation
with open('dataset_documentation.json', 'w') as f:
    json.dump(documentation, f, indent=2)
print("Documentation saved as 'dataset_documentation.json'")

# Display sample of the data
print("\nSample of the dataset:")
print(df.head(10))

# Basic statistics
print("\nDataset Statistics:")
print("=" * 50)
print(df.describe())

# Correlation analysis (useful for understanding relationships)
print("\nFeature Correlations:")
print("=" * 50)
correlation_matrix = df.select_dtypes(include=[np.number]).corr()
print(correlation_matrix.round(2))

Creating synthetic student lifestyle dataset...
Dataset Validation Results:

RANGES:
  gpa_valid: PASS
  sleep_hours_valid: PASS
  study_hours_valid: PASS

DISTRIBUTIONS:
  gpa_normal: PASS
  realistic_study_hours: PASS

LOGIC:
  study_sleep_tradeoff: PASS
  realistic_correlations: PASS

COMPLETENESS:
  no_missing_values: PASS
  consistent_data_types: FAIL

Dataset created with 300 records
Features included: ['student_id', 'name', 'gpa', 'weekly_study_hours', 'sleep_hours_per_night', 'social_events_per_week', 'exercise_hours_per_week', 'part_time_job_hours', 'commute_time_minutes', 'financial_stress_level', 'assignments_due_this_week', 'classes_per_week', 'extracurricular_hours']
Average GPA: 3.17
Average study hours: 25.5

Dataset saved as 'student_lifestyle_dataset.csv'
Documentation saved as 'dataset_documentation.json'

Sample of the dataset:
  student_id               name   gpa  weekly_study_hours  \
0   STU_0001       Mary Trevino  3.45                23.9   
1   STU_0002      H