In [3]:
import pandas as pd
import os

# Step 1: Read the raw student-performance CSV into the working DataFrame `df`
print("📥 Loading dataset...")
file_path = '../data/raw/StudentsPerformance.csv'  # relative to the notebook's working directory
df = pd.read_csv(filepath_or_buffer=file_path)
print(f"✅ Dataset loaded successfully with shape: {df.shape}")

# Step 2: Clean Column Names
# Normalizes every header to lower-case snake_case and shortens two long names.
print("\n🧼 Cleaning column names...")
# Strip surrounding whitespace FIRST: if spaces are replaced before stripping,
# leading/trailing blanks become underscores and strip() has nothing left to remove.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)  # literal space, not a regex pattern
)
# Shorter aliases for the two longest headers; columns not listed keep their name.
df.rename(columns={
    'parental_level_of_education': 'parent_edu',
    'test_preparation_course': 'prep_course'
}, inplace=True)
print("✅ Columns cleaned and standardized.\n")

# Step 3: Dataset Overview
# Prints the row/column counts, each column's dtype, and per-column null counts.
print("📊 Dataset Overview:")
print(f"Total Rows: {len(df)}")
print(f"Total Columns: {len(df.columns)}\n")

print("🔍 Column Data Types:")
print(df.dtypes)

print("\n🕳️ Missing Values:")
print(df.isna().sum())

# Step 4: Add Derived Features
# Adds two columns to `df`:
#   average_score - row-wise mean of the three subject scores
#   performance   - categorical band derived from average_score
print("\n➕ Adding derived features...")

# Calculate average score (mean across the three score columns of each row)
df['average_score'] = df[['math_score', 'reading_score', 'writing_score']].mean(axis=1)

# Categorize performance levels.
# Bins are right-closed: (0, 60] Poor, (60, 70] Average, (70, 80] Good,
# (80, 90] Very Good, (90, 100] Excellent -- so exactly 60 is still 'Poor'.
# include_lowest=True closes the first bin on the left as well, so an average
# of exactly 0 is labelled 'Poor' instead of silently becoming NaN.
df['performance'] = pd.cut(df['average_score'],
                           bins=[0, 60, 70, 80, 90, 100],
                           labels=['Poor', 'Average', 'Good', 'Very Good', 'Excellent'],
                           include_lowest=True)

print("✅ Features 'average_score' and 'performance' added.")

# Step 5: Write the cleaned DataFrame to disk
print("\n💾 Saving cleaned dataset...")
# Make sure the target folder exists; exist_ok avoids an error on re-runs.
os.makedirs(name='../data/cleaned', exist_ok=True)
# index=False keeps the DataFrame's row index out of the CSV.
df.to_csv(path_or_buf='../data/cleaned/students_cleaned.csv', index=False)
print("✅ Cleaned dataset saved to '../data/cleaned/students_cleaned.csv'")

# Step 6: Summary Statistics
# How many rows fall into each performance band
print("\n📊 Performance Category Counts:")
category_counts = df['performance'].value_counts()
print(category_counts)

# describe(include='all') summarizes every column, not just the numeric ones
print("\n📈 Descriptive Statistics:")
summary_stats = df.describe(include='all')
print(summary_stats)


📥 Loading dataset...
✅ Dataset loaded successfully with shape: (1000, 8)

🧼 Cleaning column names...
✅ Columns cleaned and standardized.

📊 Dataset Overview:
Total Rows: 1000
Total Columns: 8

🔍 Column Data Types:
gender            object
race/ethnicity    object
parent_edu        object
lunch             object
prep_course       object
math_score         int64
reading_score      int64
writing_score      int64
dtype: object

🕳️ Missing Values:
gender            0
race/ethnicity    0
parent_edu        0
lunch             0
prep_course       0
math_score        0
reading_score     0
writing_score     0
dtype: int64

➕ Adding derived features...
✅ Features 'average_score' and 'performance' added.

💾 Saving cleaned dataset...
✅ Cleaned dataset saved to '../data/cleaned/students_cleaned.csv'

📊 Performance Category Counts:
performance
Poor         293
Average      260
Good         253
Very Good    144
Excellent     50
Name: count, dtype: int64

📈 Descriptive Statistics:
        gender race/