In [1]:
"""
Clean and preprocess Student_performance_data.csv for BrightPath Academy.
Produces a cleaned dataset and a summary report with data inspection, missing values,
outlier treatment, and distributions.
"""

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import os

# Configuration
# Input / output locations, all resolved relative to the working directory.
# NOTE: the space before ".csv" is part of the actual input file name.
DATA_FILE = "Student_performance_data .csv"
CLEANED_FILE = "cleaned_student_performance_data.csv"
REPORT_FILE = "initial_summary_report.txt"
PLOT_DIR = Path("plots")

# Width of the IQR fence used when capping outliers.
IQR_MULTIPLIER = 1.5

# Columns imputed with the median and capped with the IQR rule.
NUMERIC_COLS = ["StudyTimeWeekly", "Absences", "GPA"]

# Columns imputed with the mode.
CATEGORICAL_COLS = [
    "Gender", "Ethnicity", "ParentalEducation",
    "Tutoring", "ParentalSupport",
    "Extracurricular", "Sports", "Music", "Volunteering",
    "GradeClass",
]

# Debug: show where the script is running and which files it will touch.
print(
    f"Current working directory: {os.getcwd()}",
    f"Input file path: {Path(DATA_FILE).absolute()}",
    f"Output file path: {Path(CLEANED_FILE).absolute()}",
    sep="\n",
)

def setup_environment() -> None:
    """
    Prepare plotting defaults and the plot output directory.

    Applies the seaborn "whitegrid" style, sets the default figure size to
    8x6 and creates ``PLOT_DIR`` if it does not exist yet.

    Raises:
        Exception: Any setup error is printed and then re-raised.
    """
    try:
        sns.set_style("whitegrid")
        plt.rcParams.update({"figure.figsize": (8, 6)})
        PLOT_DIR.mkdir(exist_ok=True)
        print("Environment setup complete.")
    except Exception as err:
        print(f"Error setting up environment: {err}")
        raise

def load_and_inspect_data(file_path: str) -> pd.DataFrame:
    """
    Load the dataset, print a quick inspection and start the summary report.

    The first five rows, the ``DataFrame.info()`` summary and the
    ``describe()`` statistics are printed to stdout and written to
    ``REPORT_FILE``. The report is opened in write mode, so an existing
    report is replaced.

    Args:
        file_path (str): Path to CSV file.

    Returns:
        pd.DataFrame: Loaded dataset.

    Raises:
        FileNotFoundError: If CSV file is missing.
        Exception: For other loading, inspection or report-writing errors
            (printed, then re-raised unchanged).
    """
    # Reading is handled on its own so that a FileNotFoundError raised while
    # writing the report is not misreported as a missing input file.
    try:
        data = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: {file_path} not found. Please ensure it exists in {os.getcwd()}")
        raise
    except Exception as e:
        print(f"Error loading data: {e}")
        raise

    try:
        head = data.head()
        stats = data.describe()

        print("First 5 rows of the dataset:")
        print(head)
        print("\nData Info:")
        # info() writes to stdout itself and returns None; wrapping it in
        # print() would add a stray "None" line to the output.
        data.info()
        print("\nBasic Statistics:")
        print(stats)

        with open(REPORT_FILE, "w") as report:
            report.write("Initial Summary Report\n" + "=" * 50 + "\n\n")
            report.write("First 5 Rows:\n" + str(head) + "\n\n")
            report.write("Data Info:\n")
            data.info(buf=report)
            report.write("\n\nBasic Statistics:\n" + str(stats) + "\n\n")
    except Exception as e:
        print(f"Error inspecting data: {e}")
        raise

    return data

def _imputation_value(column: pd.Series, strategy: str):
    """Return the median or mode of ``column``, or None if it has no usable value."""
    if strategy == "median":
        value = column.median()
        return None if pd.isna(value) else value
    # mode() ignores missing values, so an all-missing column yields an empty Series.
    modes = column.mode()
    return None if modes.empty else modes.iloc[0]


def handle_missing_values(
    data: pd.DataFrame, numeric_cols: list, categorical_cols: list
) -> pd.DataFrame:
    """
    Check and impute missing values.

    Numeric columns are filled with their median and categorical columns with
    their first mode. A column with no non-missing values cannot be imputed;
    it is left unchanged and the skip is recorded in the log instead of
    crashing or reporting a meaningless "nan" fill. The before/after counts
    and the imputation log are printed and appended to ``REPORT_FILE``.

    Args:
        data (pd.DataFrame): Input dataset. Modified in place.
        numeric_cols (list): Numeric column names.
        categorical_cols (list): Categorical column names.

    Returns:
        pd.DataFrame: The same dataset object with imputed values.

    Raises:
        Exception: Any error (for example a listed column that is not in
            ``data``) is printed and then re-raised.
    """
    try:
        print("\nMissing Values:")
        missing = data.isnull().sum()
        print(missing)

        imputation_log = []
        # Numeric columns first, then categorical, matching the report order.
        for cols, strategy in ((numeric_cols, "median"), (categorical_cols, "mode")):
            for col in cols:
                if missing[col] == 0:
                    continue
                fill_value = _imputation_value(data[col], strategy)
                if fill_value is None:
                    imputation_log.append(
                        f"Skipped {col}: no non-missing values to compute {strategy}"
                    )
                    continue
                data[col] = data[col].fillna(fill_value)
                imputation_log.append(f"Imputed {col} with {strategy}: {fill_value}")

        if imputation_log:
            print("Imputation Summary:")
            for entry in imputation_log:
                print(entry)

        # Computed once and reused for both the console and the report.
        remaining = data.isnull().sum()
        print("\nMissing Values After Imputation:")
        print(remaining)

        with open(REPORT_FILE, "a") as report:
            report.write("Missing Values Check:\n" + str(missing) + "\n")
            if imputation_log:
                report.write("Imputation Summary:\n" + "\n".join(imputation_log) + "\n")
            report.write("\nMissing Values After:\n" + str(remaining) + "\n\n")

        return data
    except Exception as e:
        print(f"Error handling missing values: {e}")
        raise

def cap_outliers(series: pd.Series, multiplier: float = 1.5) -> pd.Series:
    """
    Clip a column to its IQR fences.

    Values below ``Q1 - multiplier * IQR`` or above ``Q3 + multiplier * IQR``
    are replaced by the respective fence; all other values are unchanged.

    Args:
        series (pd.Series): Input column.
        multiplier (float): IQR multiplier for bounds.

    Returns:
        pd.Series: Capped column.

    Raises:
        Exception: Any error is printed and then re-raised.
    """
    try:
        first_quartile, third_quartile = (series.quantile(q) for q in (0.25, 0.75))
        spread = third_quartile - first_quartile
        floor = first_quartile - multiplier * spread
        ceiling = third_quartile + multiplier * spread
        return series.clip(lower=floor, upper=ceiling)
    except Exception as err:
        print(f"Error capping outliers: {err}")
        raise

def plot_boxplots(
    data: pd.DataFrame, cols: list, title: str, filename: str
) -> None:
    """
    Generate and save boxplots, one panel per column in a single row.

    The figure is written to ``PLOT_DIR / filename`` and is always closed,
    including when plotting or saving fails, so figures do not pile up
    after an error.

    Args:
        data (pd.DataFrame): Input dataset.
        cols (list): Columns to plot. Must not be empty.
        title (str): Plot title.
        filename (str): Output file name.

    Raises:
        ValueError: If ``cols`` is empty.
        Exception: Any other plotting or saving error (printed, then re-raised).
    """
    fig = None
    try:
        n_cols = len(cols)
        if n_cols == 0:
            raise ValueError("cols must contain at least one column name")
        # squeeze=False always returns a 2-D array of axes, so the
        # single-column case needs no special handling.
        fig, grid = plt.subplots(1, n_cols, figsize=(4 * n_cols, 4), squeeze=False)
        for ax, col in zip(grid[0], cols):
            sns.boxplot(y=data[col], ax=ax)
            ax.set_title(f"{col}")
        fig.suptitle(title)
        fig.tight_layout()
        target = PLOT_DIR / filename
        fig.savefig(target)
        print(f"Saved plot: {target}")
    except Exception as e:
        print(f"Error plotting boxplots: {e}")
        raise
    finally:
        # Close this specific figure rather than whichever one is current.
        if fig is not None:
            plt.close(fig)

def plot_distributions(
    data: pd.DataFrame, cols: list, title: str, filename: str
) -> None:
    """
    Generate and save histograms with KDE on a grid at most two panels wide.

    Only panels that have no column assigned are hidden. In particular a
    single-column call keeps its one panel visible. The figure is written to
    ``PLOT_DIR / filename`` and is always closed, including on failure.

    Args:
        data (pd.DataFrame): Input dataset.
        cols (list): Columns to plot. Must not be empty.
        title (str): Plot title.
        filename (str): Output file name.

    Raises:
        ValueError: If ``cols`` is empty.
        Exception: Any other plotting or saving error (printed, then re-raised).
    """
    fig = None
    try:
        n_cols = len(cols)
        if n_cols == 0:
            raise ValueError("cols must contain at least one column name")
        n_rows = (n_cols + 1) // 2
        # squeeze=False always returns a 2-D array of axes, so flatten()
        # works for one column as well as for many.
        fig, grid = plt.subplots(
            n_rows, min(n_cols, 2), figsize=(8, 4 * n_rows), squeeze=False
        )
        axes = grid.flatten()
        for ax, col in zip(axes, cols):
            sns.histplot(data[col], kde=True, ax=ax)
            ax.set_title(f"{col}")
        # Hide only the unused trailing panels. An odd column count on its
        # own does not mean there is a spare panel (a single column fills
        # the whole grid).
        for spare in axes[n_cols:]:
            spare.axis("off")
        fig.suptitle(title)
        fig.tight_layout()
        target = PLOT_DIR / filename
        fig.savefig(target)
        print(f"Saved plot: {target}")
    except Exception as e:
        print(f"Error plotting distributions: {e}")
        raise
    finally:
        # Close this specific figure rather than whichever one is current.
        if fig is not None:
            plt.close(fig)

def plot_categorical(
    data: pd.DataFrame, cols: list, title: str, filename: str
) -> None:
    """
    Generate and save count plots on a grid at most two panels wide.

    Only panels that have no column assigned are hidden. In particular a
    single-column call keeps its one panel visible. The figure is written to
    ``PLOT_DIR / filename`` and is always closed, including on failure.

    Args:
        data (pd.DataFrame): Input dataset.
        cols (list): Columns to plot. Must not be empty.
        title (str): Plot title.
        filename (str): Output file name.

    Raises:
        ValueError: If ``cols`` is empty.
        Exception: Any other plotting or saving error (printed, then re-raised).
    """
    fig = None
    try:
        n_cols = len(cols)
        if n_cols == 0:
            raise ValueError("cols must contain at least one column name")
        n_rows = (n_cols + 1) // 2
        # squeeze=False always returns a 2-D array of axes, so flatten()
        # works for one column as well as for many.
        fig, grid = plt.subplots(
            n_rows, min(n_cols, 2), figsize=(8, 4 * n_rows), squeeze=False
        )
        axes = grid.flatten()
        for ax, col in zip(axes, cols):
            sns.countplot(x=data[col], ax=ax)
            ax.set_title(f"{col}")
        # Hide only the unused trailing panels. An odd column count on its
        # own does not mean there is a spare panel (a single column fills
        # the whole grid).
        for spare in axes[n_cols:]:
            spare.axis("off")
        fig.suptitle(title)
        fig.tight_layout()
        target = PLOT_DIR / filename
        fig.savefig(target)
        print(f"Saved plot: {target}")
    except Exception as e:
        print(f"Error plotting categorical: {e}")
        raise
    finally:
        # Close this specific figure rather than whichever one is current.
        if fig is not None:
            plt.close(fig)

def main() -> None:
    """
    Run the full cleaning pipeline and write the cleaned data and report.

    Steps, in order: environment setup, load and inspect ``DATA_FILE``,
    impute missing values, plot boxplots before and after IQR capping of
    ``NUMERIC_COLS``, plot numeric and selected categorical distributions,
    save the cleaned CSV to ``CLEANED_FILE`` and append the remaining
    sections to ``REPORT_FILE``.

    Raises:
        Exception: Any pipeline error is printed and then re-raised.
    """
    # Each plot file name is defined once and used both for saving and for
    # the report text, so the report cannot drift from the files written.
    before_png = "outliers_before.png"
    after_png = "outliers_after.png"
    numeric_png = "distributions.png"
    categorical_png = "categorical_distributions.png"
    selected_categorical = ["GradeClass", "Gender", "Ethnicity", "ParentalSupport"]
    numeric_names = ", ".join(NUMERIC_COLS)

    def plot_path(name: str) -> str:
        # Forward slashes keep the report text identical on every platform.
        return (PLOT_DIR / name).as_posix()

    try:
        setup_environment()

        # Load and inspect
        data = load_and_inspect_data(DATA_FILE)

        # Handle missing values
        data = handle_missing_values(data, NUMERIC_COLS, CATEGORICAL_COLS)

        # Handle outliers
        plot_boxplots(data, NUMERIC_COLS, "Boxplots Before Outlier Treatment", before_png)
        for col in NUMERIC_COLS:
            data[col] = cap_outliers(data[col], IQR_MULTIPLIER)
            print(f"Outliers capped for {col}")
        plot_boxplots(data, NUMERIC_COLS, "Boxplots After Outlier Treatment", after_png)

        with open(REPORT_FILE, "a") as report:
            report.write("Outlier Treatment:\n")
            report.write(
                f"Applied IQR-based capping (multiplier={IQR_MULTIPLIER}) to {numeric_names}.\n"
            )
            report.write(
                f"Visualizations saved as {plot_path(before_png)} and {plot_path(after_png)}.\n\n"
            )

        # Plot distributions
        plot_distributions(data, NUMERIC_COLS, "Numeric Distributions", numeric_png)
        plot_categorical(
            data, selected_categorical, "Categorical Distributions", categorical_png
        )

        with open(REPORT_FILE, "a") as report:
            report.write("Distributions:\n")
            report.write(f"Numeric: {numeric_names} saved in {plot_path(numeric_png)}.\n")
            report.write(
                f"Categorical: {', '.join(selected_categorical)} saved in {plot_path(categorical_png)}.\n\n"
            )

        # Save cleaned dataset
        data.to_csv(CLEANED_FILE, index=False)
        print(f"Cleaned dataset saved as {CLEANED_FILE}")
        print(f"Saved at: {Path(CLEANED_FILE).absolute()}")

        # Finalize report
        with open(REPORT_FILE, "a") as report:
            report.write("Conclusion:\n")
            report.write(f"Dataset cleaned and saved as {CLEANED_FILE}.\n")
            report.write("All missing values handled and outliers capped.\n")
            report.write("Visualizations and summaries provided for further analysis.\n")

        print(f"Initial summary report saved as {REPORT_FILE}")
        print(f"Saved at: {Path(REPORT_FILE).absolute()}")
    except Exception as e:
        print(f"Error in main execution: {e}")
        raise

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


Current working directory: C:\Users\demic\Desktop\Project1
Input file path: C:\Users\demic\Desktop\Project1\Student_performance_data .csv
Output file path: C:\Users\demic\Desktop\Project1\cleaned_student_performance_data.csv
Environment setup complete.
First 5 rows of the dataset:
   StudentID  Age  Gender  Ethnicity  ParentalEducation  StudyTimeWeekly  \
0       1001   17       1          0                  2        19.833723   
1       1002   18       0          0                  1        15.408756   
2       1003   15       0          2                  3         4.210570   
3       1004   17       1          0                  3        10.028829   
4       1005   17       1          0                  2         4.672495   

   Absences  Tutoring  ParentalSupport  Extracurricular  Sports  Music  \
0         7         1                2                0       0      1   
1         0         0                1                0       0      0   
2        26         0                2 