In [74]:
# ================================================================
# TASK 1: IMPORTS & FILE LIST (process ALL datasets)
# ================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# CSV inputs to process, listed one per line.
FILES = [
    "student-mat.csv",
    "student-por.csv",
]

In [1]:
# ================================================================
# TASK 2: SMART CSV LOADER (detect separators + fix 1-column issue)
# ================================================================

def smart_read(path):
    """Read a delimited text file into a DataFrame, guessing the separator.

    The first line of the file is inspected and the first of ``;``, ``,``,
    tab or ``|`` found in it is passed to ``pd.read_csv`` as ``sep``. If none
    of them occurs, ``pd.read_csv`` is called with its defaults.

    If the parse still yields exactly one column and that column's header
    contains ``;`` or ``,`` (e.g. every line of the file is wrapped in
    quotes), the header and all rows are split on that delimiter. Column
    names come from the header and every data row is kept. A single column
    whose header contains neither delimiter is returned unchanged.

    Args:
        path: Location of the file to read.

    Returns:
        pandas.DataFrame with the parsed contents. Cells produced by the
        one-column fallback are strings.
    """
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        first = f.readline()

    for sep in [";", ",", "\t", "|"]:
        if sep in first:
            df = pd.read_csv(path, sep=sep)
            break
    else:
        df = pd.read_csv(path)

    if df.shape[1] != 1:
        return df

    # read_csv has already consumed the header line as the column label, so
    # the real column names live in that label, not in the first data row.
    header = str(df.columns[0])
    inner_sep = next((s for s in (";", ",") if s in header), None)
    if inner_sep is None:
        # Genuinely single-column data: nothing to split.
        return df

    names = header.split(inner_sep)
    raw = df.iloc[:, 0].astype(str)
    if raw.empty:
        # Header-only file: keep the column names, no rows.
        return pd.DataFrame(columns=names)

    # Cap the number of splits so an over-long row keeps its surplus text in
    # the last column; reindex pads the frame when every row is too short.
    parts = raw.str.split(inner_sep, n=len(names) - 1, expand=True)
    parts = parts.reindex(columns=range(len(names)))
    parts.columns = names
    return parts.reset_index(drop=True)


In [76]:
# ============================================
# TASK 3–10: FULL LOOP (runs for BOTH datasets)
# ============================================
# Placeholder strings that mark a missing value in a text column once the
# surrounding whitespace has been stripped.
MISSING_TOKENS = ["", "NA", "na", "?"]

# For each input file: load, normalise text, coerce numerics, impute missing
# values, derive the pass/fail target and write a cleaned copy.
for FILE in FILES:

    print("\n============================================")
    print("PROCESSING:", FILE)
    print("============================================")

    df = smart_read(FILE)
    print("Loaded shape:", df.shape)

    # Clean strings (non-string column labels are left as they are)
    df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]
    for col in df.select_dtypes(include=["object"]).columns:
        # Record real nulls first: astype(str) would turn NaN into the text
        # "nan", which would then escape the "missing" fill further down.
        was_missing = df[col].isna()
        text = df[col].astype(str).str.strip()
        df[col] = text.mask(was_missing | text.isin(MISSING_TOKENS))

    # Convert numerics
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            # Column holds non-numeric values; keep it as text.
            continue

    # Handle missing values: median for numeric columns, a sentinel label
    # for text columns.
    num_cols = df.select_dtypes(include=[np.number]).columns
    cat_cols = df.select_dtypes(include=["object"]).columns

    for c in num_cols:
        df[c] = df[c].fillna(df[c].median())
    for c in cat_cols:
        df[c] = df[c].fillna("missing")

    # Target: pass/fail (1 when G3 is at least 10, else 0)
    if "G3" in df.columns:
        df["passed"] = (df["G3"] >= 10).astype(int)

    # Save cleaned file
    out_name = f"cleaned_{FILE.replace('.csv','')}.csv"
    df.to_csv(out_name, index=False)
    print("[SAVED]", out_name)


PROCESSING: student-mat.csv
Loaded shape: (395, 33)
[SAVED] cleaned_student-mat.csv

PROCESSING: student-por.csv
Loaded shape: (649, 33)
[SAVED] cleaned_student-por.csv
