In [None]:
#################### CONCATENATE ALL EXCEL FILES ##########################
import os
import pandas as pd

# Merge the 'Clean' sheet of every Excel workbook in `folder` into one CSV.
#
# A workbook is merged only when the column names of its 'Clean' sheet are
# exactly the set in `standard_columns`; every other file is recorded in one of
# the tracking lists below and reported at the end.
#
# NOTE(review): `standard_columns` is not defined in this cell — it is expected
# to come from an earlier cell; confirm it exists before running.

# Folder containing the AHJ Excel workbooks
folder = 'clean'

# Only Excel workbooks can be opened with pd.ExcelFile, so select those
# (the earlier '.csv' filter picked files that can never contain a sheet).
# Sorting keeps the row order of the merged output reproducible, because
# os.listdir() returns entries in arbitrary order.
excel_suffixes = ('.xlsx', '.xlsm', '.xls')
files = sorted(f for f in os.listdir(folder) if f.lower().endswith(excel_suffixes))

# Destination of the merged data (also used in the success message)
output_file = 'Clean_merged.csv'

# Track results
valid_dfs = []          # DataFrames whose columns match the standard set
missing_clean = []      # file names without a 'Clean' sheet
load_errors = []        # (file name, error message) for files that raised
header_mismatches = []  # {'file', 'missing', 'extra'} per mismatching file

# Loop-invariant: the expected column set is the same for every file
std_cols = set(standard_columns)

# Check all files
for file in files:
    path = os.path.join(folder, file)
    try:
        # Open the workbook once and release the handle as soon as it is read
        with pd.ExcelFile(path) as xls:
            if 'Clean' not in xls.sheet_names:
                missing_clean.append(file)
                continue
            df = xls.parse('Clean')

        df_cols = set(df.columns)

        if df_cols == std_cols:
            # Reorder to the standard column order and store
            valid_dfs.append(df[standard_columns])
        else:
            # Record the differences in a stable, readable order
            header_mismatches.append({
                'file': file,
                'missing': [c for c in standard_columns if c not in df_cols],
                'extra': [c for c in df.columns if c not in std_cols]
            })

    except Exception as e:
        # Best effort: one unreadable file must not abort the whole merge
        load_errors.append((file, str(e)))

# ✅ Merge valid files
if valid_dfs:
    combined_df = pd.concat(valid_dfs, ignore_index=True)
    combined_df.to_csv(output_file, index=False)
    print(f"\n✅ Successfully merged {len(valid_dfs)} files into '{output_file}' with {len(combined_df)} rows.")
else:
    print("\n⚠️ No valid files found for merging.")

# ⚠️ Report mismatches
if header_mismatches:
    print("\n⚠️ Files with missing or extra columns:")
    for entry in header_mismatches:
        print(f" - {entry['file']}")
        if entry['missing']:
            print(f"   Missing: {entry['missing']}")
        if entry['extra']:
            print(f"   Extra:   {entry['extra']}")

# ⚠️ Files missing Clean sheet
if missing_clean:
    print("\n⚠️ Files missing 'Clean' sheet:")
    for f in missing_clean:
        print(f" - {f}")

# ❌ Load errors
if load_errors:
    print("\n❌ Files that failed to load:")
    for f, err in load_errors:
        print(f" - {f}: {err}")

In [None]:
####################### STANDARDIZE DATA FORMAT OF FINAL FILE #######################

# Rewrite every date-like column of the merged CSV as 'YYYY-MM-DD' text and
# save the result to 'Clean_test_output.csv'.

# CSV version of the file
file = 'Clean_merged.csv'  # Replace with your file name

try:
    df = pd.read_csv(file)

    # 🔍 Auto-detect columns with 'date' in the name
    # NOTE(review): this is a plain substring match, so names such as
    # "update" or "validated" are selected too — confirm that is intended.
    date_cols = [col for col in df.columns if 'date' in col.lower()]

    for col in date_cols:
        try:
            had_value = df[col].notna()
            parsed = pd.to_datetime(df[col], errors='coerce')

            # errors='coerce' turns unparseable entries into NaT, which ends up
            # as an empty cell in the output; count them so the loss is visible.
            lost = int((had_value & parsed.isna()).sum())

            df[col] = parsed.dt.strftime('%Y-%m-%d')
            print(f"✅ Standardized column: {col}")
            if lost:
                print(f"⚠️ {lost} value(s) in column {col} could not be parsed as dates and were left empty")
        except Exception as e:
            # Leave the column untouched and carry on with the remaining ones
            print(f"⚠️ Could not convert column {col}: {e}")

    # Save cleaned CSV
    df.to_csv('Clean_test_output.csv', index=False)
    print("📁 Saved cleaned file as 'Clean_test_output.csv'")

except Exception as e:
    print(f"❌ Error processing file: {e}")


In [None]:
################ Summary ####################

# Print a quick overview of the standardized CSV: size, column names, dtypes,
# null counts, the first rows, and a few distinct values per column.

# Location of the file to summarize
file_path = "Clean_test_output.csv"

# Read the dataset into memory
df = pd.read_csv(file_path)

# Overall size
row_count = len(df)
column_count = len(df.columns)
print("🔍 DATASET SUMMARY")
print(f"Total rows: {row_count}")
print(f"Total columns: {column_count}\n")

# Each section prints a heading line, then its payload followed by an extra newline
sections = (
    ("📌 Column names:", df.columns.tolist()),
    ("📊 Data types:", df.dtypes),
    ("❓ Missing values per column:", df.isnull().sum()),
    ("🔢 Sample data:", df.head()),
)
for heading, payload in sections:
    print(heading)
    print(payload, "\n")

# First five distinct non-null values of every column
print("🧩 Unique values (up to 5 per column):")
for col in df.columns:
    non_null = df[col].dropna()
    uniques = non_null.unique()[:5]
    print(f"{col}: {uniques}")
