In [1]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the semicolon-delimited bank dataset.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs on the author's machine. Inside a raw string each `\\` is two
# literal backslashes; presumably the OS tolerates the repeated separators
# since the cell produced output - confirm, and consider a relative path.
DATA_FILE = r"C:\\Users\\rashm\\NARESH_IT\\Dataset\\bank.csv"

# Load the raw data; every later cell reads from `df`.
df = pd.read_csv(DATA_FILE, sep=";")

# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,no,-333,yes,no,cellular,30,jul,329,5,-1,0,unknown,no
4517,57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,9,may,153,1,-1,0,unknown,no
4518,57,technician,married,secondary,no,295,no,no,cellular,19,aug,151,11,-1,0,unknown,no
4519,28,blue-collar,married,secondary,no,1137,no,no,cellular,6,feb,129,4,211,3,other,no


In [2]:

# Root folder that collects every artefact produced by this notebook.
BASE_DIR = Path("Automated_EDA_Output")

# Lookup key -> output directory under BASE_DIR. Every key matches its folder
# name except "stats", whose folder on disk is called "statistics".
SUBFOLDERS = {
    key: BASE_DIR.joinpath(dirname)
    for key, dirname in [
        ("data", "data"),
        ("plots", "plots"),
        ("tables", "tables"),
        ("stats", "statistics"),
        ("reports", "reports"),
    ]
}

# Create the whole tree up front. exist_ok=True keeps the cell re-runnable,
# parents=True creates BASE_DIR itself when it is missing.
for out_dir in SUBFOLDERS.values():
    out_dir.mkdir(parents=True, exist_ok=True)


In [3]:

# Split the columns into two name lists that drive every later cell:
#   cat - columns stored as object or category dtype
#   num - columns stored as int64 or float64
# Only those exact numeric widths are selected, so a column with another
# numeric dtype (e.g. int32 or float32) would end up in neither list.
categorical_part = df.select_dtypes(include=["object", "category"])
numeric_part = df.select_dtypes(include=["int64", "float64"])

cat = categorical_part.columns.to_list()
num = numeric_part.columns.to_list()



In [4]:

# Persist the full frame plus its numeric and categorical slices as CSV.
# Each entry is (frame to write, SUBFOLDERS key, file name).
# NOTE(review): the categorical slice is written under "tables" while the
# other two go to "data" - confirm that split is intentional.
csv_exports = [
    (df, "data", "complete_dataset.csv"),
    (df[num], "data", "numerical_data.csv"),
    (df[cat], "tables", "categorical_data.csv"),
]

for frame, folder_key, file_name in csv_exports:
    # index=False: the row index is not written as a column.
    frame.to_csv(SUBFOLDERS[folder_key] / file_name, index=False)



In [5]:
# Descriptive statistics for the numeric columns: one row per column,
# one column per statistic.
numeric = df[num]

stat_series = {
    "mean": numeric.mean(),
    "median": numeric.median(),
    # mode() returns a table (several rows when values tie);
    # only its first row is kept.
    "mode": numeric.mode().iloc[0],
    "min": numeric.min(),
    "max": numeric.max(),
    "std_dev": numeric.std(),
}
summary = pd.DataFrame(stat_series)

# The index (column names) is written too, so each row stays labelled.
summary_file = SUBFOLDERS["stats"] / "summary_statistics.csv"
summary.to_csv(summary_file)



In [6]:
# Applied globally: this style also affects every figure drawn afterwards.
plt.style.use("seaborn-v0_8-darkgrid")

# One 30-bin histogram with a KDE overlay per numeric column, saved as
# "<column>_distribution.png" in the plots folder.
for col in num:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(df[col], kde=True, bins=30, ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    fig.tight_layout()
    fig.savefig(SUBFOLDERS["plots"] / f"{col}_distribution.png")
    # Close each figure so open figures do not pile up across iterations.
    plt.close(fig)


In [7]:
# One bar chart of value counts per categorical column, saved as
# "<column>_distribution.png" in the plots folder.
for col in cat:
    # Columns with more than 20 distinct values are not plotted.
    if df[col].nunique() <= 20:
        fig, ax = plt.subplots(figsize=(8, 5))
        counts = df[col].value_counts()
        counts.plot(kind="bar", ax=ax)
        ax.set_title(f"Distribution of {col}")
        ax.set_xlabel(col)
        ax.set_ylabel("Count")
        fig.tight_layout()
        fig.savefig(SUBFOLDERS["plots"] / f"{col}_distribution.png")
        # Close each figure so open figures do not pile up across iterations.
        plt.close(fig)



In [8]:

# Plain-text report: dataset shape, column-type counts, missing values per
# column, and a fixed block of closing remarks.
report_path = SUBFOLDERS["reports"] / "Bank_report.txt"

# Explicit UTF-8 so the file does not depend on the platform's default
# encoding (column names are written verbatim and may be non-ASCII).
with open(report_path, "w", encoding="utf-8") as f:
    f.write(
        f"""AUTOMATED EDA REPORT
{"="*40}

Rows: {df.shape[0]}
Columns: {df.shape[1]}

Numerical Columns: {len(num)}
Categorical Columns: {len(cat)}

Missing Values Per Column:
"""
    )

    # One "column: count" entry per line. The terminator must be a real
    # newline ("\n"); an escaped backslash here would emit the two characters
    # backslash + n and run all entries together on a single line.
    for col, count in df.isnull().sum().items():
        f.write(f"{col}: {count}\n")

    # Static text: these remarks are not computed from the data.
    f.write(
        """
Insights:
- Numerical features show varied distributions
- Categorical features have dominant categories
- Dataset ready for preprocessing and modeling
"""
    )

print("Automated EDA completed successfully.")
print(f"All outputs saved in: {BASE_DIR}")

Automated EDA completed successfully.
All outputs saved in: Automated_EDA_Output
