# Data Cleaning + 5-Chart EDA (Template)

**Purpose:** Fast, standardized delivery for Fiverr clients.  
**Inputs:** A CSV/XLSX file path.  
**Outputs:** Cleaned dataset, 5 charts (PNGs), and a short insights summary.


In [1]:

import os, pandas as pd, numpy as np, matplotlib.pyplot as plt
from datetime import datetime
plt.rcParams['figure.figsize'] = (8,4)

RAW_PATH = "ecommerce_raw.csv"      # replace with client file
CLEAN_PATH = "ecommerce_clean.csv"
CHARTS_DIR = "charts"
os.makedirs(CHARTS_DIR, exist_ok=True)

def save_chart(fig, name):
    fig.tight_layout()
    fig.savefig(os.path.join(CHARTS_DIR, name), dpi=160, bbox_inches="tight")
    plt.close(fig)


In [2]:

# 1) Load and basic cleaning (adjust rules as needed)
df = pd.read_csv(RAW_PATH)

# Standardize whitespace/case for object columns
for c in df.select_dtypes(include=['object']).columns:
    df[c] = df[c].astype(str).str.strip()

# Try parsing any column that looks like a date
for c in df.columns:
    if any(k in c.lower() for k in ["date","time","at"]):
        df[c] = pd.to_datetime(df[c], errors="coerce")

# Numeric coercion
for c in df.columns:
    if df[c].dtype == 'object':
        try:
            df[c] = pd.to_numeric(df[c], errors='ignore')
        except Exception:
            pass

# Null handling (simple defaults; customize per project)
df = df.drop_duplicates()
# Optional: fill numeric nulls with median
for c in df.select_dtypes(include=[np.number]).columns:
    df[c] = df[c].fillna(df[c].median())

df.to_csv(CLEAN_PATH, index=False)
print("Saved cleaned dataset to", CLEAN_PATH)


Saved cleaned dataset to ecommerce_clean.csv


In [3]:

# 2) Generate 5 charts (fallback selections if not specified by client)
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if len(numeric_cols) >= 2:
    x, y = numeric_cols[:2]
else:
    # synthesize numeric index if needed
    df["__row__"] = range(len(df))
    numeric_cols = ["__row__"] + numeric_cols
    x, y = numeric_cols[:2]

# Chart 1: Histogram of first numeric
fig, ax = plt.subplots()
ax.hist(df[numeric_cols[0]].dropna(), bins=40)
ax.set_title(f"Distribution of {numeric_cols[0]}")
ax.set_xlabel(numeric_cols[0]); ax.set_ylabel("Count")
save_chart(fig, "1_hist.png")

# Chart 2: Scatter of first two numeric
fig, ax = plt.subplots()
ax.scatter(df[x], df[y], alpha=0.6)
ax.set_title(f"{x} vs {y}")
ax.set_xlabel(x); ax.set_ylabel(y)
save_chart(fig, "2_scatter.png")

# Chart 3: Boxplot of first numeric by top category (if any)
cat_cols = df.select_dtypes(exclude=[np.number, 'datetime64[ns]']).columns.tolist()
if cat_cols:
    cat = cat_cols[0]
    # keep only top 10 categories
    top10 = df[cat].value_counts().index[:10]
    fig, ax = plt.subplots()
    df[df[cat].isin(top10)].boxplot(column=numeric_cols[0], by=cat, rot=45, ax=ax)
    ax.set_title(f"{numeric_cols[0]} by {cat}")
    ax.set_xlabel(cat); ax.set_ylabel(numeric_cols[0])
    plt.suptitle("")
    save_chart(fig, "3_box_by_cat.png")

# Chart 4: Time series if any datetime column exists
date_cols = df.select_dtypes(include=['datetime64[ns]']).columns.tolist()
if date_cols and numeric_cols:
    tcol = date_cols[0]
    ts = df[[tcol, numeric_cols[0]]].dropna()
    ts = ts.groupby(pd.Grouper(key=tcol, freq='D'))[numeric_cols[0]].sum().reset_index()
    fig, ax = plt.subplots()
    ax.plot(ts[tcol], ts[numeric_cols[0]])
    ax.set_title(f"Daily {numeric_cols[0]}")
    ax.set_xlabel("Date"); ax.set_ylabel(numeric_cols[0])
    save_chart(fig, "4_timeseries.png")

# Chart 5: Correlation heatmap (if >=3 numeric)
if len(numeric_cols) >= 3:
    corr = df[numeric_cols].corr(numeric_only=True)
    fig, ax = plt.subplots()
    cax = ax.imshow(corr.values, interpolation='nearest')
    ax.set_title("Correlation Heatmap")
    ax.set_xticks(range(len(corr.columns))); ax.set_xticklabels(corr.columns, rotation=90)
    ax.set_yticks(range(len(corr.columns))); ax.set_yticklabels(corr.columns)
    fig.colorbar(cax)
    save_chart(fig, "5_corr_heatmap.png")


TypeError: corr() got an unexpected keyword argument 'numeric_only'


### Delivery Notes (copy into your PDF summary)
- **Data health**: duplicates removed; numeric nulls filled with median (customize as needed).
- **Key patterns**: (write 3–5 bullet points referencing the charts)
- **Outliers/Warnings**: (mention anomalies, missing ranges, unexpected spikes)
- **Next steps**: (what simple analysis/modeling could help their decision)
