In [5]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

# ---------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------

# Update this path to the location of your autos.csv
DATA_PATH = "/Users/nilufarkurbonova/Desktop/autos.csv"
OUTPUT_DIR = "auto"
SNAPSHOT_YEAR = 2016  # all cars are observed in 2016

# ---------------------------------------------------------------------
# Helper functions


def ensure_output_dir():
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    return OUTPUT_DIR


def load_raw_data(path: str = None, encoding: str = "latin-1") -> pd.DataFrame:
    path = path or DATA_PATH
    return pd.read_csv(path, encoding=encoding)


def save_cleaned(df: pd.DataFrame, name: str = "autos_clean.csv") -> str:
    ensure_output_dir()
    out_path = os.path.join(OUTPUT_DIR, name)
    df.to_csv(out_path, index=False)
    return out_path


def summary(df: pd.DataFrame) -> str:
    lines = []
    lines.append(f"Rows: {len(df):,}, Cols: {df.shape[1]}")
    lines.append("Columns: " + ", ".join(df.columns))

    na = df.isna().sum()
    if (na > 0).any():
        lines.append("Missing values:")
        for c, n in na[na > 0].sort_values(ascending=False).items():
            lines.append(f"  {c}: {n:,}")
    else:
        lines.append("Missing values: none")

    return "\n".join(lines)


# ---------------------------------------------------------------------
# Cleaning
# ---------------------------------------------------------------------

def clean_autos(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # keep only the columns we actually use
    cols = [
        "name",
        "price",
        "vehicleType",
        "yearOfRegistration",
        "gearbox",
        "powerPS",
        "model",
        "kilometer",
        "monthOfRegistration",
        "fuelType",
        "brand",
        "notRepairedDamage",
    ]
    df = df[cols]

    # convert to numeric where needed
    for c in ["price", "powerPS", "kilometer", "yearOfRegistration", "monthOfRegistration"]:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    # basic plausibility filters
    df = df[(df["yearOfRegistration"] >= 1950) & (df["yearOfRegistration"] <= SNAPSHOT_YEAR)]
    df = df[(df["price"] >= 100) & (df["price"] <= 100_000)]
    df = df[(df["powerPS"] >= 10) & (df["powerPS"] <= 1000)]

    # normalize German labels etc.
    repl = {
        "manuell": "manual",
        "automatik": "automatic",
        "benzin": "petrol",
        "diesel": "diesel",
        "elektro": "electric",
        "hybrid": "hybrid",
        "ja": "yes",
        "nein": "no",
    }
    for c in ["gearbox", "fuelType", "notRepairedDamage", "vehicleType", "brand", "model"]:
        if c in df.columns:
            df[c] = df[c].astype(str).str.strip().str.lower().replace(repl)

    # derived features
    df["car_age"] = SNAPSHOT_YEAR - df["yearOfRegistration"]
    df["km_per_year"] = (df["kilometer"] / np.maximum(df["car_age"], 1)).round(0)

    # drop rows missing core numeric fields
    df = df.dropna(subset=["price", "car_age", "powerPS", "kilometer"])

    return df


# ---------------------------------------------------------------------
# Plotting utilities
# ---------------------------------------------------------------------

def plot_hist(df, col, bins=50):
    ensure_output_dir()
    plt.figure()
    df[col].dropna().plot(kind="hist", bins=bins, edgecolor="black")
    plt.title(f"Distribution of {col}")
    plt.xlabel(col)
    plt.ylabel("Count")
    out = os.path.join(OUTPUT_DIR, f"hist_{col}.png")
    plt.savefig(out, bbox_inches="tight")
    plt.close()
    return out


def plot_top_counts(df, col, topn=15):
    ensure_output_dir()
    plt.figure()
    df[col].value_counts().head(topn).sort_values().plot(kind="barh")
    plt.title(f"Top {topn} {col}")
    plt.xlabel("Count")
    out = os.path.join(OUTPUT_DIR, f"top_{col}.png")
    plt.savefig(out, bbox_inches="tight")
    plt.close()
    return out


def plot_correlation_heatmap(df):
    ensure_output_dir()
    corr = df[["price", "car_age", "kilometer", "powerPS"]].corr()

    fig, ax = plt.subplots(figsize=(8, 6))
    im = ax.imshow(corr, cmap="coolwarm", vmin=-1, vmax=1)

    ax.set_xticks(np.arange(len(corr.columns)))
    ax.set_yticks(np.arange(len(corr.index)))
    ax.set_xticklabels(corr.columns, rotation=45, ha="right")
    ax.set_yticklabels(corr.index)

    for i in range(len(corr.index)):
        for j in range(len(corr.columns)):
            ax.text(j, i, f"{corr.iloc[i, j]:.2f}",
                    ha="center", va="center", color="white")

    cbar = plt.colorbar(im, ax=ax)
    cbar.set_label("Correlation")

    ax.set_title("Correlation Heatmap")
    plt.tight_layout()

    out = os.path.join(OUTPUT_DIR, "correlation_heatmap.png")
    plt.savefig(out)
    plt.close()
    return out


def plot_boxplot(df, by_col, topn=10):
    """
    Boxplot of price by a categorical variable.
    Limited to the top N most frequent categories for readability.
    """
    ensure_output_dir()
    cats = df[by_col].value_counts().head(topn).index.tolist()
    groups = [df.loc[df[by_col] == cat, "price"] for cat in cats]

    plt.figure(figsize=(8, 6))
    plt.boxplot(groups, labels=cats)
    plt.title(f"Price Distribution by {by_col} (top {topn} categories)")
    plt.xlabel(by_col)
    plt.ylabel("Price (€)")
    plt.xticks(rotation=45)

    out = os.path.join(OUTPUT_DIR, f"boxplot_price_by_{by_col}.png")
    plt.tight_layout()
    plt.savefig(out)
    plt.close()
    return out


def grouped_avg(df, by):
    """
    Average price by a categorical feature; saves CSV and returns both path and Series.
    """
    ensure_output_dir()
    agg = df.groupby(by)["price"].mean().sort_values(ascending=False)
    out_path = os.path.join(OUTPUT_DIR, f"avg_price_by_{by}.csv")
    agg.to_csv(out_path, header=["avg_price"])
    return out_path, agg


def scatter_plot(df, x, y="price", sample=5000):
    """
    Simple scatter plot, optionally sampling to keep the image light.
    """
    ensure_output_dir()
    pdf = df[[x, y]].dropna()
    if len(pdf) > sample:
        pdf = pdf.sample(sample, random_state=42)

    plt.figure()
    plt.scatter(pdf[x], pdf[y], s=5, alpha=0.5)
    plt.title(f"{y} vs {x}")
    plt.xlabel(x)
    plt.ylabel(y)
    out = os.path.join(OUTPUT_DIR, f"scatter_{y}_vs_{x}.png")
    plt.savefig(out, bbox_inches="tight")
    plt.close()
    return out


# ---------------------------------------------------------------------
# Forecasting helpers (depreciation over age)
# ---------------------------------------------------------------------

def predict_future_price(model, car_age, kilometer, powerPS):
    """
    Predict price for a given combination of car_age, kilometer, and powerPS
    using the trained linear regression model.
    """
    X_new = pd.DataFrame(
        {
            "car_age": [car_age],
            "kilometer": [kilometer],
            "powerPS": [powerPS],
        }
    )
    return float(model.predict(X_new)[0])


def plot_future_depreciation(model,
                             km_per_year: float,
                             powerPS: float,
                             max_age: int = 20):
    """
    Forecast depreciation curve for a 'typical' car:
    - km_per_year: assumed annual mileage
    - powerPS: assumed engine power
    - x-axis: car age in years (0..max_age)
    """
    ensure_output_dir()

    ages = list(range(0, max_age + 1))
    kilometers = [age * km_per_year for age in ages]

    X_new = pd.DataFrame(
        {
            "car_age": ages,
            "kilometer": kilometers,
            "powerPS": [powerPS] * len(ages),
        }
    )
    preds = model.predict(X_new)
    preds = np.maximum(preds, 0)  # avoid negative prices in the plot

    # save underlying data for your report
    forecast_df = pd.DataFrame(
        {
            "age_years": ages,
            "kilometers": kilometers,
            "predicted_price_eur": preds,
        }
    )
    data_path = os.path.join(OUTPUT_DIR, "depreciation_forecast_data.csv")
    forecast_df.to_csv(data_path, index=False)

    # plot
    plt.figure(figsize=(8, 6))
    plt.plot(ages, preds, marker="o")
    plt.title(
        "Predicted Depreciation Over Time\n"
        f"(powerPS ≈ {powerPS:.0f}, km/year ≈ {km_per_year:.0f})"
    )
    plt.xlabel("Car Age (years)")
    plt.ylabel("Predicted Price (€)")
    plt.grid(True)

    out = os.path.join(OUTPUT_DIR, "predicted_depreciation_curve.png")
    plt.savefig(out, bbox_inches="tight")
    plt.close()

    return out, data_path


# ---------------------------------------------------------------------
# Model training
# ---------------------------------------------------------------------

def simple_linear_model(df: pd.DataFrame):
    """
    Train a simple linear model:
        price ~ car_age + kilometer + powerPS

    Also creates an interpretable depreciation forecast using
    median km/year and median power from the dataset.
    """
    ensure_output_dir()

    use_cols = ["car_age", "kilometer", "powerPS"]
    X = df[use_cols].fillna(0)
    y = df["price"].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # metrics
    metrics = {
        "r2": float(r2_score(y_test, y_pred)),
        "mae": float(mean_absolute_error(y_test, y_pred)),
        "coefficients": dict(zip(use_cols, model.coef_.round(2))),
        "intercept": float(model.intercept_),
    }

    # choose a "typical" car for the forecast
    typical_km_per_year = float(df["km_per_year"].median())
    typical_powerPS = float(df["powerPS"].median())

    dep_plot_path, dep_data_path = plot_future_depreciation(
        model,
        km_per_year=typical_km_per_year,
        powerPS=typical_powerPS,
        max_age=20,
    )

    # add forecast info to metrics
    metrics["typical_km_per_year_for_forecast"] = typical_km_per_year
    metrics["typical_powerPS_for_forecast"] = typical_powerPS
    metrics["depreciation_curve_image"] = dep_plot_path
    metrics["depreciation_curve_data"] = dep_data_path

    # save metrics
    metrics_path = os.path.join(OUTPUT_DIR, "linear_model_metrics.json")
    with open(metrics_path, "w") as f:
        json.dump(metrics, f, indent=2)

    return model, metrics


# ---------------------------------------------------------------------
# Text summary for your report
# ---------------------------------------------------------------------

def write_summary():
    ensure_output_dir()
    lines = []
    lines.append("USED CAR PRICE ANALYSIS — SUMMARY\n")

    # Top brands
    brand_csv = os.path.join(OUTPUT_DIR, "avg_price_by_brand.csv")
    if os.path.exists(brand_csv):
        brand = pd.read_csv(brand_csv)
        top = brand.sort_values("avg_price", ascending=False).head(5)
        lines.append("Top 5 brands by average price:")
        for _, row in top.iterrows():
            lines.append(f" - {row.iloc[0]}: €{row['avg_price']:.0f}")
        lines.append("")

    # Linear model metrics
    metrics_path = os.path.join(OUTPUT_DIR, "linear_model_metrics.json")
    if os.path.exists(metrics_path):
        with open(metrics_path, "r") as f:
            metrics = json.load(f)

        lines.append("Simple linear model (price ~ car_age + kilometer + powerPS):")
        lines.append(f" - R^2: {metrics['r2']:.3f}")
        lines.append(f" - MAE: €{metrics['mae']:.0f}")
        lines.append(f" - Coefficients: {metrics['coefficients']}")
        lines.append(
            f" - Forecast uses median km/year ≈ "
            f"{metrics['typical_km_per_year_for_forecast']:.0f} "
            f"and median powerPS ≈ {metrics['typical_powerPS_for_forecast']:.0f}"
        )
        lines.append("")

    lines.append("Key Takeaways:")
    lines.append("1) Brand, fuel type, and gearbox show clear differences in average price.")
    lines.append("2) Price decreases with car age and higher mileage; higher powerPS increases price.")
    lines.append("3) The correlation heatmap shows price has strong negative correlation "
                 "with age and km, and a positive correlation with powerPS.")
    lines.append("4) The simple linear model captures the main trends; "
                 "the depreciation forecast shows how an average car loses value each year.")
    lines.append("5) For very old ages, the linear model can predict unrealistically low prices, "
                 "which is a normal limitation of linear extrapolation.")

    out_path = os.path.join(OUTPUT_DIR, "summary.txt")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

    print(f"Saved summary -> {out_path}")


# ---------------------------------------------------------------------
# Main pipeline
# ---------------------------------------------------------------------

def main():
    ensure_output_dir()

    print("Loading raw data...")
    df_raw = load_raw_data()
    print("Raw dataset summary:")
    print(summary(df_raw))

    print("\nCleaning...")
    df = clean_autos(df_raw)
    print("Cleaned dataset summary:")
    print(summary(df))

    save_cleaned(df)
    print("\nSaved cleaned dataset.")

    print("\nEDA: saving figures...")
    for c in ["price", "car_age", "kilometer", "powerPS", "km_per_year"]:
        print("Saved", plot_hist(df, c))

    for c in ["brand", "model", "fuelType", "gearbox", "vehicleType"]:
        if c in df.columns:
            print("Saved", plot_top_counts(df, c))

    print("\nCorrelation heatmap...")
    print("Saved", plot_correlation_heatmap(df))

    print("\nBoxplots for categorical analysis...")
    for c in ["fuelType", "gearbox"]:
        if c in df.columns:
            print("Saved", plot_boxplot(df, c))

    print("\nGroup-by averages and CSVs...")
    for by in ["brand", "fuelType", "gearbox", "vehicleType"]:
        if by in df.columns:
            path, agg = grouped_avg(df, by)
            print(f"Saved avg price by {by} -> {path}")

    print("\nScatter plots...")
    for x in ["car_age", "kilometer", "powerPS", "km_per_year"]:
        print("Saved", scatter_plot(df, x))

    print("\nTraining simple linear model and creating depreciation forecast...")
    model, metrics = simple_linear_model(df)
    print("Linear model metrics:", metrics)

    print("\nWriting summary...")
    write_summary()

    print("\nAll done. See the outputs/ folder for figures, CSVs, and summary.")


if __name__ == "__main__":
    main()


Loading raw data...
Raw dataset summary:
Rows: 371,528, Cols: 20
Columns: dateCrawled, name, seller, offerType, price, abtest, vehicleType, yearOfRegistration, gearbox, powerPS, model, kilometer, monthOfRegistration, fuelType, brand, notRepairedDamage, dateCreated, nrOfPictures, postalCode, lastSeen
Missing values:
  notRepairedDamage: 72,060
  vehicleType: 37,869
  fuelType: 33,386
  model: 20,484
  gearbox: 20,209

Cleaning...
Cleaned dataset summary:
Rows: 311,286, Cols: 14
Columns: name, price, vehicleType, yearOfRegistration, gearbox, powerPS, model, kilometer, monthOfRegistration, fuelType, brand, notRepairedDamage, car_age, km_per_year
Missing values: none

Saved cleaned dataset.

EDA: saving figures...
Saved auto/hist_price.png
Saved auto/hist_car_age.png
Saved auto/hist_kilometer.png
Saved auto/hist_powerPS.png
Saved auto/hist_km_per_year.png
Saved auto/top_brand.png
Saved auto/top_model.png
Saved auto/top_fuelType.png
Saved auto/top_gearbox.png
Saved auto/top_vehicleType.png


  plt.boxplot(groups, labels=cats)


Saved auto/boxplot_price_by_fuelType.png


  plt.boxplot(groups, labels=cats)


Saved auto/boxplot_price_by_gearbox.png

Group-by averages and CSVs...
Saved avg price by brand -> auto/avg_price_by_brand.csv
Saved avg price by fuelType -> auto/avg_price_by_fuelType.csv
Saved avg price by gearbox -> auto/avg_price_by_gearbox.csv
Saved avg price by vehicleType -> auto/avg_price_by_vehicleType.csv

Scatter plots...
Saved auto/scatter_price_vs_car_age.png
Saved auto/scatter_price_vs_kilometer.png
Saved auto/scatter_price_vs_powerPS.png
Saved auto/scatter_price_vs_km_per_year.png

Training simple linear model and creating depreciation forecast...
Linear model metrics: {'r2': 0.5605464652161918, 'mae': 3114.8309252496074, 'coefficients': {'car_age': -204.47, 'kilometer': -0.08, 'powerPS': 67.45}, 'intercept': 9728.017090282028, 'typical_km_per_year_for_forecast': 10000.0, 'typical_powerPS_for_forecast': 116.0, 'depreciation_curve_image': 'auto/predicted_depreciation_curve.png', 'depreciation_curve_data': 'auto/depreciation_forecast_data.csv'}

Writing summary...
Saved su