### Imports + logging

In [15]:
import logging
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, classification_report
from sklearn.model_selection import cross_val_score


from feature_engineering import split_data, build_feature_pipeline, FeatureConfig
logging.basicConfig(level=logging.INFO, format="%(levelname)s | %(name)s | %(message)s")

### Load Ames Housing

In [16]:
AMES_PATH = "data/AmesHousing.csv"  # if you're using your local project folder

df_ames = pd.read_csv(AMES_PATH)
df_ames.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


### Train/test split

In [17]:
TARGET = "SalePrice"

# "Order" and "PID" are identifiers (not useful predictive features)
X_train, X_test, y_train, y_test = split_data(
    df_ames,
    target=TARGET,
    test_size=0.2,
    random_state=42,
    stratify=False,          # regression -> no stratify
    drop_cols=["Order", "PID"]
)

config = FeatureConfig(
    numeric_impute_strategy="median",
    categorical_impute_strategy="most_frequent",
    scale_numeric=True,
    one_hot_encode=True
)

preprocessor = build_feature_pipeline(X_train, config=config)


INFO | feature_engineering | Split data: X_train=(2344, 79), X_test=(586, 79), y_train=(2344,), y_test=(586,)
INFO | feature_engineering | Dected 36 numeric columns, 43 categorical columns


### Build regression model pipeline + evaluate

In [18]:
reg_model = Ridge(alpha=1.0)

reg_pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", reg_model)
])

reg_pipeline.fit(X_train, y_train)
preds = reg_pipeline.predict(X_test)

mae = mean_absolute_error(y_test, preds)
r2 = r2_score(y_test, preds)

print("Ames Housing (Regression)")
print("MAE:", round(mae, 2))
print("R2 :", round(r2, 4))

Ames Housing (Regression)
MAE: 16186.54
R2 : 0.8959


### Load Telco churn

In [19]:
# Place Telco CSV in: data/Telco-Customer-Churn.csv
TELCO_PATH = "data/Telco-Customer-Churn.csv"
df_telco = pd.read_csv(TELCO_PATH)
df_telco.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Clean target + split

In [20]:
# Target is "Churn" with values Yes/No
# Convert to 1/0 for logistic regression
df_telco["Churn"] = df_telco["Churn"].map({"Yes": 1, "No": 0})

# Many versions have "customerID" identifier; drop it if present
drop_cols = ["customerID"] if "customerID" in df_telco.columns else None

X_train, X_test, y_train, y_test = split_data(
    df_telco,
    target="Churn",
    test_size=0.2,
    random_state=42,
    stratify=True,
    drop_cols=drop_cols
)

# Some telco datasets store numeric values as strings (e.g., TotalCharges).
# Convert any "object" columns that are actually numeric.
for col in X_train.columns:
    if X_train[col].dtype == "object":
        try:
            X_train[col] = pd.to_numeric(X_train[col])
            X_test[col] = pd.to_numeric(X_test[col])
        except (ValueError, TypeError):
            # Column is truly categorical; leave as-is
            pass

preprocessor = build_feature_pipeline(X_train, config=config)

INFO | feature_engineering | Split data: X_train=(5634, 19), X_test=(1409, 19), y_train=(5634,), y_test=(1409,)
INFO | feature_engineering | Dected 3 numeric columns, 16 categorical columns


### Build classification pipeline + evaluate

In [21]:
clf_model = LogisticRegression(max_iter=2000)

clf_pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", clf_model)
])

clf_pipeline.fit(X_train, y_train)
preds = clf_pipeline.predict(X_test)

print("Telco Churn (Classification)")
print("Accuracy:", round(accuracy_score(y_test, preds), 4))
print(classification_report(y_test, preds))

Telco Churn (Classification)
Accuracy: 0.7935
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      1035
           1       0.63      0.54      0.58       374

    accuracy                           0.79      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.79      0.79      0.79      1409

