# Week 5 — ML Pipeline (Team Notebook)
End-to-end ML workflow: preprocessing → modeling → tuning → best accuracy.


In [12]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import joblib
from sklearn.preprocessing import FunctionTransformer
import numpy as np


In [2]:
df = pd.read_csv("final_internship_data.csv")
print(df.shape, "rows, cols")
df.head()


(500000, 26) rows, cols


Unnamed: 0,User ID,User Name,Driver Name,Car Condition,Weather,Traffic Condition,key,fare_amount,pickup_datetime,pickup_longitude,...,month,weekday,year,jfk_dist,ewr_dist,lga_dist,sol_dist,nyc_dist,distance,bearing
0,KHVrEVlD,Kimberly Adams,Amy Butler,Very Good,windy,Congested Traffic,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21,-1.288826,...,6,0,2009,20.26584,55.176046,14.342611,34.543548,27.572573,1.030764,-2.918897
1,lPxIuEri,Justin Tapia,Hannah Zimmerman,Excellent,cloudy,Flow Traffic,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16,-1.291824,...,1,1,2010,44.667679,31.832358,23.130775,15.125872,8.755732,8.450134,-0.375217
2,gsVN8JLS,Elizabeth Lopez,Amanda Jackson,Bad,stormy,Congested Traffic,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00,-1.291242,...,8,3,2011,43.597686,33.712082,19.865289,17.722624,9.847344,1.389525,2.599961
3,9I7kWFgd,Steven Wilson,Amy Horn,Very Good,stormy,Flow Traffic,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42,-1.291319,...,4,5,2012,42.642965,32.556289,21.063132,15.738963,7.703421,2.79927,0.133905
4,8QN5ZaGN,Alexander Andrews,Cassandra Larson,Bad,stormy,Congested Traffic,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00,-1.290987,...,3,1,2010,43.329953,39.406828,15.219339,23.732406,15.600745,1.999157,-0.502703


In [3]:
#for speed training
MAX_ROWS = 10000 
if len(df) > MAX_ROWS:
    df = df.sample(n=MAX_ROWS, random_state=42)
print("Data after sampling:", df.shape)


Data after sampling: (10000, 26)


In [4]:
TARGET=None
def infer_target(df: pd.DataFrame):
    for c in ["target","label","class","y"]:
        if c in df.columns: return c
    for c in df.columns[::-1]:
        if df[c].dtype == 'object' and df[c].nunique(dropna=True) <= 50:
            return c
    for c in df.columns[::-1]:
        if np.issubdtype(df[c].dtype, np.number) and df[c].nunique() <= 10:
            return c
    return None

if TARGET is None:
    TARGET = infer_target(df)
print("TARGET:", TARGET)
assert TARGET is not None, "Please set TARGET column name above."


TARGET: Traffic Condition


In [5]:
# Drop rows where target is NaN (ensures models don't fail on missing labels)
before = len(df)
df = df.dropna(subset=[TARGET])
after = len(df)
print(f"Dropped {before - after} rows with NaN in target.")


Dropped 0 rows with NaN in target.


In [6]:
# Drop rows where target is NaN
before = len(df)
df = df.dropna(subset=[TARGET])
after = len(df)
print(f"Dropped {before - after} rows with NaN in target.")


Dropped 0 rows with NaN in target.


In [7]:
X = df.drop(columns=[TARGET])
y = df[TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,
    stratify=y if y.nunique() < 50 else None
)
X_train.shape, X_test.shape


((8000, 25), (2000, 25))

In [8]:
from sklearn.impute import SimpleImputer

# Identify column types on the training data
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object','category']).columns.tolist()
print("Numeric:", len(num_cols), "| Categorical:", len(cat_cols))

numeric_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False))  # keep sparse compatibility
])

categorical_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse=True))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_tf, num_cols),
    ("cat", categorical_tf, cat_cols)
], remainder="drop")


Numeric: 18 | Categorical: 7


In [9]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

# Split (recompute here to ensure order is correct in notebook)
X = df.drop(columns=[TARGET])
y = df[TARGET]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,
    stratify=y if y.nunique() < 50 else None
)

# Identify columns on training split
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object','category']).columns.tolist()
print("Numeric:", len(num_cols), "| Categorical:", len(cat_cols))

# Robust imputers: constant strategy handles even all-NaN columns
numeric_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=0.0)),
    ("scaler", StandardScaler(with_mean=False))
])

categorical_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="__missing__")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse=True))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_tf, num_cols),
    ("cat", categorical_tf, cat_cols)
], remainder="drop")

# Sanity check: after fitting the preprocessor, transformed X should have no NaN/Inf
Xt = preprocessor.fit_transform(X_train)
import scipy
if scipy.sparse.issparse(Xt):
    # Sparse matrices won't store NaNs; still check data array if CSR/CSC
    pass
else:
    assert np.isfinite(Xt).all(), "Preprocessor produced non-finite values."
print("Preprocessor ready. No NaNs will reach the estimator.")


Numeric: 18 | Categorical: 7
Preprocessor ready. No NaNs will reach the estimator.


In [10]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier

N_JOBS = -1
MODELS = {
    "logreg": (
        LogisticRegression(max_iter=1000, class_weight="balanced"),
        {
            "clf__C": np.logspace(-2, 2, 10),
            "clf__penalty": ["l2"],
            "clf__solver": ["saga"]
        }
    ),
    "rf": (
        RandomForestClassifier(n_estimators=300, class_weight="balanced_subsample", 
                               n_jobs=N_JOBS, random_state=42),
        {
            "clf__n_estimators": [200, 300, 500],
            "clf__max_depth": [None, 10, 20, 40],
            "clf__min_samples_split": [2, 5, 10]
        }
    ),
    "gb": (
        GradientBoostingClassifier(random_state=42),
        {
            "clf__n_estimators": [100, 200],
            "clf__learning_rate": [0.05, 0.1, 0.2],
            "clf__max_depth": [2, 3]
        }
    )

    ,
    "hgb": (
        HistGradientBoostingClassifier(random_state=42),
        {
            "clf__learning_rate": [0.05, 0.1, 0.2],
            "clf__max_depth": [None, 6, 12],
            "clf__max_iter": [100, 200]
        }
    )
}


In [14]:
results = []
best_model = None
best_acc = -1

def to_dense(X):
    return X.toarray() if hasattr(X, "toarray") else X


for name, (estimator, param_grid) in MODELS.items():
    print("\n===", name, "===")

    if name == "hgb":
        pipe = Pipeline([
            ("prep", preprocessor),
            ("densify", FunctionTransformer(to_dense, accept_sparse=True)),
            ("clf", estimator)
        ])
    else:
        pipe = Pipeline([
            ("prep", preprocessor),
            ("clf", estimator)
        ])

    search = RandomizedSearchCV(
        pipe,
        param_distributions=param_grid,
        n_iter=10,      
        scoring="accuracy",
        cv=3,           
        random_state=42,
        n_jobs=N_JOBS,
        verbose=1
    )
    search.fit(X_train, y_train)
    ...


pd.DataFrame(results).sort_values("accuracy", ascending=False)



=== logreg ===
Fitting 3 folds for each of 10 candidates, totalling 30 fits

=== rf ===
Fitting 3 folds for each of 10 candidates, totalling 30 fits

=== gb ===
Fitting 3 folds for each of 10 candidates, totalling 30 fits

=== hgb ===
Fitting 3 folds for each of 10 candidates, totalling 30 fits


KeyError: 'accuracy'

In [15]:
y_pred = best_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="macro")
print(f"\nBEST MODEL ACCURACY: {acc:.4f} | Macro-F1: {f1:.4f}")
print("\nClassification report:\n", classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5,4))
plt.imshow(cm, cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted'); plt.ylabel('True')
for (i,j), v in np.ndenumerate(cm):
    plt.text(j, i, str(v), ha='center', va='center')
plt.tight_layout(); plt.show()


AttributeError: 'NoneType' object has no attribute 'predict'

In [None]:
import joblib
joblib.dump(best_model, f"best_model_{TEAM_NAME}.joblib")
print("Saved:", f"best_model_{TEAM_NAME}.joblib")


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Winner picked from your printed results:
rf_best_params = {
    "n_estimators": 500,
    "min_samples_split": 10,
    "max_depth": None,
    "class_weight": "balanced_subsample",
    "n_jobs": -1,
    "random_state": 42,
}
best_model = Pipeline([
    ("prep", preprocessor),
    ("clf", RandomForestClassifier(**rf_best_params))
])


In [18]:
# Fit once on your 10k sample split
TEAM_NAME='task5_model'
best_model.fit(X_train, y_train)

# Evaluate
from sklearn.metrics import accuracy_score, f1_score, classification_report
y_pred = best_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1  = f1_score(y_test, y_pred, average="macro")
print(f"FINAL (RandomForest)  Accuracy: {acc:.4f} | Macro-F1: {f1:.4f}")
print("\nClassification report:\n", classification_report(y_test, y_pred))

# Save for Django deployment
import joblib
joblib.dump(best_model, f"best_model_{TEAM_NAME}.joblib")
print("Saved:", f"best_model_{TEAM_NAME}.joblib")


FINAL (RandomForest)  Accuracy: 0.3450 | Macro-F1: 0.3385

Classification report:
                    precision    recall  f1-score   support

Congested Traffic       0.34      0.36      0.35       670
    Dense Traffic       0.37      0.44      0.40       675
     Flow Traffic       0.32      0.23      0.26       655

         accuracy                           0.34      2000
        macro avg       0.34      0.34      0.34      2000
     weighted avg       0.34      0.34      0.34      2000

Saved: best_model_task5_model.joblib
