# Machine Learning Essentials — Practical Notebook

Generated: 2025-09-02 02:01 UTC

In [None]:
import numpy as np, pandas as pd, seaborn as sns, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score, confusion_matrix, mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.inspection import permutation_importance
from joblib import dump
# Set seaborn's global plotting theme: 'notebook' context, 'whitegrid' style,
# 'deep' colour palette.
sns.set_theme(context='notebook', style='whitegrid', palette='deep')

## Generate synthetic dataset (toggle task)

In [None]:
from sklearn.datasets import make_classification, make_regression
TASK = 'classification'  # or 'regression'

# Draw the synthetic feature matrix for the selected task. Any TASK value
# other than 'classification' takes the regression branch.
if TASK == 'classification':
    X, y = make_classification(n_samples=1500, n_features=10, n_informative=5, random_state=7)
    TARGET = 'label'
else:
    X, y = make_regression(n_samples=1500, n_features=12, n_informative=6, noise=12.0, random_state=7)
    TARGET = 'target'

# Name the numeric features f0..f{k-1}, then append the target column and a
# random three-level categorical column 'cat' (seeded, so reproducible).
cols = [f'f{i}' for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=cols)
df[TARGET] = y
df['cat'] = np.random.default_rng(7).choice(['A', 'B', 'C'], size=len(df))
# Or load your CSV: df = pd.read_csv('your.csv'); TARGET='your_target'
df.head()

## Preprocessing + model pipeline

In [None]:
# Separate features from the target, then route columns by dtype:
# numeric columns -> median imputation + standard scaling,
# all other columns -> most-frequent imputation + one-hot encoding.
X = df.drop(columns=[TARGET])
y = df[TARGET]
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()
num_pipe = Pipeline([('impute', SimpleImputer(strategy='median')), ('scale', StandardScaler())])
cat_pipe = Pipeline([('impute', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore'))])
pre = ColumnTransformer([('num', num_pipe, num_cols), ('cat', cat_pipe, cat_cols)])

# Hold out 20% for evaluation; stratify on the label only for classification.
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=7, stratify=y if TASK=='classification' else None)
model = RandomForestClassifier(n_estimators=300, random_state=7) if TASK=='classification' else RandomForestRegressor(n_estimators=300, random_state=7)
# Preprocessing lives inside the pipeline so it is fitted on training rows only.
pipe = Pipeline([('pre', pre), ('model', model)])
pipe.fit(Xtr, ytr)
yp = pipe.predict(Xte)

if TASK=='classification':
    yproba = pipe.predict_proba(Xte) if hasattr(pipe['model'], 'predict_proba') else None
    metrics = {'accuracy': accuracy_score(yte, yp), 'balanced_accuracy': balanced_accuracy_score(yte, yp), 'f1_weighted': f1_score(yte, yp, average='weighted')}
    if yproba is not None:
        try:
            if yproba.shape[1] == 2:
                # Binary target: roc_auc_score needs 1-D scores for the
                # positive class (second predict_proba column). Passing the
                # full two-column array raises ValueError. The key name is
                # kept as 'roc_auc_ovr' so existing readers of `metrics`
                # still find it.
                metrics['roc_auc_ovr'] = roc_auc_score(yte, yproba[:, 1])
            else:
                metrics['roc_auc_ovr'] = roc_auc_score(yte, yproba, multi_class='ovr')
        except ValueError as err:
            # Still best-effort (e.g. a class missing from the test split),
            # but say why the metric is absent instead of hiding it.
            print('ROC AUC skipped:', err)
    print(metrics)
    print('Confusion matrix:\n', confusion_matrix(yte, yp))
else:
    # RMSE is computed as sqrt(MSE) rather than via `squared=False`, which is
    # deprecated and removed in recent scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(yte, yp)))
    print({'MAE': mean_absolute_error(yte, yp), 'RMSE': rmse, 'R2': r2_score(yte, yp)})

## Cross-validation & tuning (quick)

In [None]:
# Choose the fold splitter and the scoring string to match the task. Both
# splitters use 5 shuffled folds with a fixed seed.
if TASK == 'classification':
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
    scoring = 'f1_weighted'
else:
    cv = KFold(n_splits=5, shuffle=True, random_state=7)
    scoring = 'neg_root_mean_squared_error'

# Cross-validate the whole pipeline. Note this runs on the full X / y, which
# includes the rows held out as the test split earlier.
scores = cross_val_score(pipe, X, y, cv=cv, scoring=scoring)
print('CV', scoring, '->', scores.mean(), '+/-', scores.std())

# Small grid over the forest's size and depth; the 'model__' prefix addresses
# the 'model' step of the pipeline.
param_grid = {
    'model__n_estimators': [200, 400],
    'model__max_depth': [None, 10, 20],
}
search = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scoring, cv=cv, n_jobs=-1)
search.fit(X, y)
print('Best:', search.best_params_, 'score:', search.best_score_)

## Permutation importance & save model

In [None]:
# Permutation importance on the held-out split, repeated 5 times per column.
r = permutation_importance(pipe, Xte, yte, n_repeats=5, random_state=7)
# The whole pipeline is the estimator, so the columns being permuted are the
# raw input columns of Xte. Label with those names: the transformer's
# get_feature_names_out() lists the one-hot-expanded outputs, whose count
# differs from len(r.importances_mean) and makes the DataFrame constructor fail.
imp = pd.DataFrame({'feature': list(Xte.columns), 'importance': r.importances_mean})
print(imp.sort_values('importance', ascending=False).head(20))

# Persist the fitted pipeline (preprocessing + model) with joblib.
dump(pipe, 'model_quickstart.pkl')
'Saved model_quickstart.pkl'