In [1]:
# Isme jo code hai ousko vs code me likha hai taki baad me ousko github pr push kr sake aur live kr sake  

# Data Preprocessing - Final Pipeline

In this section, we will consolidate everything we’ve done so far into one final
script using Scikit-Learn pipelines. This includes

1. <b>Creating</b> a stratified test set
2. <b>Handling</b> missing values
3. <b>Encoding</b> categorical variables
4. <b>Scaling</b> numerical features
5. <b>Combining</b> everything using Pipeline and ColumnTransformer

This will ensure <b>clean, modular, and reproducible code</b> — perfect for production
and education

# Final Preprocessing Code using Scikit-Learn Pipelines

In [2]:
import pandas as pd                              # Import Pandas for data handling
import numpy as np                               # Import NumPy for numerical operations

from sklearn.model_selection import StratifiedShuffleSplit  # For stratified train-test split
from sklearn.pipeline import Pipeline            # To chain preprocessing steps
from sklearn.compose import ColumnTransformer    # To apply pipelines to specific columns
from sklearn.impute import SimpleImputer         # To handle missing values
from sklearn.preprocessing import StandardScaler, OneHotEncoder  # Scaling & encoding tools
# from sklearn.preprocessing import OrdinalEncoder  # Uncomment if ordinal encoding is preferred

# 1. Load the dataset
housing = pd.read_csv("housing.csv")              # Load housing dataset from CSV file

# 2. Create income categories (for stratified sampling)

housing["income_cat"] = pd.cut(                   # Create income categories from median_income
    housing["median_income"],                     # Column to categorize
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],          # Define income ranges
    labels=[1, 2, 3, 4, 5]                         # Assign category labels
)

# 3. Stratified train-test split

split = StratifiedShuffleSplit(                   # Create stratified splitter
    n_splits=1,                                   # Only one split
    test_size=0.2,                                # 20% data for test set
    random_state=42                               # Fix randomness
)

for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index].drop("income_cat", axis=1)  # Training data
    strat_test_set = housing.loc[test_index].drop("income_cat", axis=1)    # Test data

# 4. Work on training data only
housing = strat_train_set.copy()                  # Create a copy to avoid modifying original data

# 5. Separate features and labels
housing_labels = housing["median_house_value"].copy()  # Target variable (labels)
housing = housing.drop("median_house_value", axis=1)  # Remove label from features

# 6. Separate numerical & categorical columns

num_attribs = housing.drop("ocean_proximity", axis=1).columns.tolist()  # List of numerical feature names (Numerical Attributes)
cat_attribs = ["ocean_proximity"]                 # List of categorical feature names (Categorical Attributes)

# 7. Numerical pipeline

num_pipeline = Pipeline([                         # Pipeline for numerical features
    ("imputer", SimpleImputer(strategy="median")),# Fill missing values with median
    ("scaler", StandardScaler()),                 # Standardize features (mean=0, std=1)
])

# 8. Categorical pipeline

cat_pipeline = Pipeline([                         # Pipeline for categorical features
    # ("ordinal", OrdinalEncoder())               # Optional ordinal encoding
    ("onehot", OneHotEncoder(handle_unknown="ignore"))  # One-Hot Encoding
])

# 9. Combine pipelines using ColumnTransformer

full_pipeline = ColumnTransformer([               # Apply different pipelines to columns
    ("num", num_pipeline, num_attribs),            # Numerical columns → num_pipeline
    ("cat", cat_pipeline, cat_attribs),            # Categorical columns → cat_pipeline
])

# 10. Apply full pipeline

housing_prepared = full_pipeline.fit_transform(housing)  # Fit preprocessing steps and transform the data

# 11. Check final data shape
print(housing_prepared.shape)                     # Show shape of prepared dataset


(16512, 13)


# Training and Evaluating ML Models

###### Now that our data is preprocessed, let’s move on to training machine learning models and evaluating their performance. We’ll start with:

- <b>Linear Regression
- <b>Decision Tree Regressor
- <b>Random Forest Regressor

We’ll first test them on the training data and then use <b>cross-validation</b> to get a
better estimate of their true performance

# 1. Train and Test Models on the Training Set

In [3]:
# Imporing ML Models
from sklearn.linear_model import LinearRegression          # Import Linear Regression model
from sklearn.tree import DecisionTreeRegressor             # Import Decision Tree Regressor
from sklearn.ensemble import RandomForestRegressor         # Import Random Forest Regressor
from sklearn.metrics import root_mean_squared_error        # Import metric to evaluate model

# 1. Linear Regression model
lin_reg = LinearRegression()                               # Create Linear Regression object
lin_reg.fit(housing_prepared, housing_labels)              # Train model on prepared data

# 2. Decision Tree model
tree_reg = DecisionTreeRegressor(random_state=42)          # Create Decision Tree with fixed randomness
tree_reg.fit(housing_prepared, housing_labels)             # Train Decision Tree model

# 3. Random Forest model
forest_reg = RandomForestRegressor(random_state=42)        # Create Random Forest model
forest_reg.fit(housing_prepared, housing_labels)           # Train Random Forest model

# 4. Make predictions on training data
lin_preds = lin_reg.predict(housing_prepared)              # Predictions by Linear Regression
tree_preds = tree_reg.predict(housing_prepared)            # Predictions by Decision Tree
forest_preds = forest_reg.predict(housing_prepared)        # Predictions by Random Forest

# 5. Calculate RMSE (Root Mean Squared Error)
lin_rmse = root_mean_squared_error(housing_labels, lin_preds) # RMSE for Linear Regression

tree_rmse = root_mean_squared_error(housing_labels, tree_preds) # RMSE for Decision Tree

forest_rmse = root_mean_squared_error(housing_labels, forest_preds) # RMSE for Random Forest

# 6. Print model performance
print("Linear Regression RMSE:", lin_rmse)                  # Show Linear Regression error
print("Decision Tree RMSE:", tree_rmse)                     # Show Decision Tree error
print("Random Forest RMSE:", forest_rmse)                   # Show Random Forest error


Linear Regression RMSE: 69050.56219504567
Decision Tree RMSE: 0.0
Random Forest RMSE: 18342.366362322846


# <b>NOTE:</b> A Warning About Training RMSE

Training RMSE <b>only shows how well the model fits the training data.</b> It does not
tell us how well it will perform on unseen data. In fact,<b> the Decision Tree and
Random Forest may overfit </b>, leading to very low training error but poor
generalization.

# 2. Cross-Validation: A Better Evaluation Strategy

<b>Cross-validation</b> helps us evaluate how a model generalizes to new data without
needing to touch the test set.

### What is Cross-Validation?

 Instead of training the model once and evaluating on a holdout set,<b> k-fold cross-validation </b> splits the training data into k folds (typically 10), trains the model on k-1 folds, and validates it on the remaining fold. This process repeats k times

- We’ll use cross_val_score from sklearn.model_selection .

## Cross-Validation on Decision Tree

In [4]:
from sklearn.model_selection import cross_val_score      # Import cross-validation utility
import pandas as pd                                      # Import Pandas for data analysis

# 1. Evaluate Decision Tree using Cross-Validation
tree_rmses = -cross_val_score(                           # Perform cross-validation and get RMSE
    tree_reg,                                            # Decision Tree model
    housing_prepared,                                    # Prepared feature data
    housing_labels,                                      # Target values
    scoring="neg_root_mean_squared_error",               # Negative RMSE (scikit-learn convention)
    cv=10                                                # Use 10-fold cross-validation
)

<b> ⚠ Important Note (VERY IMPORTANT) </b>
Scikit-Learn uses **utility scores** where higher is better.
Since RMSE is an error (lower is better), it is returned as **negative**.
We multiply by `-1` to convert it back to positive RMSE.

In [5]:
# 2.Print individual RMSE scores
print("Decision Tree CV RMSEs:", tree_rmses)              # RMSE for each fold

# 3.Show statistical summary of cross-validation results
print("\nCross-Validation Performance (Decision Tree):")
print(pd.Series(tree_rmses).describe())                   # Mean, std, min, max RMSE


Decision Tree CV RMSEs: [71177.6601991  69770.07865373 64770.5639395  68536.60203993
 67057.08155801 68847.12456973 70977.38255647 69208.86346929
 67187.87131535 73280.38732407]

Cross-Validation Performance (Decision Tree):
count       10.000000
mean     69081.361563
std       2420.500173
min      64770.563939
25%      67525.053996
50%      69027.994020
75%      70675.556581
max      73280.387324
dtype: float64


# Check this  ⚠

In [6]:
# 1. Evaluate Random Forest mode using Cross-Validation
forest_rmse = -cross_val_score(                           # Perform cross-validation and get RMSE
    forest_reg,                                            # Decision Tree model
    housing_prepared,                                    # Prepared feature data
    housing_labels,                                      # Target values
    scoring="neg_root_mean_squared_error",               # Negative RMSE (scikit-learn convention)
    cv=10                                                # Use 10-fold cross-validation
)

# 2.Print individual RMSE scores
print("Random Forest CV RMSEs:", tree_rmses)              # RMSE for each fold

# 3.Show statistical summary of cross-validation results
print("\nCross-Validation Performance (Random Forest):")
print(pd.Series(tree_rmses).describe())                   # Mean, std, min, max RMSE


Random Forest CV RMSEs: [71177.6601991  69770.07865373 64770.5639395  68536.60203993
 67057.08155801 68847.12456973 70977.38255647 69208.86346929
 67187.87131535 73280.38732407]

Cross-Validation Performance (Random Forest):
count       10.000000
mean     69081.361563
std       2420.500173
min      64770.563939
25%      67525.053996
50%      69027.994020
75%      70675.556581
max      73280.387324
dtype: float64
