In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import randint

In [2]:
from sklearn.preprocessing import StandardScaler, FunctionTransformer,MinMaxScaler

In [3]:
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV

In [4]:
from sklearn.preprocessing import OneHotEncoder

In [5]:
from sklearn.impute import SimpleImputer

In [6]:
from sklearn.compose import ColumnTransformer

In [7]:
from sklearn.pipeline import Pipeline, make_pipeline

In [8]:
# Location of the housing CSV. The directory can be overridden with the
# HOUSING_DATA_DIR environment variable so the notebook is not tied to a single
# machine; the default keeps the original location.
DATA_DIR = Path(os.environ.get("HOUSING_DATA_DIR", "/Users/xwyang/Desktop/data"))
file_path = DATA_DIR / "housing.csv"

In [9]:
# Load the raw housing table from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [10]:
# Keep only the numeric columns (the text column ocean_proximity is left out).
housing_num = df.select_dtypes(include=['number'])
housing_num.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [11]:
from sklearn.compose import make_column_selector,make_column_transformer

In [12]:
# Numeric preprocessing: median imputation followed by standardization.
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('std', StandardScaler()),
])

In [13]:
# Categorical preprocessing: most-frequent imputation followed by one-hot encoding.
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_encoder = OneHotEncoder(handle_unknown='ignore')
cat_pipeline = make_pipeline(cat_imputer, cat_encoder)

In [14]:
# Route columns by dtype: numeric columns go through num_pipeline,
# object (string) columns go through cat_pipeline.
numeric_columns = make_column_selector(dtype_include='number')
object_columns = make_column_selector(dtype_include='object')
processing = ColumnTransformer(transformers=[
    ('num', num_pipeline, numeric_columns),
    ('cat', cat_pipeline, object_columns),
])

In [15]:
# Fit the preprocessing on the full frame and transform it in one step.
# NOTE: df still contains median_house_value at this point, so the target column is
# scaled together with the features in this exploratory output.
housing_prepared = processing.fit_transform(df)

In [16]:
# Show the first transformed row (scaled numeric values followed by the one-hot columns).
housing_prepared[:1]

array([[-1.32783522,  1.05254828,  0.98214266, -0.8048191 , -0.97247648,
        -0.9744286 , -0.97703285,  2.34476576,  2.12963148,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ]])

In [17]:
# Same preprocessing as `processing`, built with the helper that names the
# transformers automatically.
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')
process = make_column_transformer(
    (num_pipeline, num_selector),
    (cat_pipeline, cat_selector),
)

In [18]:
# Fit the helper-built transformer on the same frame and show its first row for
# comparison with the housing_prepared preview.
process.fit_transform(df)[:1]

array([[-1.32783522,  1.05254828,  0.98214266, -0.8048191 , -0.97247648,
        -0.9744286 , -0.97703285,  2.34476576,  2.12963148,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ]])

In [19]:
# One-hot encode ocean_proximity with pandas and count each distinct indicator row.
proximity_dummies = pd.get_dummies(df['ocean_proximity'], dtype='int')
proximity_dummies.value_counts()

<1H OCEAN  INLAND  ISLAND  NEAR BAY  NEAR OCEAN
1          0       0       0         0             9136
0          1       0       0         0             6551
           0       0       0         1             2658
                           1         0             2290
                   1       0         0                5
Name: count, dtype: int64

In [20]:
# Category frequencies of ocean_proximity, for comparison with the one-hot counts.
df['ocean_proximity'].value_counts()

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

You can definitely use `Pipeline` again to create a logical flow that combines your `ColumnTransformer` (which handles preprocessing) with a machine learning model such as `LinearRegression`. This is a common practice in machine learning workflows, where you chain together preprocessing steps and a model into a single pipeline.

Here’s how you can do it:

```python
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import ColumnTransformer,make_column_selector,make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# --------------------------------------------------------------------------------------------------------
# Define the numerical pipeline
num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('std', StandardScaler())
])

# (or use make_pipeline to define the numerical pipeline when you don't care about naming the transformers)
# make_pipeline takes the steps as separate positional arguments, not as one tuple.
num_pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())

# Define the categorical pipeline
cat_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# -----------------------------------------------------------------------------------------------------
# Combine the numerical and categorical pipelines using ColumnTransformer
processing = ColumnTransformer([
    ('num', num_pipeline, make_column_selector(dtype_include='number')),
    ('cat', cat_pipeline, make_column_selector(dtype_include='object'))
])


# (or combine the numerical and categorical pipelines using make_column_transformer;
# this way you do not need to name the transformers)
processing_new = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include='number')),
    (cat_pipeline, make_column_selector(dtype_include='object'))
)

# ----------------------------------------------------------------------------------------------------
# Now we're ready to apply the ColumnTransformer to the housing data:
housing_prepared = processing.fit_transform(housing)
# housing_prepared is a NumPy array, but you can get the column names using this:
processing.get_feature_names_out()
# The DataFrame keyword is `columns` (plural).
df_housing_prepared = pd.DataFrame(housing_prepared, columns=processing.get_feature_names_out(),
                                   index=housing.index)
# ---------------------------------------------------------------------------------------------------
# Chain the preprocessing and the estimator into one end-to-end pipeline
pipeline_steps = [
    ('preprocessing', processing),
    ('regressor', LinearRegression()),
]
final_pipeline = Pipeline(steps=pipeline_steps)

# Example usage:
# X is assumed to be a feature DataFrame and y its target; neither is defined in this snippet
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Train preprocessing and model together on the training split
final_pipeline.fit(X_train, y_train)

# Predict for the held-out split
y_pred = final_pipeline.predict(X_test)
```
------------------------------------------------------------------------------------------------------
### Explanation:
1. **Numerical Pipeline (`num_pipeline`)**:
   - `SimpleImputer` fills missing values with the median.
   - `StandardScaler` standardizes the numerical features.

2. **Categorical Pipeline (`cat_pipeline`)**:
   - `SimpleImputer` fills missing values with the most frequent value.
   - `OneHotEncoder` converts categorical variables into one-hot encoded vectors.

3. **ColumnTransformer (`processing`)**:
   - Applies the numerical pipeline to numerical columns and the categorical pipeline to categorical columns.

4. **Final Pipeline (`final_pipeline`)**:
   - Combines the preprocessing (`processing`) with a `LinearRegression` model.
   - This pipeline can now be used to fit and predict on your data.

### Benefits:
- **Encapsulation**: All preprocessing and modeling steps are encapsulated in a single object.
- **Reproducibility**: The pipeline ensures that the same preprocessing steps are applied during training and prediction.
- **Convenience**: You can use `final_pipeline.fit()` and `final_pipeline.predict()` directly, without manually handling preprocessing.

This approach is highly modular and can be extended to include additional preprocessing steps or different models as needed.




In [21]:
# Wrap the transformed array in a DataFrame, using the transformer's output feature
# names as columns and the original row index.
prepared_feature_names = processing.get_feature_names_out()
df_housing_prepared = pd.DataFrame(
    data=housing_prepared,
    index=df.index,
    columns=prepared_feature_names,
)

In [22]:
# Preview the labelled, transformed feature table.
df_housing_prepared.head()

Unnamed: 0,num__longitude,num__latitude,num__housing_median_age,num__total_rooms,num__total_bedrooms,num__population,num__households,num__median_income,num__median_house_value,cat__ocean_proximity_<1H OCEAN,cat__ocean_proximity_INLAND,cat__ocean_proximity_ISLAND,cat__ocean_proximity_NEAR BAY,cat__ocean_proximity_NEAR OCEAN
0,-1.327835,1.052548,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766,2.129631,0.0,0.0,0.0,1.0,0.0
1,-1.322844,1.043185,-0.607019,2.04589,1.357143,0.861439,1.669961,2.332238,1.314156,0.0,0.0,0.0,1.0,0.0
2,-1.332827,1.038503,1.856182,-0.535746,-0.827024,-0.820777,-0.843637,1.782699,1.258693,0.0,0.0,0.0,1.0,0.0
3,-1.337818,1.038503,1.856182,-0.624215,-0.719723,-0.766028,-0.733781,0.932968,1.1651,0.0,0.0,0.0,1.0,0.0
4,-1.337818,1.038503,1.856182,-0.462404,-0.612423,-0.759847,-0.629157,-0.012881,1.1729,0.0,0.0,0.0,1.0,0.0


In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [24]:
# Linear-regression model with the preprocessing as its first step.
lin_reg = make_pipeline(
    processing,
    LinearRegression(),
)

In [25]:
# Bucket median_income into five labelled bins; the result is used as the
# stratification key for the train/test split.
income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
df["income_cat"] = pd.cut(
    df["median_income"],
    bins=income_bin_edges,
    labels=income_bin_labels,
)

In [26]:
# 80/20 split stratified on the income bucket, with a fixed seed for reproducibility.
strat_train_set, strat_test_set = train_test_split(
    df, test_size=0.2, stratify=df["income_cat"], random_state=42)

# income_cat exists only to drive the stratification. Drop it from both splits so the
# helper column does not travel with the frames into feature preparation.
strat_train_set = strat_train_set.drop(columns="income_cat")
strat_test_set = strat_test_set.drop(columns="income_cat")

In [27]:
# Separate the training target from the training features.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns=["median_house_value"])

In [28]:
# Fit preprocessing and linear regression on the training features and labels.
lin_reg.fit(X=housing, y=housing_labels)

In [29]:
# Predict on the same training features the model was fitted on (training-set predictions).
housing_pred = lin_reg.predict(housing)

In [30]:
# First five true training labels, as a plain array.
housing_labels.head(5).to_numpy()

array([ 72100., 279600.,  82700., 112500., 238300.])

In [31]:
# First five predictions, rounded to the nearest hundred for comparison with the labels.
np.round(housing_pred[:5], decimals=-2)

array([ 89000., 305400., 153300., 184300., 246800.])

In [32]:
from sklearn.metrics import mean_squared_error

In [33]:
# Training-set RMSE of the linear model.
# mean_squared_error expects (y_true, y_pred) in that order. The square root is taken
# explicitly instead of passing `squared=False`, which recent scikit-learn releases
# no longer accept.
lin_rmse = np.sqrt(mean_squared_error(housing_labels, housing_pred))

In [34]:
# Display the linear model's training-set RMSE.
lin_rmse

69050.56219504567

In [35]:
# Training-set MSE of the linear model, with arguments in (y_true, y_pred) order.
mean_squared_error(housing_labels, housing_pred)

4767980139.451871

In [36]:
# Decision-tree model with the preprocessing as its first step; seeded for repeatability.
tree_reg = make_pipeline(
    processing,
    DecisionTreeRegressor(random_state=42),
)

In [37]:
# Fit preprocessing and decision tree on the training features and labels.
tree_reg.fit(X=housing, y=housing_labels)

In [38]:
# Decision-tree predictions on the training features it was fitted on.
house_pred = tree_reg.predict(housing)

In [39]:
# Training-set RMSE of the decision tree.
# mean_squared_error expects (y_true, y_pred) in that order. The square root is taken
# explicitly instead of passing `squared=False`, which recent scikit-learn releases
# no longer accept.
tree_rmse = np.sqrt(mean_squared_error(housing_labels, house_pred))

In [40]:
# Display the tree's training-set RMSE. The recorded value is 0.0, which suggests the
# tree reproduces the training labels exactly; this score is measured on the data
# the tree was fitted on.
tree_rmse

0.0

In [41]:
# 10-fold cross-validated RMSE for the tree pipeline. The scorer returns negated
# RMSE values, so the sign is flipped. This rebinds tree_rmse to an array of
# per-fold scores.
tree_neg_scores = cross_val_score(
    tree_reg, housing, housing_labels,
    scoring='neg_root_mean_squared_error', cv=10,
)
tree_rmse = -tree_neg_scores

In [42]:
# Display the per-fold cross-validated RMSE values of the tree.
tree_rmse

array([71152.59187256, 70068.70314956, 64901.96987894, 68783.02168675,
       66944.32525877, 68260.00359756, 71031.16291424, 69086.59778928,
       67252.39727946, 73247.28194339])

In [43]:
# Summary statistics of the tree's per-fold RMSE values.
tree_rmse_summary = pd.Series(tree_rmse).describe()
tree_rmse_summary

count       10.000000
mean     69072.805537
std       2416.695095
min      64901.969879
25%      67504.298859
50%      68934.809738
75%      70790.547973
max      73247.281943
dtype: float64

In [44]:
from sklearn.ensemble import RandomForestRegressor

In [47]:
# Random-forest pipeline scored with 10-fold cross-validation on all available cores.
# The scorer returns negated RMSE values, so the sign is flipped.
forest_reg = make_pipeline(
    processing,
    RandomForestRegressor(random_state=42),
)
forest_neg_scores = cross_val_score(
    forest_reg, housing, housing_labels,
    scoring='neg_root_mean_squared_error', cv=10, n_jobs=-1,
)
forest_rmse = -forest_neg_scores

In [48]:
# Summary statistics of the forest's per-fold RMSE values.
forest_rmse_summary = pd.Series(forest_rmse).describe()
forest_rmse_summary

count       10.000000
mean     49421.728261
std       2226.040416
min      45620.574144
25%      47823.313566
50%      49340.841859
75%      50603.884558
max      53028.734995
dtype: float64

In [49]:
# Pipeline with explicit step names so hyperparameters can be addressed as
# 'random_forest__<param>' in the searches.
full_pipeline = Pipeline(steps=[
    ('process', processing),
    ('random_forest', RandomForestRegressor(random_state=42)),
])

In [50]:
# Exhaustive search over 3 x 4 = 12 forest settings, scored by negated RMSE with 3-fold CV.
params = {
    'random_forest__max_features': [4, 6, 8],
    'random_forest__n_estimators': [100, 200, 300, 400],
}
grid_reg = GridSearchCV(
    estimator=full_pipeline,
    param_grid=params,
    scoring='neg_root_mean_squared_error',
    cv=3,
)

In [51]:
# Run the grid search on the training data.
grid_reg.fit(X=housing, y=housing_labels)

In [52]:
# Display the best pipeline found by the grid search.
grid_reg.best_estimator_

In [53]:
# Hyperparameter combination with the best mean cross-validation score.
grid_reg.best_params_

{'random_forest__max_features': 6, 'random_forest__n_estimators': 400}

In [54]:
# Best mean cross-validation score; it is negative because the scorer is the negated RMSE.
grid_reg.best_score_

-49571.387963150664

In [55]:
# Tabulate the grid-search results for inspection.
cv_res = pd.DataFrame(data=grid_reg.cv_results_)

In [56]:
# Preview the first grid-search result rows (in the order the candidates were evaluated).
cv_res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_random_forest__max_features,param_random_forest__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,4.083745,0.142964,0.156906,0.002269,4,100,"{'random_forest__max_features': 4, 'random_for...",-49247.460732,-50569.309058,-51100.995638,-50305.921809,779.284948,12
1,9.412016,1.417282,0.302701,0.000119,4,200,"{'random_forest__max_features': 4, 'random_for...",-48905.381675,-50283.354253,-50818.876765,-50002.537564,806.0229,11
2,13.151227,1.199525,0.49331,0.012217,4,300,"{'random_forest__max_features': 4, 'random_for...",-48802.54073,-50310.719366,-50857.718972,-49990.326356,869.071597,9
3,16.42923,0.597792,0.609669,0.018077,4,400,"{'random_forest__max_features': 4, 'random_for...",-48773.566782,-50279.652374,-50822.137954,-49958.45237,866.617195,7
4,5.653119,0.196702,0.153174,0.00097,6,100,"{'random_forest__max_features': 6, 'random_for...",-48945.698746,-50527.968024,-50516.984537,-49996.883769,743.313583,10


In [58]:
# Rank candidates by mean test score. Scores are negated RMSE, so the highest
# value is the best candidate.
cv_res_ranked = cv_res.sort_values(by='mean_test_score', ascending=False)
cv_res_ranked.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_random_forest__max_features,param_random_forest__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
7,24.831641,2.644671,0.611023,0.021712,6,400,"{'random_forest__max_features': 6, 'random_for...",-48548.73052,-49886.256948,-50279.176422,-49571.387963,740.705795,1
6,18.937341,2.013323,0.47698,0.043338,6,300,"{'random_forest__max_features': 6, 'random_for...",-48616.53996,-49843.44681,-50490.18094,-49650.055903,777.03825,2
5,12.324261,0.541031,0.305363,0.012755,6,200,"{'random_forest__max_features': 6, 'random_for...",-48516.579322,-49996.134862,-50439.812565,-49650.842249,822.243584,3
11,28.402927,0.84811,0.585569,0.002596,8,400,"{'random_forest__max_features': 8, 'random_for...",-48839.728006,-49783.195362,-50473.751556,-49698.891641,669.745513,4
10,21.859124,1.231012,0.443412,0.001355,8,300,"{'random_forest__max_features': 8, 'random_for...",-48856.984536,-49958.561312,-50376.004196,-49730.516681,640.759158,5


In [59]:
from sklearn.model_selection import RandomizedSearchCV

In [64]:
# Randomized search draws each candidate from distributions instead of a fixed grid.
# Passing the 12-point `params` grid with n_iter=10 would mostly repeat the grid
# search, so integer ranges are sampled here instead.
param_distribs = {
    'random_forest__max_features': randint(low=2, high=10),     # draws 2..9
    'random_forest__n_estimators': randint(low=100, high=401),  # draws 100..400
}
random_reg = RandomizedSearchCV(full_pipeline, param_distribs, n_iter=10, cv=3,
                                scoring='neg_root_mean_squared_error', random_state=42)

In [None]:
# Run the randomized search on the training data.
random_reg.fit(housing,housing_labels)