In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
from sklearn.preprocessing import StandardScaler, FunctionTransformer,MinMaxScaler

In [3]:
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV

In [4]:
from sklearn.preprocessing import OneHotEncoder

In [5]:
from sklearn.impute import SimpleImputer

In [6]:
from sklearn.compose import ColumnTransformer

In [7]:
from sklearn.pipeline import Pipeline, make_pipeline

In [8]:
# Location of the housing CSV. Set the HOUSING_CSV environment variable to
# point at your own copy of the file; the original author's local path is kept
# only as the fallback so existing runs behave exactly as before.
file_path = os.environ.get('HOUSING_CSV', '/Users/xwyang/Desktop/data/housing.csv')

In [9]:
# Read the CSV at `file_path` into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [10]:
# Keep only the numeric columns of `df` (the text column `ocean_proximity`
# is left out), then preview the first five rows.
housing_num = df.select_dtypes(include=[np.number])
housing_num.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [11]:
from sklearn.compose import make_column_selector,make_column_transformer

In [13]:
# Numeric preprocessing: fill missing values with the column median, then
# standardize each column with StandardScaler.
num_pipeline = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='median')),
        ('std', StandardScaler()),
    ]
)

In [14]:
# Categorical preprocessing: fill missing values with the most frequent
# category, then one-hot encode. make_pipeline names the steps automatically.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

In [15]:
# Route columns to a pipeline by dtype: numeric columns go through
# `num_pipeline`, object (string) columns go through `cat_pipeline`.
# NOTE(review): the 'number' selector also picks up `median_house_value`,
# which is numeric in `df`. If that column is meant to be the prediction
# target, drop it from the input before fitting -- TODO confirm.
processing = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, make_column_selector(dtype_include='number')),
        ('cat', cat_pipeline, make_column_selector(dtype_include='object')),
    ]
)

In [16]:
# Fit the preprocessing on the full DataFrame and transform it in one call.
housing_prepared = processing.fit_transform(X=df)

In [19]:
# Show the first transformed row (sliced, so the result stays two-dimensional).
housing_prepared[0:1]

array([[-1.32783522,  1.05254828,  0.98214266, -0.8048191 , -0.97247648,
        -0.9744286 , -0.97703285,  2.34476576,  2.12963148,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ]])

In [20]:
# Same routing as `processing`, built with make_column_transformer so the
# transformer names do not have to be supplied by hand.
process = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include='number')),
    (cat_pipeline, make_column_selector(dtype_include='object')),
)

In [22]:
# Fit/transform with the second transformer and show its first row, for
# comparison with the first row of `housing_prepared`.
process.fit_transform(X=df)[0:1]

array([[-1.32783522,  1.05254828,  0.98214266, -0.8048191 , -0.97247648,
        -0.9744286 , -0.97703285,  2.34476576,  2.12963148,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ]])

In [25]:
# One-hot encode `ocean_proximity` with pandas and count how often each
# distinct indicator row occurs.
(
    df['ocean_proximity']
    .pipe(pd.get_dummies, dtype='int')
    .value_counts()
)

<1H OCEAN  INLAND  ISLAND  NEAR BAY  NEAR OCEAN
1          0       0       0         0             9136
0          1       0       0         0             6551
           0       0       0         1             2658
                           1         0             2290
                   1       0         0                5
Name: count, dtype: int64

In [26]:
# Row count per `ocean_proximity` category, most frequent first.
df['ocean_proximity'].value_counts(sort=True)

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

You can definitely use `Pipeline` again to create a logical flow that combines your `ColumnTransformer` (which handles preprocessing) with a machine learning model like `LinearRegression`. This is a common practice in machine learning workflows, where you chain together preprocessing steps and a model into a single pipeline.

Here’s how you can do it:

```python
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import ColumnTransformer,make_column_selector,make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# ----------------------------------------------------------------------
# Define the numerical pipeline
num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('std', StandardScaler())
])

# (Or use make_pipeline to define the numerical pipeline when you don't care
# about naming the transformers. Pass the steps as separate arguments, not as
# a tuple.)
num_pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())

# Define the categorical pipeline
cat_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# ----------------------------------------------------------------------
# Combine the numerical and categorical pipelines using ColumnTransformer
processing = ColumnTransformer([
    ('num', num_pipeline, make_column_selector(dtype_include='number')),
    ('cat', cat_pipeline, make_column_selector(dtype_include='object'))
])


# (Or combine the numerical and categorical pipelines using
# make_column_transformer. This way, you do not care about naming the
# transformers.)
processing_new = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include='number')),
    (cat_pipeline, make_column_selector(dtype_include='object'))
)

# ----------------------------------------------------------------------
# Now we're ready to apply the ColumnTransformer to the housing data.
# `housing` is assumed to be the raw housing DataFrame (e.g. loaded with
# pd.read_csv); this part also needs `import pandas as pd`.
housing_prepared = processing.fit_transform(housing)
# housing_prepared is a NumPy array, but you can get the column names using this:
processing.get_feature_names_out()
# Note the keyword is `columns` (plural); `column=` raises a TypeError.
df_housing_prepared = pd.DataFrame(housing_prepared, columns=processing.get_feature_names_out(),
                                   index=housing.index)
# ----------------------------------------------------------------------
# Create the final pipeline that includes preprocessing and the model
final_pipeline = Pipeline([
    ('preprocessing', processing),
    ('regressor', LinearRegression())
])

# Example usage:
# Split the raw `housing` DataFrame into features `X` and target `y`. The
# target column must be removed from `X`; otherwise the numeric selector in
# `processing` would treat it as just another feature and leak the label
# into the model.
X = housing.drop(columns='median_house_value')
y = housing['median_house_value']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline (preprocessing is fitted on the training split only)
final_pipeline.fit(X_train, y_train)

# Make predictions (the already-fitted preprocessing is re-applied to X_test)
y_pred = final_pipeline.predict(X_test)
```
------------------------------------------------------------------------------------------------------
### Explanation:
1. **Numerical Pipeline (`num_pipeline`)**:
   - `SimpleImputer` fills missing values with the median.
   - `StandardScaler` standardizes the numerical features.

2. **Categorical Pipeline (`cat_pipeline`)**:
   - `SimpleImputer` fills missing values with the most frequent value.
   - `OneHotEncoder` converts categorical variables into one-hot encoded vectors.

3. **ColumnTransformer (`processing`)**:
   - Applies the numerical pipeline to numerical columns and the categorical pipeline to categorical columns.

4. **Final Pipeline (`final_pipeline`)**:
   - Combines the preprocessing (`processing`) with a `LinearRegression` model.
   - This pipeline can now be used to fit and predict on your data.

### Benefits:
- **Encapsulation**: All preprocessing and modeling steps are encapsulated in a single object.
- **Reproducibility**: The pipeline ensures that the same preprocessing steps are applied during training and prediction.
- **Convenience**: You can use `final_pipeline.fit()` and `final_pipeline.predict()` directly, without manually handling preprocessing.

This approach is highly modular and can be extended to include additional preprocessing steps or different models as needed.




In [33]:
# Wrap the transformed array in a DataFrame, labelling the columns with the
# transformer's output feature names and reusing the row index of `df`.
df_housing_prepared = pd.DataFrame(
    data=housing_prepared,
    index=df.index,
    columns=processing.get_feature_names_out(),
)

In [34]:
# Preview the first five rows of the labelled, preprocessed data.
df_housing_prepared.head(n=5)

Unnamed: 0,num__longitude,num__latitude,num__housing_median_age,num__total_rooms,num__total_bedrooms,num__population,num__households,num__median_income,num__median_house_value,cat__ocean_proximity_<1H OCEAN,cat__ocean_proximity_INLAND,cat__ocean_proximity_ISLAND,cat__ocean_proximity_NEAR BAY,cat__ocean_proximity_NEAR OCEAN
0,-1.327835,1.052548,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766,2.129631,0.0,0.0,0.0,1.0,0.0
1,-1.322844,1.043185,-0.607019,2.04589,1.357143,0.861439,1.669961,2.332238,1.314156,0.0,0.0,0.0,1.0,0.0
2,-1.332827,1.038503,1.856182,-0.535746,-0.827024,-0.820777,-0.843637,1.782699,1.258693,0.0,0.0,0.0,1.0,0.0
3,-1.337818,1.038503,1.856182,-0.624215,-0.719723,-0.766028,-0.733781,0.932968,1.1651,0.0,0.0,0.0,1.0,0.0
4,-1.337818,1.038503,1.856182,-0.462404,-0.612423,-0.759847,-0.629157,-0.012881,1.1729,0.0,0.0,0.0,1.0,0.0
