In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## STEP 1 : Data Collection

In [2]:
# Location of the raw housing dataset, relative to the notebook's working directory.
filepath_House_Price = "housing.csv"

In [3]:
# Load the raw dataset; the first line of the file supplies the column names.
houses = pd.read_csv(filepath_House_Price, header=0)

In [4]:
houses.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
houses.shape

(20640, 10)

## STEP 2 : Importing packages

In [6]:
from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import StandardScaler

# from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline                
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

## STEP 3 : Pipeline

In [7]:
# Work on an independent (deep) copy so the raw `houses` frame stays untouched.
df = houses.copy(deep=True)

In [8]:
def num_cat_col_list(DF):
    """Partition the columns of ``DF`` by dtype.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    tuple of (list, list)
        ``(num_list, cat_list)``: the labels of the numeric columns and the
        labels of the string columns, each in the frame's column order.
        Columns that are neither numeric nor string appear in neither list.
    """
    from pandas.api.types import is_numeric_dtype, is_string_dtype

    numeric_cols = [name for name in DF.columns if is_numeric_dtype(DF[name])]

    # A column is only considered for the string list when it is not numeric.
    string_cols = [
        name
        for name in DF.columns
        if not is_numeric_dtype(DF[name]) and is_string_dtype(DF[name])
    ]

    return (numeric_cols, string_cols)

In [9]:
# Split the working frame's columns into numeric and string groups.
num_list, cat_list = num_cat_col_list(DF=df)

# The doubled newline reproduces the blank separator line between the two lists.
print("Numerical column : ", num_list, end="\n\n")
print("Categorical column : ", cat_list)

Numerical column :  ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']

Categorical column :  ['ocean_proximity']


### Building the num_feat and cat_feat lists (the target `median_house_value` is removed from the numeric features)

In [10]:
# Numeric features are every numeric column except the prediction target.
num_feat = list(num_list)
num_feat.remove('median_house_value')

# Categorical features are used as detected.
cat_feat = list(cat_list)

In [11]:
def pipeline_trans(DF, num_feat, cat_feat, target='median_house_value',
                   test_size=0.2, random_state=100):
    """Split ``DF`` into train/test sets and apply the preprocessing pipeline.

    Numeric features are median-imputed and categorical features are one-hot
    encoded. The transformers are fitted on the training split only and then
    applied to the test split.

    Parameters
    ----------
    DF : pandas.DataFrame
        Data containing the feature columns and the ``target`` column.
    num_feat : list
        Column labels that go through median imputation.
    cat_feat : list
        Column labels that are one-hot encoded.
    target : str, default 'median_house_value'
        Label of the dependent-variable column.
    test_size : float, default 0.2
        Passed to ``train_test_split``.
    random_state : int, default 100
        Passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_train_trans, df_test_trans)``: the transformed features with the
        target appended as the last column, each with a fresh 0..n-1 index.
        Columns of ``DF`` listed in neither ``num_feat`` nor ``cat_feat`` are
        not part of the output.
    """
    # Separating independent and dependent variables
    X = DF.drop(columns=[target])
    y = DF[target]

    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state, test_size=test_size
    )

    # Pipeline
    num_pipeline = Pipeline([('Imputer', SimpleImputer(strategy='median'))])

    # handle_unknown='ignore': a category that lands only in the test split is
    # encoded as all zeros instead of making transform() raise.
    cat_pipeline = Pipeline(
        [('Nominal Encoding', OneHotEncoder(handle_unknown='ignore'))]
    )

    # sparse_threshold=0 forces a dense array; otherwise a mostly-zero one-hot
    # block can make the transformer return a sparse matrix, which the
    # DataFrame construction below cannot consume.
    pipeline = ColumnTransformer(
        [("Num pipeline", num_pipeline, num_feat),
         ("Nominal pipeline", cat_pipeline, cat_feat)],
        sparse_threshold=0,
    )

    # Fit on the training split only, then reuse the fitted state for the test split
    X_train_tr = pipeline.fit_transform(X_train)
    X_test_tr = pipeline.transform(X_test)

    # Output columns after transformation: strip only the leading
    # "<transformer name>__" prefix so that source column names which themselves
    # contain "__" are kept intact.
    out_cols = [
        name.split("__", 1)[-1] for name in pipeline.get_feature_names_out()
    ]

    # Conversion to train and test DataFrames
    X_train_tr_df = pd.DataFrame(X_train_tr, columns=out_cols)
    X_test_tr_df = pd.DataFrame(X_test_tr, columns=out_cols)

    # The transformed features carry a fresh RangeIndex, so the targets need the
    # same index for the column-wise concat to line rows up.
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    df_train_trans = pd.concat([X_train_tr_df, y_train], axis=1)
    df_test_trans = pd.concat([X_test_tr_df, y_test], axis=1)

    return df_train_trans, df_test_trans

In [12]:
# Run the split + preprocessing on the working copy of the data.
df_train_tr, df_test_tr = pipeline_trans(df, num_feat, cat_feat)

In [13]:
df_train_tr.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,median_house_value
0,-122.7,39.14,13.0,532.0,111.0,214.0,62.0,3.3929,0.0,1.0,0.0,0.0,0.0,108300.0
1,-120.83,35.32,11.0,3252.0,701.0,1814.0,660.0,3.2226,0.0,0.0,0.0,0.0,1.0,183200.0
2,-122.31,40.49,18.0,4026.0,718.0,1731.0,705.0,3.35,0.0,1.0,0.0,0.0,0.0,118400.0


In [14]:
df_test_tr.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,median_house_value
0,-118.1,33.81,36.0,1111.0,184.0,444.0,177.0,3.7031,1.0,0.0,0.0,0.0,0.0,245300.0
1,-122.27,37.82,52.0,1630.0,456.0,1162.0,400.0,1.2475,0.0,0.0,0.0,1.0,0.0,104200.0
2,-119.12,35.37,13.0,4527.0,713.0,2170.0,671.0,4.8266,0.0,1.0,0.0,0.0,0.0,146200.0


## STEP 4 : Export the DF as a .csv file.

In [15]:
# Export from copies so the transformed frames themselves are left untouched.
D_train, D_test = df_train_tr.copy(), df_test_tr.copy()

In [16]:
# Write both transformed splits next to the notebook.
# NOTE(review): with the default index=True the row index is also written as an
# unnamed first column — confirm downstream readers expect it.
for _frame, _out_path in (
    (D_train, r"S1_Part2_Pipeline_Train_DS.csv"),
    (D_test, r"S1_Part2_Pipeline_Test_DS.csv"),
):
    _frame.to_csv(_out_path)