In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## STEP 1 : Data Collection

In [2]:
# Location of the raw House_Price.csv file that is read in the next cell.
# NOTE(review): absolute, machine-specific Windows path - adjust it before running on another machine.
filepath_House_Price = r"C:\Users\aakas\PythonStuff\Regression_udemy\resources\Linear_Regression\House_Price.csv"


In [3]:
# Load the raw dataset; the first row of the CSV supplies the column names.
houses = pd.read_csv(filepath_House_Price, header=0)

In [4]:
houses.head(2)

Unnamed: 0,price,crime_rate,resid_area,air_qual,room_num,age,dist1,dist2,dist3,dist4,teachers,poor_prop,airport,n_hos_beds,n_hot_rooms,waterbody,rainfall,bus_ter,parks
0,24.0,0.00632,32.31,0.538,6.575,65.2,4.35,3.81,4.18,4.01,24.7,4.98,YES,5.48,11.192,River,23,YES,0.049347
1,21.6,0.02731,37.07,0.469,6.421,78.9,4.99,4.7,5.12,5.06,22.2,9.14,NO,7.332,12.1728,Lake,42,YES,0.046146


## STEP 2  : Feature Merging

In [5]:
# Custom Transformer for feature merging

In [6]:
# from sklearn.base import TransformerMixin, BaseEstimator

    class FeatureMerger(BaseEstimator, TransformerMixin):
        """Replace the four distance columns with a single column of their row-wise mean.

        The transformer is stateless: nothing is learned in ``fit``. ``transform``
        expects a pandas DataFrame containing the columns listed in ``in_column``.
        """

        def __init__(self):
            # Columns that are averaged, and the name of the column that holds the result.
            self.in_column = ['dist1', 'dist2', 'dist3', 'dist4']
            self.out_column = 'dist'

        def fit(self, X, y=None):
            """No parameters are estimated; return the transformer itself."""
            # X -> Data Frame
            return self

        def transform(self, X, y=None):
            """Return a new DataFrame with ``out_column`` added and ``in_column`` removed.

            The input frame is left unmodified, so calling ``transform`` twice on the
            same frame (for example when a notebook cell is re-run) gives the same result.
            """
            # Work on a copy: writing into / dropping from X directly would alter the
            # caller's data and make a second call fail on the missing input columns.
            merged = X.copy()
            merged[self.out_column] = merged[self.in_column].mean(axis=1)
            return merged.drop(columns=self.in_column)

        def fit_transform(self, X, y=None):
            """Fit (a no-op) and transform ``X`` in one call."""
            return self.fit(X, y).transform(X, y)

In [7]:
def FeatureMerger(DF , inp_col , out_col):
    """Replace several columns with one column holding their row-wise mean.

    Parameters
    ----------
    DF : pandas.DataFrame
        Input frame. It is not modified.
    inp_col : list of str
        Names of the columns to average and then remove.
    out_col : str
        Name of the column that receives the row-wise mean.

    Returns
    -------
    pandas.DataFrame
        A new frame containing ``out_col`` and without the ``inp_col`` columns.
    """
    # Work on a copy: mutating the argument would silently change the caller's
    # frame and make a second call on the same frame fail with a KeyError.
    merged = DF.copy()
    merged[out_col] = merged[inp_col].mean(axis=1)
    return merged.drop(columns=inp_col)
    

In [8]:
# Columns to be averaged and the name of the merged column.
inpcol = ['dist1', 'dist2', 'dist3', 'dist4']
outcol = 'dist'

# Merge on a copy so that the raw `houses` frame is not touched.
df = FeatureMerger(DF=houses.copy(), inp_col=inpcol, out_col=outcol)

In [9]:
df.head(2)

Unnamed: 0,price,crime_rate,resid_area,air_qual,room_num,age,teachers,poor_prop,airport,n_hos_beds,n_hot_rooms,waterbody,rainfall,bus_ter,parks,dist
0,24.0,0.00632,32.31,0.538,6.575,65.2,24.7,4.98,YES,5.48,11.192,River,23,YES,0.049347,4.0875
1,21.6,0.02731,37.07,0.469,6.421,78.9,22.2,9.14,NO,7.332,12.1728,Lake,42,YES,0.046146,4.9675


In [10]:
df.columns

Index(['price', 'crime_rate', 'resid_area', 'air_qual', 'room_num', 'age',
       'teachers', 'poor_prop', 'airport', 'n_hos_beds', 'n_hot_rooms',
       'waterbody', 'rainfall', 'bus_ter', 'parks', 'dist'],
      dtype='object')

## STEP 3 : Fetching numerical and categorical columns

In [11]:
from pandas.api.types import is_string_dtype, is_numeric_dtype

# Numeric columns are collected first; a column counts as categorical only
# when it is not numeric and has a string dtype.
num_list = [name for name in df if is_numeric_dtype(df[name])]
cat_list = [
    name
    for name in df
    if not is_numeric_dtype(df[name]) and is_string_dtype(df[name])
]

print('Numerical attributes : ', num_list, '\n')

print('Categorical attributes : ', cat_list)

Numerical attributes :  ['price', 'crime_rate', 'resid_area', 'air_qual', 'room_num', 'age', 'teachers', 'poor_prop', 'n_hos_beds', 'n_hot_rooms', 'rainfall', 'parks', 'dist'] 

Categorical attributes :  ['airport', 'waterbody', 'bus_ter']


## STEP 4 : Features and Target split

In [12]:
# The target is the house price; every other column stays on the feature side.
y = df['price']
X = df.drop(columns=['price'])

## STEP 5 : Train - Test Split

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [15]:
X_train.shape ,y_train.shape

((404, 15), (404,))

In [16]:
X_test.shape , y_test.shape

((102, 15), (102,))

## STEP 6 : Pipeline

In [17]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline # sequential pipeline
from sklearn.compose import ColumnTransformer


In [18]:
df.columns

Index(['price', 'crime_rate', 'resid_area', 'air_qual', 'room_num', 'age',
       'teachers', 'poor_prop', 'airport', 'n_hos_beds', 'n_hot_rooms',
       'waterbody', 'rainfall', 'bus_ter', 'parks', 'dist'],
      dtype='object')

In [19]:
# num_list

In [20]:
# Independent copy, so that later edits to num_feat never alter num_list.
num_feat = list(num_list)
num_feat

['price',
 'crime_rate',
 'resid_area',
 'air_qual',
 'room_num',
 'age',
 'teachers',
 'poor_prop',
 'n_hos_beds',
 'n_hot_rooms',
 'rainfall',
 'parks',
 'dist']

In [21]:
# Numerical columns that must not go through the numeric pipeline.
num_feat_drop = ['price' , 'n_hos_beds', 'n_hot_rooms','rainfall' ]

# Fail loudly on a misspelled name, as list.remove() did.
unknown_cols = [col for col in num_feat_drop if col not in num_list]
if unknown_cols:
    raise ValueError(f"columns not found in num_list: {unknown_cols}")

# Rebuild the list from num_list instead of calling .remove() on num_feat:
# the cell then gives the same result no matter how many times it is run
# (.remove() raised ValueError on the second run).
num_feat = [col for col in num_list if col not in num_feat_drop]
num_feat

['crime_rate',
 'resid_area',
 'air_qual',
 'room_num',
 'age',
 'teachers',
 'poor_prop',
 'parks',
 'dist']

In [22]:
# Independent copy, so that later edits to cat_feat never alter cat_list.
cat_feat = list(cat_list)
cat_feat

['airport', 'waterbody', 'bus_ter']

In [23]:
# Leave 'bus_ter' out of the categorical features. The list is rebuilt from
# cat_list rather than mutated with .remove(), so re-running this cell is safe
# (.remove() raised ValueError once the entry was already gone).
cat_feat = [col for col in cat_list if col != 'bus_ter']

In [24]:
cat_feat

['airport', 'waterbody']

In [25]:
# Columns excluded from the model; they are handed to the "drop" step of the
# ColumnTransformer built further down.
drop_cols = ['bus_ter', 'n_hos_beds', 'n_hot_rooms','rainfall']

In [26]:
X_train.columns

Index(['crime_rate', 'resid_area', 'air_qual', 'room_num', 'age', 'teachers',
       'poor_prop', 'airport', 'n_hos_beds', 'n_hot_rooms', 'waterbody',
       'rainfall', 'bus_ter', 'parks', 'dist'],
      dtype='object')

    num_pipeline = Pipeline([
                            ('Merging' , FeatureMerger()),
                            ("imputer", SimpleImputer(strategy="median")),
                            ("scaler",  StandardScaler())

                            ])

In [27]:
# Numeric branch: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

In [28]:
# One-hot encoder with default settings; it is the only step of the categorical pipeline.
ohe = OneHotEncoder()

In [29]:
# Categorical branch: a single one-hot encoding step.
cat_pipeline = Pipeline(steps=[('Nominal Encoding', ohe)])

### Main-Pipeline

In [30]:
# Route each group of columns to its own preprocessing branch:
# numeric features -> impute + scale, nominal features -> one-hot, the rest -> dropped.
pipeline = ColumnTransformer(transformers=[
    ("num pipeline", num_pipeline, num_feat),
    ("nominal transformer", cat_pipeline, cat_feat),
    ("drop columns", "drop", drop_cols),
])

## STEP 7 : Transforming

In [31]:
# X_train

In [32]:
# Fit the preprocessing on the training features only, then transform them.
X_train_tr = pipeline.fit_transform(X_train)

In [33]:
# Apply the already-fitted preprocessing to the test features (no refitting here).
X_test_tr = pipeline.transform(X_test)

In [34]:
# X_train_tr[0]

In [35]:
# The ColumnTransformer prefixes each output feature with "<transformer name>__".
# Strip that prefix to recover plain column names. Splitting only once keeps a
# feature name that itself contains "__" intact.
out_cols = [
    name.split("__", 1)[-1]
    for name in pipeline.get_feature_names_out()
]
print(out_cols)

['crime_rate', 'resid_area', 'air_qual', 'room_num', 'age', 'teachers', 'poor_prop', 'parks', 'dist', 'airport_NO', 'airport_YES', 'waterbody_Lake', 'waterbody_Lake and River', 'waterbody_None', 'waterbody_River']


In [36]:
len(out_cols)

15

In [37]:
# X_train_tr[0]

In [38]:
# Wrap both transformed outputs in DataFrames labelled with the recovered column names.
X_train_tr_df, X_test_tr_df = (
    pd.DataFrame(transformed, columns=out_cols)
    for transformed in (X_train_tr, X_test_tr)
)

In [39]:
X_train_tr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404 entries, 0 to 403
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   crime_rate                404 non-null    float64
 1   resid_area                404 non-null    float64
 2   air_qual                  404 non-null    float64
 3   room_num                  404 non-null    float64
 4   age                       404 non-null    float64
 5   teachers                  404 non-null    float64
 6   poor_prop                 404 non-null    float64
 7   parks                     404 non-null    float64
 8   dist                      404 non-null    float64
 9   airport_NO                404 non-null    float64
 10  airport_YES               404 non-null    float64
 11  waterbody_Lake            404 non-null    float64
 12  waterbody_Lake and River  404 non-null    float64
 13  waterbody_None            404 non-null    float64
 14  waterbody_

In [40]:
X_test_tr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102 entries, 0 to 101
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   crime_rate                102 non-null    float64
 1   resid_area                102 non-null    float64
 2   air_qual                  102 non-null    float64
 3   room_num                  102 non-null    float64
 4   age                       102 non-null    float64
 5   teachers                  102 non-null    float64
 6   poor_prop                 102 non-null    float64
 7   parks                     102 non-null    float64
 8   dist                      102 non-null    float64
 9   airport_NO                102 non-null    float64
 10  airport_YES               102 non-null    float64
 11  waterbody_Lake            102 non-null    float64
 12  waterbody_Lake and River  102 non-null    float64
 13  waterbody_None            102 non-null    float64
 14  waterbody_

## STEP 8 : Merging the transformed train and test DataFrames

In [41]:
# The datatypes of corresponding columns in the two frames must be the same.
# Stack the train rows above the test rows and assign a fresh 0..n-1 index.
X_df_trans = pd.concat(
    [X_train_tr_df, X_test_tr_df],
    axis=0,
    ignore_index=True,
)

In [42]:
# Stack the targets in the same train-then-test order, again with a fresh 0..n-1 index.
y_comb = pd.concat([y_train, y_test], ignore_index=True)

In [43]:
# y_comb

In [44]:
# Put the target next to the features; pd.concat aligns the rows on the index labels.
df_trans = pd.concat(
    [X_df_trans, y_comb],
    axis=1,
)

In [45]:
df_trans.shape

(506, 16)

In [46]:
df_trans.index

RangeIndex(start=0, stop=506, step=1)

## STEP 9 : Exporting Final Transformed DF

In [47]:
# Exporting Final Transformed DF

In [48]:
# Write the combined frame to a CSV file; the relative path resolves against the current working directory.
# NOTE(review): to_csv writes the index by default, so the file gets an unnamed
# first column - pass index=False if downstream readers do not expect it.
df_trans.to_csv("trans_DF_S4_Part2.csv")