In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Input CSV for this notebook.
filepath = r"S4_Part2_DataPreprocessing_Trimming1.csv"

# The first row supplies the column names; the first column becomes the row index.
houses = pd.read_csv(filepath, index_col=0, header=0)

In [3]:
from pandas.api.types import is_string_dtype, is_numeric_dtype

# Partition the column labels of `houses` by dtype. The numeric check takes
# precedence, so the string check only applies to non-numeric columns; a column
# that is neither numeric nor string ends up in neither list.
num_list = [name for name in houses if is_numeric_dtype(houses[name])]
cat_list = [
    name
    for name in houses
    if not is_numeric_dtype(houses[name]) and is_string_dtype(houses[name])
]

print('Numerical attributes : ',num_list,'\n')

print('Categorical attributes : ',cat_list)

Numerical attributes :  ['price', 'crime_rate', 'resid_area', 'air_qual', 'room_num', 'age', 'teachers', 'poor_prop', 'dist'] 

Categorical attributes :  ['airport', 'waterbody']


## STEP 1 : Importing packages

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline                
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

## STEP 2 : Features and Target split

In [5]:
# Work on a deep copy so later steps do not modify the loaded `houses` frame.
df = houses.copy(deep=True)

In [6]:
# The target is the 'price' column; every remaining column is a feature.
y = df['price']
X = df.drop(columns=['price'])

## STEP 3 : Train - Test Split

In [7]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [8]:
# Display the shapes of the training features and the training target.
X_train.shape ,y_train.shape

((385, 10), (385,))

In [9]:
# Display the shapes of the test features and the test target.
X_test.shape , y_test.shape

((97, 10), (97,))

## STEP 4 : Building the num_feat and cat_feat lists

In [10]:
# num_list

In [11]:
# Feature-name lists handed to the column transformer. Copies are taken so
# num_list / cat_list keep their original contents.
num_feat = list(num_list)
num_feat.remove('price')  # the target is not a feature; raises ValueError if absent

cat_feat = list(cat_list)

## STEP 5 : Pipeline

### Pipelines :  numerical and categorical features

In [12]:
# One-hot encoder for the nominal features. It is fitted on the training split
# only, so handle_unknown='ignore' makes transform() encode a category that was
# not seen during fit as an all-zero group instead of raising an error.
ohe = OneHotEncoder(handle_unknown='ignore')

In [13]:
# Numeric branch: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(
    steps=[
        ("Imputer", SimpleImputer(strategy="median")),
        ("scaling", StandardScaler()),
    ]
)

In [14]:
# Categorical branch: a single one-hot encoding step that uses the `ohe` encoder.
cat_pipeline = Pipeline(steps=[("Nominal Encoding", ohe)])

### Main-Pipeline

In [15]:
# Route each group of columns through its own sub-pipeline. The transformer
# names given here become the "<name>__" prefix of the output feature names.
pipeline = ColumnTransformer(
    transformers=[
        ("Num pipeline", num_pipeline, num_feat),
        ("Nominal pipeline", cat_pipeline, cat_feat),
    ]
)

### Transforming

In [16]:
# Fit the column transformer on the training split and transform that same split.
X_train_tr = pipeline.fit_transform(X_train)

In [17]:
# Transform the test split with the transformer as fitted on the training split
# (transform only, no refit).
X_test_tr = pipeline.transform(X_test)

## STEP 6 : Create DataFrames from the transformed 2D arrays

In [18]:
# Output feature names of the fitted column transformer; each one is prefixed
# with "<transformer name>__". The bare name on the last line displays the array.
feat = pipeline.get_feature_names_out()
feat

array(['Num pipeline__crime_rate', 'Num pipeline__resid_area',
       'Num pipeline__air_qual', 'Num pipeline__room_num',
       'Num pipeline__age', 'Num pipeline__teachers',
       'Num pipeline__poor_prop', 'Num pipeline__dist',
       'Nominal pipeline__airport_NO', 'Nominal pipeline__airport_YES',
       'Nominal pipeline__waterbody_Lake',
       'Nominal pipeline__waterbody_Lake and River',
       'Nominal pipeline__waterbody_None',
       'Nominal pipeline__waterbody_River'], dtype=object)

In [19]:
# Strip the "<transformer name>__" prefix that the column transformer puts in
# front of every output feature name. Splitting on the first "__" only keeps a
# column whose own name contains "__" intact.
out_cols = [name.split("__", 1)[-1] for name in pipeline.get_feature_names_out()]
print(out_cols)

['crime_rate', 'resid_area', 'air_qual', 'room_num', 'age', 'teachers', 'poor_prop', 'dist', 'airport_NO', 'airport_YES', 'waterbody_Lake', 'waterbody_Lake and River', 'waterbody_None', 'waterbody_River']


In [20]:
# Display the number of output column names.
len(out_cols)

14

In [21]:
# Wrap the transformed arrays in DataFrames with the cleaned column names.
# The original row labels are passed as the index so each frame stays aligned
# with y_train / y_test; without them the rows would be renumbered from 0.
X_train_tr_df = pd.DataFrame(X_train_tr, columns=out_cols, index=X_train.index)

X_test_tr_df = pd.DataFrame(X_test_tr, columns=out_cols, index=X_test.index)

## STEP 7 : Merging the transformed train and test DataFrames

In [22]:
# Stack the train rows on top of the test rows and renumber the index from 0.
# The dtype of each column must be the same in both frames.
X_df_trans = pd.concat(
    [X_train_tr_df, X_test_tr_df],
    axis=0,
    ignore_index=True,
)

In [23]:
# Display the (rows, columns) shape of the combined feature frame.
X_df_trans.shape

(482, 14)

In [24]:
# X_df_trans.head(2)

In [25]:
# Stack the targets in train-then-test order and renumber the index from 0.
y_comb = pd.concat([y_train, y_test], ignore_index=True)

In [26]:
# Display the shape of the combined target.
y_comb.shape

(482,)

In [27]:
# Place the combined features and the combined target side by side (column-wise).
df_trans = pd.concat([X_df_trans, y_comb], axis="columns")

In [28]:
# Display the (rows, columns) shape of the final frame (features plus target).
df_trans.shape

(482, 15)

## STEP 8 :  Exporting Final Transformed DF

In [29]:
# Write the final frame to CSV; the row index is written as the first column.
df_trans.to_csv("S4_Part3_Trimming_DataPreprocessing2.csv", index=True)