In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# CSV consumed by this notebook.
filepath = r"S2_Part2_test_DataPreprocessing_Trimming1.csv"

# Row 0 is the header; the first CSV column is taken as the row index.
houses = pd.read_csv(filepath, index_col=0, header=0)

In [3]:
# Replace the index read from the CSV with a fresh 0..n-1 RangeIndex and
# discard the old one (drop=True) rather than keeping it as a column.
houses = houses.reset_index(drop=True)

In [4]:
# Inspect the index after the reset above.
houses.index

RangeIndex(start=0, stop=17358, step=1)

In [5]:
# Preview the first three rows.
houses.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,median_income,median_house_value,ocean_proximity,bedrooms_per_room,population_per_household
0,-122.24,37.85,52.0,1467.0,496.0,7.2574,352100.0,NEAR BAY,0.129516,2.80226
1,-122.25,37.85,52.0,1274.0,558.0,5.6431,341300.0,NEAR BAY,0.184458,2.547945
2,-122.25,37.85,52.0,1627.0,565.0,3.8462,342200.0,NEAR BAY,0.172096,2.181467


In [6]:
def num_cat_col_list(DF):
    """Split the column labels of ``DF`` into numeric and string-typed groups.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame whose columns are classified by dtype.

    Returns
    -------
    tuple[list, list]
        ``(numeric_labels, string_labels)``, each in the frame's column order.
        A column that is neither numeric nor string-typed appears in neither
        list.
    """
    from pandas.api.types import is_numeric_dtype, is_string_dtype

    numeric_labels, string_labels = [], []

    for label in DF:
        values = DF[label]

        # The numeric check takes precedence, as in an if/elif chain.
        if is_numeric_dtype(values):
            numeric_labels.append(label)
            continue

        if is_string_dtype(values):
            string_labels.append(label)

    return (numeric_labels, string_labels)

In [7]:
# Classify the columns of the loaded frame by dtype and show both groups.
num_list , cat_list = num_cat_col_list(houses)

print("Numerical column : ",num_list)
print("Categorical column : ",cat_list)

Numerical column :  ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'population', 'median_income', 'median_house_value', 'bedrooms_per_room', 'population_per_household']
Categorical column :  ['ocean_proximity']


## STEP 1 : Package importing 

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline                
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

## STEP 2 : Features and Target split

In [9]:
# Work on a copy so the columns added to `df` below do not appear in `houses`.
df = houses.copy()

## STEP 3 : Train - Test Split

In [10]:
# np.ceil(df['median_income']).unique()

In [11]:
# Income bucket: median_income rounded up to the next whole number.
df['MI'] = np.ceil(df['median_income'])

In [12]:
# Check how many rows fall into each income bucket.
df['MI'].value_counts()

4.0    4431
3.0    4306
5.0    3102
2.0    2068
6.0    1836
7.0    1007
8.0     448
1.0     149
9.0      11
Name: MI, dtype: int64

In [13]:
# Merge every bucket above 6.0 into a single 7.0 bucket; buckets up to 6.0
# (and any missing values) are carried over unchanged.
df['MI_final'] = df['MI'].mask(df['MI'] > 6.0, 7.0)

In [14]:
# Bucket sizes after merging the buckets above 6.0 into 7.0.
df['MI_final'].value_counts()

4.0    4431
3.0    4306
5.0    3102
2.0    2068
6.0    1836
7.0    1466
1.0     149
Name: MI_final, dtype: int64

In [15]:
from sklearn.model_selection import StratifiedShuffleSplit

In [16]:
# One stratified shuffle split, 20% held out as test, fixed random_state.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

In [17]:
# StratifiedShuffleSplit yields *positional* row indices, so rows must be
# selected with .iloc. Label-based .loc only returns the same rows while `df`
# carries a default 0..n-1 RangeIndex and would silently pick wrong rows (or
# raise) for any other index.
# n_splits=1, so the generator produces exactly one (train, test) pair.
train_index, test_index = next(split.split(df, df['MI_final']))

# .copy() makes both splits independent frames, so later edits to them never
# touch `df` or trigger chained-assignment warnings.
strat_train_set = df.iloc[train_index].copy()
strat_test_set = df.iloc[test_index].copy()

In [18]:
# (rows, columns) of the training split.
strat_train_set.shape

(13886, 12)

In [19]:
# (rows, columns) of the test split.
strat_test_set.shape

(3472, 12)

In [20]:
# Remove the stratification helper columns from both splits.
# Reassigning (instead of drop(..., inplace=True) on frames sliced out of
# `df`) keeps the cell free of hidden mutation, and errors='ignore' makes it
# safe to re-run: a second execution no longer raises KeyError.
strat_train_set, strat_test_set = (
    frame.drop(columns=['MI', 'MI_final'], errors='ignore')
    for frame in (strat_train_set, strat_test_set)
)

In [21]:
# Features/target for the full (unsplit) data.
# `df` still carries the stratification helpers 'MI' and 'MI_final' (both
# derived from median_income), so drop them together with the target;
# otherwise they end up in X as extra features.
X = df.drop(columns=['median_house_value', 'MI', 'MI_final'])
y = df['median_house_value']

In [22]:
# Columns left in the training split.
strat_train_set.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'population', 'median_income', 'median_house_value', 'ocean_proximity',
       'bedrooms_per_room', 'population_per_household'],
      dtype='object')

In [23]:
# Columns left in the test split.
strat_test_set.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'population', 'median_income', 'median_house_value', 'ocean_proximity',
       'bedrooms_per_room', 'population_per_household'],
      dtype='object')

In [18]:
# Re-check the training split's (rows, columns).
strat_train_set.shape

(13886, 10)

In [19]:
# Re-check the test split's (rows, columns).
strat_test_set.shape

(3472, 10)

In [24]:
# Predictors: every column of each split except the regression target.
X_train = strat_train_set.drop(columns=['median_house_value'])
X_test = strat_test_set.drop(columns=['median_house_value'])

# Target: median_house_value of each split.
y_train = strat_train_set['median_house_value']
y_test = strat_test_set['median_house_value']

In [25]:
# Feature and target shapes of the training split.
X_train.shape , y_train.shape

((13886, 9), (13886,))

In [26]:
# Feature and target shapes of the test split.
X_test.shape , y_test.shape

((3472, 9), (3472,))

## STEP 4 : Making changes to num_feat and cat_feat list

In [10]:
# num_list

In [28]:
# Numeric predictors: the detected numeric columns minus the target.
# Built on a fresh list so `num_list` itself is left untouched.
num_feat = list(num_list)
num_feat.remove('median_house_value')

# Categorical predictors: the detected string columns, as an independent list.
cat_feat = list(cat_list)

## STEP 5 : PipeLine

### Pipelines :  numerical and categorical features

In [29]:
# One-hot encoder with default settings, used by the categorical pipeline.
ohe = OneHotEncoder()

In [30]:
# Numeric branch: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ('Imputer', SimpleImputer(strategy='median')),
    ("scaling", StandardScaler()),
])

In [31]:
# Categorical branch: a single one-hot encoding step (no imputation step).
cat_pipeline = Pipeline([('Nominal Encoding' ,ohe)])

### Main-Pipeline

In [32]:
# Route each column group through its own pipeline; the outputs are
# concatenated in the order listed here (numeric first, then one-hot).
pipeline = ColumnTransformer(transformers=[
    ("Num pipeline", num_pipeline, num_feat),
    ("Nominal pipeline", cat_pipeline, cat_feat),
])

### Transforming

In [33]:
# Fit the transformers on the training features and transform them.
X_train_tr = pipeline.fit_transform(X_train)

In [34]:
# Transform the test features with the already-fitted transformers (no refit).
X_test_tr = pipeline.transform(X_test)

## STEP 6 : Create DF from transformed 2D-array

In [35]:
# Output feature names; each is prefixed with its transformer name and "__".
feat = pipeline.get_feature_names_out()
feat

array(['Num pipeline__longitude', 'Num pipeline__latitude',
       'Num pipeline__housing_median_age', 'Num pipeline__total_rooms',
       'Num pipeline__population', 'Num pipeline__median_income',
       'Num pipeline__bedrooms_per_room',
       'Num pipeline__population_per_household',
       'Nominal pipeline__ocean_proximity_<1H OCEAN',
       'Nominal pipeline__ocean_proximity_INLAND',
       'Nominal pipeline__ocean_proximity_ISLAND',
       'Nominal pipeline__ocean_proximity_NEAR BAY',
       'Nominal pipeline__ocean_proximity_NEAR OCEAN'], dtype=object)

In [36]:
# Strip the "<transformer name>__" prefix that ColumnTransformer prepends to
# every output feature name. Split on the FIRST "__" only, so a source column
# whose own name contains "__" keeps its full name instead of being cut down
# to its last fragment.
out_cols = [col.split("__", 1)[-1] for col in pipeline.get_feature_names_out()]
print(out_cols)

['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'population', 'median_income', 'bedrooms_per_room', 'population_per_household', 'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND', 'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN']


In [37]:
# Number of output feature names.
len(out_cols)

13

In [38]:
# Wrap the transformed arrays in DataFrames labelled with `out_cols`.
# Both frames get a fresh default index; the original row labels are not kept.
# NOTE(review): assumes the ColumnTransformer output is a dense 2-D array —
# confirm it is not returned as a sparse matrix.
X_train_tr_df = pd.DataFrame(X_train_tr, columns= out_cols)

X_test_tr_df = pd.DataFrame(X_test_tr , columns = out_cols)

## STEP 7 :  Merging transformed train and test DF

In [39]:
# Stack the transformed train rows on top of the test rows and renumber the
# index from 0. Corresponding columns in the two frames must share a dtype.

X_df_trans = pd.concat([X_train_tr_df,X_test_tr_df],ignore_index= True ,axis = 0)

In [40]:
# (rows, columns) of the combined transformed features.
X_df_trans.shape

(17358, 13)

In [24]:
# X_df_trans.head(2)

In [41]:
# Stack the targets in the same train-then-test order and renumber from 0.
y_comb = pd.concat([y_train,y_test] ,axis = 0 ,ignore_index= True)

In [42]:
# Length of the combined target.
y_comb.shape

(17358,)

In [43]:
# Attach the target as the last column. Both inputs were rebuilt with
# ignore_index=True in train-then-test order, so they align row by row.
df_trans = pd.concat([X_df_trans,y_comb] ,axis = 1)

In [44]:
# (rows, columns) of the final frame: transformed features plus target.
df_trans.shape

(17358, 14)

## STEP 8 :  Exporting Final Transformed DF

In [45]:
# Write the transformed dataset to disk. `index` is left at its default, so
# the row index is written as the first CSV column.
# NOTE(review): any consumer presumably reads this with index_col=0 — confirm
# before changing the index handling.
df_trans.to_csv("S2_Part3_test_Trimming_DataPreprocessing2.csv")