In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Location of the raw House_Price CSV.
# The HOUSE_PRICE_CSV environment variable, when set, overrides the path so the
# notebook can run on another machine; otherwise the original local path is used.
filepath_House_Price = os.environ.get(
    "HOUSE_PRICE_CSV",
    r"C:\Users\aakas\PythonStuff\Regression_udemy\resources\Linear_Regression\House_Price.csv",
)

# header=0: the first row of the file holds the column names.
houses = pd.read_csv(filepath_House_Price, header=0)

In [3]:
from pandas.api.types import is_string_dtype, is_numeric_dtype

# Partition the column names of `houses` by dtype.
# A column is tested for a numeric dtype first; only non-numeric columns are
# then tested for a string dtype. Columns matching neither end up in no list.
num_list = [name for name in houses if is_numeric_dtype(houses[name])]
cat_list = [
    name
    for name in houses
    if not is_numeric_dtype(houses[name]) and is_string_dtype(houses[name])
]

print('Numerical attributes : ',num_list,'\n')

print('Categorical attributes : ',cat_list)

Numerical attributes :  ['price', 'crime_rate', 'resid_area', 'air_qual', 'room_num', 'age', 'dist1', 'dist2', 'dist3', 'dist4', 'teachers', 'poor_prop', 'n_hos_beds', 'n_hot_rooms', 'rainfall', 'parks'] 

Categorical attributes :  ['airport', 'waterbody', 'bus_ter']


## Importing packages

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline                
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

# 1. Outlier Treatment 

In [5]:
# Columns checked for outliers, grouped by the rule applied to them:
#   nor_col  - normally (or almost normally) distributed -> z-score limits
#   skew_col - skewed distributions                      -> IQR limits
nor_col = ['room_num']
skew_col = [
    'dist1', 'dist2', 'dist3', 'dist4',
    'poor_prop', 'parks',
]

## PART 1 : Trimming

In [6]:
# Work on a copy so the trimming below never touches the loaded `houses` frame.
df_trim = houses.copy()

In [7]:
# Column dtypes and non-null counts before any rows are trimmed.
df_trim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   price        506 non-null    float64
 1   crime_rate   506 non-null    float64
 2   resid_area   506 non-null    float64
 3   air_qual     506 non-null    float64
 4   room_num     506 non-null    float64
 5   age          506 non-null    float64
 6   dist1        506 non-null    float64
 7   dist2        506 non-null    float64
 8   dist3        506 non-null    float64
 9   dist4        506 non-null    float64
 10  teachers     506 non-null    float64
 11  poor_prop    506 non-null    float64
 12  airport      506 non-null    object 
 13  n_hos_beds   498 non-null    float64
 14  n_hot_rooms  506 non-null    float64
 15  waterbody    506 non-null    object 
 16  rainfall     506 non-null    int64  
 17  bus_ter      506 non-null    object 
 18  parks        506 non-null    float64
dtypes: float

In [8]:
# Outlier trimming: drop every row whose value lies outside the accepted range
# of any monitored column.
#
# Columns are processed one after another, and each column's limits are computed
# on the rows that survived the previous columns, so the result depends on the
# order of `nor_col` and `skew_col`.
#
# NOTE: this cell re-assigns `df_trim` from itself; re-running it trims the
# already-trimmed frame again. Re-create `df_trim` from `houses` before re-running.

Z_SCORE_CUTOFF = 3      # z-score rule: limits are mean -/+ 3 standard deviations
IQR_MULTIPLIER = 1.5    # IQR rule: limits are Q1 - 1.5*IQR and Q3 + 1.5*IQR


def _zscore_limits(values, cutoff=Z_SCORE_CUTOFF):
    """Return (lower, upper) limits lying `cutoff` standard deviations from the mean."""
    mean_col = values.mean()
    std_col = values.std()
    return mean_col - cutoff * std_col, mean_col + cutoff * std_col


def _iqr_limits(values, multiplier=IQR_MULTIPLIER):
    """Return (lower, upper) limits lying `multiplier` IQRs beyond the quartiles."""
    percentile25 = values.quantile(0.25)
    percentile75 = values.quantile(0.75)
    iqr = percentile75 - percentile25
    return percentile25 - multiplier * iqr, percentile75 + multiplier * iqr


def _trim_outliers(frame, columns, limits_func):
    """Return `frame` restricted to rows strictly inside the limits of each column.

    `limits_func` receives the current (already partly trimmed) column and
    returns its (lower, upper) limits. Values equal to a limit are dropped,
    because both comparisons are strict.
    """
    for col in columns:
        lower, upper = limits_func(frame[col])
        frame = frame[(frame[col] > lower) & (frame[col] < upper)]
    return frame


# Normally (or almost normally) distributed columns: z-score rule.
df_trim = _trim_outliers(df_trim, nor_col, _zscore_limits)

# Skewed columns: IQR rule.
df_trim = _trim_outliers(df_trim, skew_col, _iqr_limits)

In [9]:
# Rows and columns remaining after trimming.
df_trim.shape

(482, 19)

### Features and Target split

In [10]:
# Independent copy of the trimmed frame, used for the feature/target split.
df1 = df_trim.copy()

In [11]:
# Separate the predictors (every column except the target) from the target.
target_col = 'price'
X = df1.drop(columns=[target_col])
y = df1[target_col]

### Train - Test Split

In [12]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [13]:
# Shapes of the training features and target.
X_train.shape ,y_train.shape

((385, 18), (385,))

In [14]:
# Shapes of the test features and target.
X_test.shape , y_test.shape

((97, 18), (97,))

### Building the `num_feat` and `cat_feat` lists

In [15]:
# num_list

In [16]:
# Feature-name lists for the column transformer. They are copies, so the
# original `num_list` / `cat_list` stay untouched.
num_feat = list(num_list)
num_feat.remove('price')  # the target must not be treated as an input feature

cat_feat = list(cat_list)

In [17]:
# Column dtypes and non-null counts of the trimmed data.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 482 entries, 0 to 505
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   price        482 non-null    float64
 1   crime_rate   482 non-null    float64
 2   resid_area   482 non-null    float64
 3   air_qual     482 non-null    float64
 4   room_num     482 non-null    float64
 5   age          482 non-null    float64
 6   dist1        482 non-null    float64
 7   dist2        482 non-null    float64
 8   dist3        482 non-null    float64
 9   dist4        482 non-null    float64
 10  teachers     482 non-null    float64
 11  poor_prop    482 non-null    float64
 12  airport      482 non-null    object 
 13  n_hos_beds   474 non-null    float64
 14  n_hot_rooms  482 non-null    float64
 15  waterbody    482 non-null    object 
 16  rainfall     482 non-null    int64  
 17  bus_ter      482 non-null    object 
 18  parks        482 non-null    float64
dtypes: float

### Pipelines :  numerical and categorical features

In [18]:
# One-hot encoder for the nominal categorical features.
# handle_unknown='ignore': a category seen only at transform time (for example
# one that occurs only in the test split) is encoded as all zeros instead of
# raising an error.
ohe = OneHotEncoder(handle_unknown='ignore')

In [19]:
# Numerical branch: replace missing values with the column median.
median_imputer = SimpleImputer(strategy='median')
num_pipeline = Pipeline(steps=[('Imputer', median_imputer)])

In [20]:
# Categorical branch: a single step that applies the encoder held in `ohe`.
cat_pipeline = Pipeline(steps=[('Nominal Encoding', ohe)])

### Main-Pipeline

In [21]:
# Route each group of columns through its own sub-pipeline and stack the
# results side by side (numerical columns first, then the encoded ones).
#
# sparse_threshold=0 forces a dense NumPy array as output. With the default
# threshold the stacked result can be a SciPy sparse matrix when the encoded
# block makes the output sparse enough, and the later
# `pd.DataFrame(X_train_tr, columns=...)` call needs a dense 2D array.
pipeline = ColumnTransformer(
    transformers=[
        ("Num pipeline", num_pipeline, num_feat),
        ("Nominal pipeline", cat_pipeline, cat_feat),
    ],
    sparse_threshold=0,
)

### Transforming

In [22]:
# Fit the transformers on the training split only, then transform that split.
X_train_tr = pipeline.fit_transform(X_train)

In [23]:
# Transform the test split with the transformers fitted on the training split (no re-fitting).
X_test_tr = pipeline.transform(X_test)

### Create DF from transformed 2D-array

In [24]:
# Output column names of the fitted transformer; each is prefixed with
# "<transformer name>__" (see the displayed array).
feat = pipeline.get_feature_names_out()
feat

array(['Num pipeline__crime_rate', 'Num pipeline__resid_area',
       'Num pipeline__air_qual', 'Num pipeline__room_num',
       'Num pipeline__age', 'Num pipeline__dist1', 'Num pipeline__dist2',
       'Num pipeline__dist3', 'Num pipeline__dist4',
       'Num pipeline__teachers', 'Num pipeline__poor_prop',
       'Num pipeline__n_hos_beds', 'Num pipeline__n_hot_rooms',
       'Num pipeline__rainfall', 'Num pipeline__parks',
       'Nominal pipeline__airport_NO', 'Nominal pipeline__airport_YES',
       'Nominal pipeline__waterbody_Lake',
       'Nominal pipeline__waterbody_Lake and River',
       'Nominal pipeline__waterbody_None',
       'Nominal pipeline__waterbody_River',
       'Nominal pipeline__bus_ter_YES'], dtype=object)

In [25]:
# Strip the "<transformer name>__" prefix from every output feature name.
# Splitting once (maxsplit=1) removes only that prefix, so an original column
# name that itself contains "__" is kept intact.
out_cols = [
    name.split("__", 1)[-1]
    for name in pipeline.get_feature_names_out()
]
print(out_cols)

['crime_rate', 'resid_area', 'air_qual', 'room_num', 'age', 'dist1', 'dist2', 'dist3', 'dist4', 'teachers', 'poor_prop', 'n_hos_beds', 'n_hot_rooms', 'rainfall', 'parks', 'airport_NO', 'airport_YES', 'waterbody_Lake', 'waterbody_Lake and River', 'waterbody_None', 'waterbody_River', 'bus_ter_YES']


In [26]:
# Number of output columns after imputation and one-hot encoding.
len(out_cols)

22

In [27]:
# Wrap both transformed arrays in DataFrames labelled with the cleaned column names.
X_train_tr_df, X_test_tr_df = (
    pd.DataFrame(data=block, columns=out_cols)
    for block in (X_train_tr, X_test_tr)
)

### Merging transformed train and test DF

In [28]:
# Stack the training rows on top of the test rows and renumber the index.
# The corresponding columns of the two frames should share the same dtype.
X_df_trans = pd.concat(
    objs=[X_train_tr_df, X_test_tr_df],
    axis=0,
    ignore_index=True,
)

In [29]:
# Shape of the recombined (train + test) feature frame.
X_df_trans.shape

(482, 22)

In [30]:
# X_df_trans.head(2)

In [31]:
# Recombine the targets in train-then-test order and renumber the index.
y_comb = pd.concat(objs=[y_train, y_test], axis=0, ignore_index=True)

In [32]:
# Length of the recombined target series.
y_comb.shape

(482,)

In [33]:
# Attach the target as an extra column; axis=1 joins the two objects on their row index.
df1_trans = pd.concat(objs=[X_df_trans, y_comb], axis=1)

In [34]:
# Shape of the final frame (features plus target column).
df1_trans.shape

(482, 23)

### Exporting Final Transformed DF

In [35]:
# Write the transformed data to a CSV in the current working directory.
# NOTE(review): no `index=` argument is passed, so pandas' default also writes the
# row index as an unnamed first column — confirm that downstream readers expect it.
df1_trans.to_csv("trans_DF_S2_Part2.csv")