## PREPROCESSING

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
#Loding the data
housing= pd.read_csv("/Users/sumanshrestha/Documents/AI Class Omdena/machine-learning-introduction-makaisuman/data/boston_housing.csv")

In [3]:
#checking the data information
housing.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CRIM       506 non-null    float64
 1   ZN         506 non-null    float64
 2   INDUS      506 non-null    float64
 3   CHAS       506 non-null    int64  
 4   NOX        506 non-null    float64
 5   RM         506 non-null    float64
 6   AGE        506 non-null    float64
 7   DIS        506 non-null    float64
 8   RAD        506 non-null    int64  
 9   TAX        506 non-null    int64  
 10  PTRATIO    506 non-null    float64
 11  LSTAT      506 non-null    float64
 12  MEDV       506 non-null    float64
 13  CAT. MEDV  506 non-null    int64  
dtypes: float64(10), int64(4)
memory usage: 55.5 KB


In [4]:
#Since CAT.MEDV is not required removing it
housing= housing.drop(columns=['CAT. MEDV'])


In [5]:
#Check the missing values
housing.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
LSTAT      0
MEDV       0
dtype: int64

In [6]:
# Handeling the missing values and putting median as missing values
housing.fillna(housing.median(), inplace=True) 

In [7]:
housing.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2


In [8]:
### Handle Outliers using IQR Method ###
Q1 = housing.quantile(0.25)
Q3 = housing.quantile(0.75)
IQR = Q3 - Q1

In [9]:
# Define outlier range
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [10]:
# Remove outliers
housing = housing[~((housing < lower_bound) | (housing > upper_bound)).any(axis=1)]

In [11]:
# Checking the data details and type before endoding
housing.info()

<class 'pandas.core.frame.DataFrame'>
Index: 309 entries, 0 to 505
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     309 non-null    float64
 1   ZN       309 non-null    float64
 2   INDUS    309 non-null    float64
 3   CHAS     309 non-null    int64  
 4   NOX      309 non-null    float64
 5   RM       309 non-null    float64
 6   AGE      309 non-null    float64
 7   DIS      309 non-null    float64
 8   RAD      309 non-null    int64  
 9   TAX      309 non-null    int64  
 10  PTRATIO  309 non-null    float64
 11  LSTAT    309 non-null    float64
 12  MEDV     309 non-null    float64
dtypes: float64(10), int64(3)
memory usage: 33.8 KB


## Encoding

In [12]:
# Converting all the data to float
# we need to convert the CHAS, RAD and TAX

housing["CHAS"] =housing["CHAS"].astype(float)  # for CHAS
housing["RAD"] =housing["RAD"].astype(float) # for RAD
housing["TAX"] =housing["TAX"].astype(float)  #For TAX

In [13]:
# Checking the data details and type after endoding
housing.info()

<class 'pandas.core.frame.DataFrame'>
Index: 309 entries, 0 to 505
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     309 non-null    float64
 1   ZN       309 non-null    float64
 2   INDUS    309 non-null    float64
 3   CHAS     309 non-null    float64
 4   NOX      309 non-null    float64
 5   RM       309 non-null    float64
 6   AGE      309 non-null    float64
 7   DIS      309 non-null    float64
 8   RAD      309 non-null    float64
 9   TAX      309 non-null    float64
 10  PTRATIO  309 non-null    float64
 11  LSTAT    309 non-null    float64
 12  MEDV     309 non-null    float64
dtypes: float64(13)
memory usage: 33.8 KB


##  Normalize/Standardization numerical features.

In [14]:
# Standarizing  the data
scaler = StandardScaler()
scaled_data= scaler.fit_transform(housing)
scaled_df = pd.DataFrame(scaled_data, columns=housing.columns)


In [15]:
# Data after Standardization
scaled_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,-0.608999,1.781364,-1.414075,0.0,-0.117927,1.008847,-0.141641,0.165466,-0.918033,-0.585887,-1.894956,-1.368384,0.66715
1,-0.599834,-0.417714,-0.686066,0.0,-0.767471,0.650602,0.378357,0.665968,-0.787128,-0.93704,-0.549131,-0.632685,0.183893
2,-0.599842,-0.417714,-0.686066,0.0,-0.767471,2.427871,-0.297261,0.665968,-0.787128,-0.93704,-0.549131,-1.536393,2.821672
3,-0.597624,-0.417714,-1.433958,0.0,-0.871021,1.992859,-0.877989,1.290868,-0.656222,-1.067097,-0.064634,-1.72916,2.559908
4,-0.581608,-0.417714,-1.433958,0.0,-0.871021,2.339473,-0.559158,1.290868,-0.656222,-1.067097,-0.064634,-1.306487,3.123708


## Split the data into training and testing sets.

In [16]:
# Split my data into the features (X) and target y
X = scaled_df.drop(columns = ['MEDV'])
y = scaled_df['MEDV']

In [17]:
X

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT
0,-0.608999,1.781364,-1.414075,0.0,-0.117927,1.008847,-0.141641,0.165466,-0.918033,-0.585887,-1.894956,-1.368384
1,-0.599834,-0.417714,-0.686066,0.0,-0.767471,0.650602,0.378357,0.665968,-0.787128,-0.937040,-0.549131,-0.632685
2,-0.599842,-0.417714,-0.686066,0.0,-0.767471,2.427871,-0.297261,0.665968,-0.787128,-0.937040,-0.549131,-1.536393
3,-0.597624,-0.417714,-1.433958,0.0,-0.871021,1.992859,-0.877989,1.290868,-0.656222,-1.067097,-0.064634,-1.729160
4,-0.581608,-0.417714,-1.433958,0.0,-0.871021,2.339473,-0.559158,1.290868,-0.656222,-1.067097,-0.064634,-1.306487
...,...,...,...,...,...,...,...,...,...,...,...,...
304,-0.584411,-0.417714,0.057237,0.0,0.211552,1.050720,0.006387,-0.754052,-0.918033,-0.735452,1.173525,-0.538954
305,-0.591991,-0.417714,0.057237,0.0,0.211552,-0.049604,0.294854,-0.863100,-0.918033,-0.735452,1.173525,-0.643296
306,-0.585228,-0.417714,0.057237,0.0,0.211552,1.941681,0.837626,-0.931576,-0.918033,-0.735452,1.173525,-1.251663
307,-0.563906,-0.417714,0.057237,0.0,0.211552,1.518300,0.773100,-0.805238,-0.918033,-0.735452,1.173525,-1.103108


In [18]:
# Split the data into the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)