In [1]:
import pandas as pd
import os

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats.mstats import winsorize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn import metrics

In [2]:
# load data
directory = '../data'
file_name = 'boston_housing.csv'
df = pd.read_csv(os.path.join(directory,file_name))

In [3]:
## Outliers handling with Winsorization method.
## This method uses winsorization to handle outliers 
## where the lowest 5% and highest 5% of values are replaced by value at corresponding percentiles (5th and 95th).
def my_outlier_winsorization(df):
    # select numeric columns
    numeric_columns = df.select_dtypes(include=np.number).columns
    for col in numeric_columns:
        df[col] = winsorize(df[col], limits=[0.05, 0.05],inclusive=(True, True), inplace=True)
    return df
df = my_outlier_winsorization(df)

In [4]:
df.describe()

  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,2.800115,11.077075,11.057648,0.06917,0.550513,6.286486,68.899802,3.741604,9.588933,408.897233,18.500988,359.427372,12.486403,22.394466
std,4.677993,22.377477,6.601417,0.253994,0.104906,0.584444,27.495342,1.945698,8.67048,166.460239,2.028046,81.478208,6.589962,8.312777
min,0.02763,0.0,2.18,0.0,0.409,5.304,17.7,1.4608,2.0,222.0,14.7,83.45,3.7,10.2
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,15.8603,80.0,21.89,1.0,0.74,7.61,100.0,7.8278,24.0,666.0,21.0,396.9,26.82,43.5


In [5]:
## Split dataset into Features (X) and target (y)
X = df.drop(columns='medv')
y = df['medv']
## split data into tran-test
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size=0.2, random_state=42)

In [6]:
## perform Normalize/standardize using StandardScaler()
scaler = StandardScaler()

# fit the scaler to the train set, it will learn the parameters
scaler.fit(X_train)

# Transform train and test set
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert the result (NumPy array) back to a DataFrame
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [7]:
# Train the Linear Regression model
## initilize the linear regression model
lr_model = LinearRegression()

## fit the model
lr_model.fit(X_train_scaled_df, y_train)

## Parameters of Linear regression
lr_model.coef_

array([-9.29335474e-01,  4.47306464e-01,  1.17682059e-03,  4.63208477e-01,
       -2.07606314e+00,  3.13284897e+00, -3.02265304e-01, -3.14281496e+00,
        2.30724128e+00, -1.53294869e+00, -1.71203371e+00,  1.04812416e+00,
       -3.18828941e+00])

In [8]:
## Make prediction
y_pred=lr_model.predict(X_test_scaled_df)

In [9]:
r2_score1 = metrics.r2_score(y_test, y_pred)
print(r2_score1)

0.7380846873248351


In [10]:
X.tail()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,2,273,21.0,391.99,9.67
502,0.04527,0.0,11.93,0,0.573,6.12,76.7,2.2875,2,273,21.0,396.9,9.08
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,2,273,21.0,396.9,5.64
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,2,273,21.0,393.45,6.48
505,0.04741,0.0,11.93,0,0.573,6.03,80.8,2.505,2,273,21.0,396.9,7.88


# binarize nox value

In [11]:
## binarize nox vlaue
mean_nox = X['nox'].mean()
X['nox_binary'] = 0
X.loc[X['nox']>mean_nox , 'nox'] = 1

In [12]:
X.drop(columns='nox', inplace=True)

In [13]:
X.head()

Unnamed: 0,crim,zn,indus,chas,rm,age,dis,rad,tax,ptratio,b,lstat,nox_binary
0,0.02763,18.0,2.31,0,6.575,65.2,4.09,2,296,15.3,396.9,4.98,0
1,0.02763,0.0,7.07,0,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,0
2,0.02763,0.0,7.07,0,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,0
3,0.03237,0.0,2.18,0,6.998,45.8,6.0622,3,222,18.7,394.63,3.7,0
4,0.06905,0.0,2.18,0,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,0


In [14]:
## split data into tran-test
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size=0.2, random_state=42)

In [15]:
## perform Normalize/standardize using StandardScaler()
scaler = StandardScaler()

# fit the scaler to the train set, it will learn the parameters
scaler.fit(X_train)

# Transform train and test set
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert the result (NumPy array) back to a DataFrame
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [16]:
# Train the Linear Regression model
## initilize the linear regression model
lr_model = LinearRegression()

## fit the model
lr_model.fit(X_train_scaled_df, y_train)

In [17]:
## Make prediction
y_pred=lr_model.predict(X_test_scaled_df)

In [18]:
r2_score2 = metrics.r2_score(y_test, y_pred)
print(r2_score2)

0.7216603589442565


# binarize lstat value

In [19]:
## binarize nox vlaue
mean_lstat = X['lstat'].mean()
X['lstat_binary'] = 0
X.loc[X['lstat']>mean_lstat , 'lstat'] = 1

In [20]:
mean_lstat

np.float64(12.486403162055337)

In [21]:
X.head()

Unnamed: 0,crim,zn,indus,chas,rm,age,dis,rad,tax,ptratio,b,lstat,nox_binary,lstat_binary
0,0.02763,18.0,2.31,0,6.575,65.2,4.09,2,296,15.3,396.9,4.98,0,0
1,0.02763,0.0,7.07,0,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,0,0
2,0.02763,0.0,7.07,0,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,0,0
3,0.03237,0.0,2.18,0,6.998,45.8,6.0622,3,222,18.7,394.63,3.7,0,0
4,0.06905,0.0,2.18,0,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,0,0


In [22]:
X.drop(columns='lstat', inplace=True)

In [23]:
## split data into tran-test
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size=0.2, random_state=42)

In [24]:
## perform Normalize/standardize using StandardScaler()
scaler = StandardScaler()

# fit the scaler to the train set, it will learn the parameters
scaler.fit(X_train)

# Transform train and test set
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert the result (NumPy array) back to a DataFrame
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [25]:
# Train the Linear Regression model
## initilize the linear regression model
lr_model = LinearRegression()

## fit the model
lr_model.fit(X_train_scaled_df, y_train)

In [26]:
## Make prediction
y_pred=lr_model.predict(X_test_scaled_df)

In [27]:
r2_score3 = metrics.r2_score(y_test, y_pred)
print(r2_score3)

0.6836345129932434


## Comparing R^2 score

In [28]:
print('Original Features R^2 Score: ', r2_score1)
print('R^2 Score with nox feature binarization: ', r2_score2)
print('R^2 Score with nox & lstat features binarization: ', r2_score3)

Original Features R^2 Score:  0.7380846873248351
R^2 Score with nox feature binarization:  0.7216603589442565
R^2 Score with nox & lstat features binarization:  0.6836345129932434


- The new feature obtained by binarization of nox & lstat features makes the model performance worst.