In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [None]:
# Read the training CSV into a DataFrame.
Dataset = pd.read_csv("/content/sample_data/california_housing_train.csv")

# Feature matrix: every column except the last one, as a NumPy array.
x = Dataset.iloc[:, :-1].to_numpy()

# Dependent variable: the last column only, as a NumPy array.
y = Dataset.iloc[:, -1].to_numpy()

Let’s have a look at our data by executing the code:

In [None]:
# Display the feature matrix x (print writes it to the cell output; it returns nothing).
print(x)

[[-114.31     34.19     15.     ... 1015.      472.        1.4936]
 [-114.47     34.4      19.     ... 1129.      463.        1.82  ]
 [-114.56     33.69     17.     ...  333.      117.        1.6509]
 ...
 [-124.3      41.84     17.     ... 1244.      456.        3.0313]
 [-124.3      41.8      19.     ... 1298.      478.        1.9797]
 [-124.35     40.54     52.     ...  806.      270.        3.0147]]


In [None]:
# SimpleImputer lives in scikit-learn's `impute` module.
from sklearn.impute import SimpleImputer

# Imputer that replaces every NaN with the mean of the column it appears in.
imputa = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit on columns 1 and 2 of x (the slice 1:3) and write the imputed values
# back into the same columns in a single step. `fit_transform` learns the
# per-column means and then fills the missing entries with them.
# NOTE(review): only columns 1-2 are imputed; confirm the remaining columns
# contain no missing values.
x[:, 1:3] = imputa.fit_transform(x[:, 1:3])

Upon executing the code, we obtain a matrix of features with the missing values replaced.

In [None]:
# Display the feature matrix x again, now that the imputation step has run.
print(x)

[[-114.31     34.19     15.     ... 1015.      472.        1.4936]
 [-114.47     34.4      19.     ... 1129.      463.        1.82  ]
 [-114.56     33.69     17.     ...  333.      117.        1.6509]
 ...
 [-124.3      41.84     17.     ... 1244.      456.        3.0313]
 [-124.3      41.8      19.     ... 1298.      478.        1.9797]
 [-124.35     40.54     52.     ...  806.      270.        3.0147]]


Let’s see how one-hot encoding enables us to achieve this by executing the code below:

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 of x and pass every other column through unchanged.
#
# sparse_threshold=0 makes the ColumnTransformer always return a dense array.
# OneHotEncoder produces a sparse matrix by default, and wrapping a sparse
# matrix in np.array() yields a 0-d object array holding the matrix rather
# than a 2-D numeric feature matrix, so x could not be sliced or indexed
# like the array the earlier cells work with.
#
# NOTE(review): column 0 appears to hold continuous numeric values in the
# printed data (e.g. -114.31), so this creates one indicator column per
# distinct value and the dense result can be large -- confirm that encoding
# this column is intended.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
x = np.array(ct.fit_transform(x))

In [None]:
# Display x after the one-hot encoding step; the result is shown below the cell.
print(x)

  (0, 826)	1.0
  (0, 827)	34.19
  (0, 828)	15.0
  (0, 829)	5612.0
  (0, 830)	1283.0
  (0, 831)	1015.0
  (0, 832)	472.0
  (0, 833)	1.4936
  (1, 825)	1.0
  (1, 827)	34.4
  (1, 828)	19.0
  (1, 829)	7650.0
  (1, 830)	1901.0
  (1, 831)	1129.0
  (1, 832)	463.0
  (1, 833)	1.82
  (2, 824)	1.0
  (2, 827)	33.69
  (2, 828)	17.0
  (2, 829)	720.0
  (2, 830)	174.0
  (2, 831)	333.0
  (2, 832)	117.0
  (2, 833)	1.6509
  (3, 823)	1.0
  :	:
  (16996, 833)	2.5179
  (16997, 1)	1.0
  (16997, 827)	41.84
  (16997, 828)	17.0
  (16997, 829)	2677.0
  (16997, 830)	531.0
  (16997, 831)	1244.0
  (16997, 832)	456.0
  (16997, 833)	3.0313
  (16998, 1)	1.0
  (16998, 827)	41.8
  (16998, 828)	19.0
  (16998, 829)	2672.0
  (16998, 830)	552.0
  (16998, 831)	1298.0
  (16998, 832)	478.0
  (16998, 833)	1.9797
  (16999, 0)	1.0
  (16999, 827)	40.54
  (16999, 828)	52.0
  (16999, 829)	1820.0
  (16999, 830)	300.0
  (16999, 831)	806.0
  (16999, 832)	270.0
  (16999, 833)	3.0147


To encode our dependent variable y, let’s run the code below:

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each distinct value of y to an integer class index: first learn the
# set of distinct values, then replace every entry of y with its index.
# NOTE(review): y is the last CSV column; if it is a continuous target,
# label encoding replaces the real values with class indices -- confirm
# this is intended.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [None]:
# Display the label-encoded dependent variable y.
print(y)

[246 376 432 ... 611 433 521]
