In [1]:
import pandas as pd
import numpy as np

# ========== NaN ==========

In [2]:
# Load the sample whose NumRooms / Area columns contain missing values (NaN).
data_nan = pd.read_csv('house_price_NaN.csv')

# A bare name as the last expression renders the frame as the cell output.
data_nan

Unnamed: 0,NumRooms,Area,SalePrice
0,4.0,,1114
1,4.0,110.0,1088
2,4.0,117.0,1462
3,3.0,93.0,123
4,,92.0,1378
5,3.0,,726
6,6.0,96.0,1649


## Check NaN

In [3]:
# A non-null count lower than the number of entries flags a column with missing values.
data_nan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   NumRooms   6 non-null      float64
 1   Area       5 non-null      float64
 2   SalePrice  7 non-null      int64  
dtypes: float64(2), int64(1)
memory usage: 296.0 bytes


## Listwise deletion

In [4]:
# Listwise deletion: discard every row in which at least one value is missing.
data = data_nan.dropna(axis='index', how='any')

In [5]:
# Only complete rows remain; they keep their original index labels (no re-numbering).
data

Unnamed: 0,NumRooms,Area,SalePrice
1,4.0,110.0,1088
2,4.0,117.0,1462
3,3.0,93.0,123
6,6.0,96.0,1649


# ========== Outlier ==========

In [6]:
# Load the sample that contains implausible values (negative and extremely large entries).
data_outlier = pd.read_csv('house_price_outlier.csv')

# Display the raw frame so the suspicious rows can be inspected.
data_outlier

Unnamed: 0,NumRooms,Area,SalePrice
0,-300,-100,560
1,4,107,1388
2,3,105,1013
3,5,114,1811
4,100000,100,1344
5,3,900000,1055
6,3,105,820


## Check Outliers

In [7]:
# Summary statistics: min/max expose the extreme values, and mean/std are
# dominated by them while the quartiles stay in a plausible range.
data_outlier.describe()

Unnamed: 0,NumRooms,Area,SalePrice
count,7.0,7.0,7.0
mean,14245.428571,128633.0,1141.571429
std,37814.38091,340140.883966,411.399586
min,-300.0,-100.0,560.0
25%,3.0,102.5,916.5
50%,3.0,105.0,1055.0
75%,4.5,110.5,1366.0
max,100000.0,900000.0,1811.0


## Remove Outliers

In [8]:
# Keep only rows whose values fall strictly inside the accepted ranges:
# 0 < NumRooms < 10 and 0 < Area < 1000.
num_rooms = data_outlier['NumRooms']
area = data_outlier['Area']

rooms_in_range = (num_rooms > 0) & (num_rooms < 10)
area_in_range = (area > 0) & (area < 1000)

_filter = rooms_in_range & area_in_range
data = data_outlier.loc[_filter]

In [9]:
# Rows that violated either range are gone; the remaining rows keep their original index labels.
data

Unnamed: 0,NumRooms,Area,SalePrice
1,4,107,1388
2,3,105,1013
3,5,114,1811
6,3,105,820


# ========== Ordinal Encoding ==========

In [10]:
# Load the categorical sample with two ordered columns: grade and medal.
data = pd.read_csv('ordinal_data.csv')

# Display the raw string categories before encoding.
data

Unnamed: 0,grade,medal
0,B,gold
1,A,gold
2,B,silver
3,D,bronze
4,F,bronze
5,C,silver


In [11]:
# Remember the column labels so they can be re-attached to the encoded result.
feature_name = data.columns

In [12]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit rank order for each column, lowest rank first; a value's position
# in its list is the number it is encoded to. The list order must match the
# column order of `data` (grade, then medal).
grade_order = np.array(['F', 'D', 'C', 'B', 'A'])
medal_order = np.array(['bronze', 'silver', 'gold'])
categories = [grade_order, medal_order]

ordinal_encoder = OrdinalEncoder(categories=categories)

# Encode, then wrap the resulting array back into a labelled frame.
data_transformed = pd.DataFrame(
    ordinal_encoder.fit_transform(data),
    columns=feature_name,
)

In [13]:
# Each category is now a float equal to its position in the supplied order (e.g. F -> 0.0, A -> 4.0).
data_transformed

Unnamed: 0,grade,medal
0,3.0,2.0
1,4.0,2.0
2,3.0,1.0
3,1.0,0.0
4,0.0,0.0
5,2.0,1.0


In [14]:
# Category order used by the fitted encoder, one array per input column.
ordinal_encoder.categories_

[array(['F', 'D', 'C', 'B', 'A'], dtype=object),
 array(['bronze', 'silver', 'gold'], dtype=object)]

# ========== One Hot Encoding ==========

In [15]:
# Load the categorical sample with two unordered columns: sex and transport.
data = pd.read_csv('nominal_data.csv')

# Display the raw string categories before encoding.
data

Unnamed: 0,sex,transport
0,male,bus
1,female,train
2,female,car
3,male,train
4,female,bus
5,male,bus


In [16]:
from sklearn.preprocessing import OneHotEncoder

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword; try the current name first and fall back for older installations
# (an unknown keyword raises TypeError in the constructor).
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Dense array with one 0/1 indicator column per category of each input column.
data_transformed = one_hot_encoder.fit_transform(data)

# Derive the column labels from the fitted encoder instead of hard-coding them:
# the output columns follow `categories_` (input column by input column), so the
# labels cannot drift out of sync with the data.
# NOTE: bare category names are used to keep the original labels; if two input
# columns could ever share a category value, switch to
# one_hot_encoder.get_feature_names_out() to get unambiguous labels.
data_transformed = pd.DataFrame(data_transformed,
                                columns=np.concatenate(one_hot_encoder.categories_))

In [17]:
# Every row holds exactly one 1.0 among the sex columns and one among the transport columns.
data_transformed

Unnamed: 0,female,male,bus,car,train
0,0.0,1.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,1.0
4,1.0,0.0,1.0,0.0,0.0
5,0.0,1.0,1.0,0.0,0.0


In [18]:
# Categories discovered during fit, one array per input column, in output-column order.
one_hot_encoder.categories_

[array(['female', 'male'], dtype=object),
 array(['bus', 'car', 'train'], dtype=object)]

# ========== Standardization ==========

In [19]:
# Load the clean numeric sample (NumRooms, Area) used to demonstrate standardization.
data = pd.read_csv('house_price.csv')

# Display the unscaled values for reference.
data

Unnamed: 0,NumRooms,Area
0,4,99
1,4,110
2,4,117
3,3,93
4,5,92
5,3,99
6,6,96


In [20]:
# Remember the column labels so they can be re-attached to the scaled result.
feature_name = data.columns

In [21]:
from sklearn.preprocessing import StandardScaler

# Learn each column's mean and variance from `data` and standardize it in one call,
# then wrap the resulting array back into a labelled frame.
scaler = StandardScaler()
data_scaled = pd.DataFrame(
    scaler.fit_transform(data),
    columns=feature_name,
)

In [22]:
# Standardized values: each entry is (x - column mean) / column std.
data_scaled

Unnamed: 0,NumRooms,Area
0,-0.144338,-0.216546
1,-0.144338,1.066075
2,-0.144338,1.882288
3,-1.154701,-0.916158
4,0.866025,-1.03276
5,-1.154701,-0.216546
6,1.876388,-0.566352


In [23]:
# Column means should be zero; values around 1e-16 are floating-point rounding noise.
data_scaled.mean()

NumRooms   -3.806479e-16
Area       -4.758099e-16
dtype: float64

In [24]:
# pandas' std() defaults to the sample estimator (ddof=1), whereas the scaler
# divided by the population std (ddof=0). With 7 rows this yields
# sqrt(7/6), about 1.0801, instead of exactly 1; data_scaled.std(ddof=0) would give 1.
data_scaled.std()

NumRooms    1.080123
Area        1.080123
dtype: float64

In [25]:
# Report the per-column statistics learned by the fitted scaler, one labelled line each.
fitted_stats = (
    ('mean =', scaler.mean_),
    ('var =', scaler.var_),
    ('std =', np.sqrt(scaler.var_)),
)
for label, values in fitted_stats:
    print(label, values)

mean = [  4.14285714 100.85714286]
var = [ 0.97959184 73.55102041]
std = [0.98974332 8.57618915]


# ========== Min-Max Scaling ==========

In [26]:
# Reload the clean numeric sample (NumRooms, Area) so this section starts from unscaled data.
data = pd.read_csv('house_price.csv')

# Display the unscaled values for reference.
data

Unnamed: 0,NumRooms,Area
0,4,99
1,4,110
2,4,117
3,3,93
4,5,92
5,3,99
6,6,96


In [27]:
# Remember the column labels so they can be re-attached to the scaled result.
feature_name = data.columns

In [28]:
from sklearn.preprocessing import MinMaxScaler

# Learn each column's min and max from `data` and rescale it in one call,
# then wrap the resulting array back into a labelled frame.
scaler = MinMaxScaler()
data_scaled = pd.DataFrame(
    scaler.fit_transform(data),
    columns=feature_name,
)

In [29]:
# Rescaled values: each entry is (x - column min) / (column max - column min).
data_scaled

Unnamed: 0,NumRooms,Area
0,0.333333,0.28
1,0.333333,0.72
2,0.333333,1.0
3,0.0,0.04
4,0.666667,0.0
5,0.0,0.28
6,1.0,0.16


In [30]:
# Sanity check: after min-max scaling every column's smallest value is 0.
data_scaled.min()

NumRooms    0.0
Area        0.0
dtype: float64

In [31]:
# Sanity check: after min-max scaling every column's largest value is 1.
data_scaled.max()

NumRooms    1.0
Area        1.0
dtype: float64

In [32]:
# Report the per-column minimum and maximum seen by the fitted scaler.
for label, values in (('min =', scaler.data_min_), ('max =', scaler.data_max_)):
    print(label, values)

min = [ 3. 92.]
max = [  6. 117.]
