## Data Cleaning - HousePricing.csv

### Importing libraries, loading the data and checking for missing values


In [1]:
import pandas as pd
# Load the raw dataset and report how many of its rows contain a missing value.
df = pd.read_csv('HousePricing.csv')

total_rows = len(df)
# A row is counted once, no matter how many of its cells are NaN.
num_nan_rows = df.isna().any(axis=1).sum()

summary = 'Number of records in total: {}\nNumber of records that has NaN values: {}'
print(summary.format(total_rows, num_nan_rows))

Number of records in total: 545
Number of records that has NaN values: 0


### Dataset overview

In [2]:
# Print the distinct values of every column; the column name is part of each
# header line, so a separate listing of df.columns is not needed.
for col in df.columns:
    print("Unique values in column '{}' are:".format(col))
    print(df[col].unique())

# Preview of the first rows. It is kept as the final statement because a
# notebook cell only renders the value of its last expression; a bare
# expression placed earlier in the cell is evaluated and silently discarded.
df.head()

Unique values in column 'price' are:
[13300000 12250000 12215000 11410000 10850000 10150000  9870000  9800000
  9681000  9310000  9240000  9100000  8960000  8890000  8855000  8750000
  8680000  8645000  8575000  8540000  8463000  8400000  8295000  8190000
  8120000  8080940  8043000  7980000  7962500  7910000  7875000  7840000
  7700000  7560000  7525000  7490000  7455000  7420000  7350000  7343000
  7245000  7210000  7140000  7070000  7035000  7000000  6930000  6895000
  6860000  6790000  6755000  6720000  6685000  6650000  6629000  6615000
  6580000  6510000  6475000  6440000  6419000  6405000  6300000  6293000
  6265000  6230000  6195000  6160000  6125000  6107500  6090000  6083000
  6020000  5950000  5943000  5880000  5873000  5866000  5810000  5803000
  5775000  5740000  5652500  5600000  5565000  5530000  5523000  5495000
  5460000  5425000  5390000  5383000  5320000  5285000  5250000  5243000
  5229000  5215000  5145000  5110000  5075000  5040000  5033000  5005000
  4970000  495

### Encoding non-numerical columns as numerical

In [3]:
# Encode the text-valued columns of df as integers.
yes_no_codes = {'yes': 1, 'no': 0}
furnishing_codes = {'unfurnished': 0, 'semi-furnished': 1, 'furnished': 2}

# Column name -> label-to-code dictionary used for that column.
column_codes = {
    'mainroad': yes_no_codes,
    'guestroom': yes_no_codes,
    'basement': yes_no_codes,
    'hotwaterheating': yes_no_codes,
    'airconditioning': yes_no_codes,
    'prefarea': yes_no_codes,
    'furnishingstatus': furnishing_codes,
}

for column, codes in column_codes.items():
    # Re-running this cell must not destroy the data: once a column holds the
    # integer codes, mapping it through the label dictionary again would turn
    # every entry into NaN. Columns that are already encoded are skipped.
    if df[column].dropna().isin(list(codes.values())).all():
        continue

    encoded = df[column].map(codes)

    # Series.map yields NaN for any label missing from the dictionary. Fail
    # loudly instead of letting new NaNs leak into the cleaned dataset; values
    # that were already missing in the input are left untouched.
    unmapped = encoded.isna() & df[column].notna()
    if unmapped.any():
        raise ValueError(
            "Column '{}' contains unexpected values: {}".format(
                column, df.loc[unmapped, column].unique().tolist()
            )
        )

    df[column] = encoded

df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,2
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,2
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,2
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,2


### Normalize data

In [4]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column of df (price included). fit_transform returns a
# plain array, so the column labels are re-attached when rebuilding the frame.
scaler = MinMaxScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# Add an unscaled, 0-based row identifier as the first column. It is inserted
# after scaling so that it is not normalized, and without touching df. The name
# 'id' is used only as a column label, so the builtin id() is not shadowed.
df_normalized.insert(0, 'id', range(len(df_normalized)))

df_normalized.head(20)

Unnamed: 0,id,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,0,1.0,0.396564,0.6,0.333333,0.666667,1.0,0.0,0.0,0.0,1.0,0.666667,1.0,1.0
1,1,0.909091,0.502405,0.6,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
2,2,0.909091,0.571134,0.4,0.333333,0.333333,1.0,0.0,1.0,0.0,0.0,0.666667,1.0,0.5
3,3,0.906061,0.402062,0.6,0.333333,0.333333,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
4,4,0.836364,0.396564,0.6,0.0,0.333333,1.0,1.0,1.0,0.0,1.0,0.666667,0.0,1.0
5,5,0.787879,0.402062,0.4,0.666667,0.0,1.0,0.0,1.0,0.0,1.0,0.666667,1.0,0.5
6,6,0.727273,0.476289,0.6,0.666667,1.0,1.0,0.0,0.0,0.0,1.0,0.666667,1.0,0.5
7,7,0.727273,1.0,0.8,0.666667,0.333333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,8,0.70303,0.443299,0.6,0.0,0.333333,1.0,1.0,1.0,0.0,1.0,0.666667,1.0,1.0
9,9,0.69697,0.281787,0.4,0.333333,1.0,1.0,1.0,0.0,0.0,1.0,0.333333,1.0,0.0


### Save to CSV

In [5]:
# Write the normalized table to disk; index=False leaves the pandas row index
# out of the file.
output_csv = 'cleaned_data_house.csv'
df_normalized.to_csv(output_csv, index=False)