# Data Preprocessing

### Reading Dataset

In [1]:
# Build a small artificial housing dataset and persist it in CSV format.
import os

os.makedirs(os.path.join('..', 'data'), exist_ok=True)
data_file = os.path.join('..', 'data', 'house_tiny.csv')

# The first entry is the header row; 'NA' marks a missing value.
csv_lines = (
    'NumRooms,Alley,Price\n',
    'NA,Pave,127500\n',
    '2,NA,106000\n',
    '4,NA,178100\n',
    'NA,NA,140000\n',
)
with open(data_file, 'w') as f:
    for line in csv_lines:
        f.write(line)

In [2]:
# Parse the CSV file created above into a pandas DataFrame and print it.
import pandas as pd

data = pd.read_csv(data_file)
print(data)

   NumRooms Alley   Price
0       NaN  Pave  127500
1       2.0   NaN  106000
2       4.0   NaN  178100
3       NaN   NaN  140000


### Handling Missing Data

In [3]:
# Split the frame into inputs (first two columns) and outputs (last column).
inputs, outputs = data.iloc[:, 0:2], data.iloc[:, 2]
# Replace the NaN entries of numeric columns with the mean of the same column.
# `numeric_only=True` restricts the mean to numeric columns: 'Alley' holds
# strings, and pandas >= 2.0 raises a TypeError when asked to average it.
# NaNs in non-numeric columns are intentionally left in place here.
inputs = inputs.fillna(inputs.mean(numeric_only=True))
print(inputs)

   NumRooms Alley
0       3.0  Pave
1       2.0   NaN
2       4.0   NaN
3       3.0   NaN


In [4]:
# For categorical or discrete columns in inputs, treat NaN as its own category:
# `dummy_na=True` adds a dedicated indicator column for missing values.
# `dtype=int` keeps the indicators as 0/1 integers. pandas >= 2.0 otherwise
# emits booleans, and a frame mixing float and bool columns yields an
# object-dtype array from `.values`, which breaks the later tensor conversion.
inputs = pd.get_dummies(inputs, dummy_na=True, dtype=int)
print(inputs)

   NumRooms  Alley_Pave  Alley_nan
0       3.0           1          0
1       2.0           0          1
2       4.0           0          1
3       3.0           0          1


### Conversion to the Tensor Format

In [5]:
# All columns of inputs and outputs are now numerical, so both can be
# converted into tensors.
import tensorflow as tf

# `to_numpy(dtype=float)` forces a homogeneous float64 array even if the
# indicator columns are booleans (the get_dummies default in pandas >= 2.0);
# `.values` would return an object-dtype array in that case.
X, y = tf.constant(inputs.to_numpy(dtype=float)), tf.constant(outputs.to_numpy())
X, y

(<tf.Tensor: shape=(4, 3), dtype=float64, numpy=
 array([[3., 1., 0.],
        [2., 0., 1.],
        [4., 0., 1.],
        [3., 0., 1.]])>,
 <tf.Tensor: shape=(4,), dtype=int64, numpy=array([127500, 106000, 178100, 140000], dtype=int64)>)

# Exercises
Create a raw dataset with more rows and columns.

1. Delete the column with the most missing values.
2. Convert the preprocessed dataset into tensor format.

In [32]:
import os

# Make sure the target directory exists, then write the exercise dataset.
os.makedirs(os.path.join('..', 'data'), exist_ok=True)
data_file = os.path.join('..', 'data', 'exercise.csv')

# Header row first, then one line per data example; 'NA' marks a missing entry.
csv_lines = (
    'NumRooms,Alley,Rating,Price\n',
    'NA,Pave,4.3,127500\n',
    '2,NA,NA,106000\n',
    '4,NA,NA,178100\n',
    'NA,New,NA,140000\n',
)
with open(data_file, 'w') as f:
    for line in csv_lines:
        f.write(line)

In [33]:
import pandas as pd

# Parse the exercise CSV into a DataFrame and print it for inspection.
data = pd.read_csv(data_file)
print(data)

   NumRooms Alley  Rating   Price
0       NaN  Pave     4.3  127500
1       2.0   NaN     NaN  106000
2       4.0   NaN     NaN  178100
3       NaN   New     NaN  140000


In [34]:
# The first three columns are the inputs; the fourth column is the output.
inputs = data.iloc[:, :3]
outputs = data.iloc[:, 3]
print(inputs)

   NumRooms Alley  Rating
0       NaN  Pave     4.3
1       2.0   NaN     NaN
2       4.0   NaN     NaN
3       NaN   New     NaN


In [35]:
# del inputs['Rating']

In [36]:
# Show the input frame; no column has been removed at this point.
inputs

Unnamed: 0,NumRooms,Alley,Rating
0,,Pave,4.3
1,2.0,,
2,4.0,,
3,,New,


In [37]:
# List the column labels of the input frame.
inputs.columns

Index(['NumRooms', 'Alley', 'Rating'], dtype='object')

In [38]:
# Delete the input column with the most missing values, determined from the
# data instead of being hard-coded. The result is rebuilt from `data` rather
# than removed from `inputs` with `del`, so the cell can be re-run safely
# (`del` raises KeyError once the column is already gone).
features = data.iloc[:, 0:3]
most_missing = features.isna().sum().idxmax()  # on ties, the first such column wins
inputs = features.drop(columns=most_missing)

In [39]:
# Show the input frame again now that one column has been deleted.
inputs

Unnamed: 0,NumRooms,Alley
0,,Pave
1,2.0,
2,4.0,
3,,New


In [40]:
# Fill NaN entries of numeric columns with the mean of the same column.
# `numeric_only=True` skips the string column 'Alley'; without it pandas >= 2.0
# raises a TypeError when trying to average non-numeric values.
inputs = inputs.fillna(inputs.mean(numeric_only=True))
print(inputs)

   NumRooms Alley
0       3.0  Pave
1       2.0   NaN
2       4.0   NaN
3       3.0   New


In [41]:
# One-hot encode 'Alley', with a separate indicator column for missing values
# (`dummy_na=True`). `dtype=int` keeps the indicators as 0/1 integers; pandas
# >= 2.0 defaults to booleans, which would leave the frame with mixed
# float/bool columns and an object-dtype array when converted for a tensor.
inputs = pd.get_dummies(inputs, columns=['Alley'], dummy_na=True, dtype=int)
print(inputs)

   NumRooms  Alley_New  Alley_Pave  Alley_nan
0       3.0          0           1          0
1       2.0          0           0          1
2       4.0          0           0          1
3       3.0          1           0          0


In [None]:
import tensorflow as tf

X, y = tf.co