### In this project we will do some simple preprocessing. We will do a separate project based solely on dealing with null values, such as various imputation methods.

In [2]:
# Import some packages. 

import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [3]:
# Load the West Roxbury housing data and preview the first rows.
#
# The recorded outputs further down (zero nulls in REMODEL, and two dummy columns left after
# drop_first=True) show that REMODEL holds a third, non-null level besides Old/Recent --
# presumably the literal text "None"; confirm against the CSV. pandas 2.0+ parses "None" as a
# missing value by default, which would silently turn that level into NaN and change both the
# null check and the dummy columns. Restrict NA parsing to empty fields so the level survives.

housing_df = pd.read_csv('WestRoxbury.csv', keep_default_na=False, na_values=[''])

# Last expression of the cell: render the first five rows as the cell output.
housing_df.head()

Unnamed: 0,TOTAL VALUE,TAX,LOT SQFT,YR BUILT,GROSS AREA,LIVING AREA,FLOORS,ROOMS,BEDROOMS,FULL BATH,HALF BATH,KITCHEN,FIREPLACE,REMODEL
0,344.2,4330,9965,1880,2436,1352,2.0,6,3,1,1,1,0,
1,412.6,5190,6590,1945,3108,1976,2.0,10,4,2,1,1,0,Recent
2,330.1,4152,7500,1890,2294,1371,2.0,8,4,1,1,1,0,
3,498.6,6272,13773,1957,5032,2608,1.0,9,5,1,1,1,1,
4,331.5,4170,5000,1910,2370,1438,2.0,7,3,2,0,1,0,


In [4]:
# Inspect the dimensions of the frame; `.shape` is a (rows, columns) tuple.

housing_df.shape

(5802, 14)

In [7]:
# Count the missing entries in every column; a column with no gaps reports 0.

housing_df.isna().sum()

TOTAL VALUE     0
TAX             0
LOT SQFT        0
YR BUILT        0
GROSS AREA      0
LIVING AREA     0
FLOORS          0
ROOMS           0
BEDROOMS        0
FULL BATH       0
HALF BATH       0
KITCHEN         0
FIREPLACE       0
REMODEL         0
dtype: int64

In [12]:
# Examine the dtype of every column.
# NOTE(review): this cell carries execution count In [12] while the REMODEL -> category
# conversion in the next cell carries In [11], so the recorded output already lists REMODEL as
# `category`. On a fresh top-to-bottom run this cell would report the dtype before conversion
# (presumably `object` -- confirm).

housing_df.dtypes

TOTAL VALUE      float64
TAX                int64
LOT SQFT           int64
YR BUILT           int64
GROSS AREA         int64
LIVING AREA        int64
FLOORS           float64
ROOMS              int64
BEDROOMS           int64
FULL BATH          int64
HALF BATH          int64
KITCHEN            int64
FIREPLACE          int64
REMODEL         category
dtype: object

In [11]:
# Cast the REMODEL column to pandas' categorical dtype.
# Bracket indexing is used for both the read and the assignment of the column.

housing_df['REMODEL'] = housing_df['REMODEL'].astype('category')

In [13]:
# With REMODEL stored as a categorical, expand it into indicator (dummy) columns so the frame
# is fully numeric for future projects, such as clustering algorithms.
# drop_first=True leaves out the indicator for the first level of each encoded column.

housing_df = pd.get_dummies(
    data=housing_df,
    drop_first=True,
)

In [17]:
# Take the dummy-encoded dataset and standardize it (StandardScaler centers each column on its
# mean and scales it to unit variance) so every column sits on a common scale for future tasks
# such as linear regression. The statistics here are computed from all rows of housing_df.

scaler = StandardScaler()

# Fit and apply the scaler in one step, then re-wrap the resulting array so the original row
# index and column labels are kept.
scaled_values = scaler.fit_transform(housing_df)
norm_df = pd.DataFrame(
    data=scaled_values,
    columns=housing_df.columns,
    index=housing_df.index,
)

Unnamed: 0,TOTAL VALUE,TAX,LOT SQFT,YR BUILT,GROSS AREA,LIVING AREA,FLOORS,ROOMS,BEDROOMS,FULL BATH,HALF BATH,KITCHEN,FIREPLACE,REMODEL_Old,REMODEL_Recent
0,-0.488921,-0.488550,1.381138,-1.576825,-0.553046,-0.564507,0.710966,-0.692039,-0.271806,-0.568577,0.723264,-0.124814,-1.309450,-0.333589,-0.421418
1,0.200812,0.200806,0.116845,0.229392,0.207214,0.590172,0.710966,2.090505,0.909482,1.347151,0.723264,-0.124814,-1.309450,-0.333589,2.372943
2,-0.631102,-0.631230,0.457736,-1.298946,-0.713696,-0.529349,0.710966,0.699233,0.909482,-0.568577,0.723264,-0.124814,-1.309450,-0.333589,-0.421418
3,1.068020,1.068112,2.807634,0.562847,2.383909,1.759654,-1.537005,1.394869,2.090769,-0.568577,0.723264,-0.124814,0.460275,-0.333589,-0.421418
4,-0.616985,-0.616802,-0.478777,-0.743187,-0.627714,-0.405369,0.710966,0.003597,-0.271806,1.347151,-1.150120,-0.124814,-1.309450,-0.333589,-0.421418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5797,0.122158,0.122252,0.181277,0.034876,-0.374295,0.105355,0.710966,1.394869,-0.271806,1.347151,0.723264,-0.124814,0.460275,-0.333589,2.372943
5798,0.153418,0.153513,1.172483,0.368332,-0.577935,-0.599666,0.710966,-0.692039,-0.271806,-0.568577,0.723264,-0.124814,0.460275,-0.333589,-0.421418
5799,0.139301,0.139085,0.344605,1.396486,-0.503267,0.031337,0.710966,0.003597,-0.271806,-0.568577,0.723264,-0.124814,0.460275,-0.333589,-0.421418
5800,-0.846896,-0.846854,0.229227,0.257180,-1.046310,-1.215864,-1.537005,-1.387675,-1.453094,-0.568577,-1.150120,-0.124814,-1.309450,-0.333589,-0.421418


In [19]:
# To hand the data to the next stage of the pipeline we need a train/test split.
#
# Split the dummy-encoded (unscaled) frame first and only then standardize, fitting the scaler
# on the training rows alone. Splitting a frame that was scaled with statistics from *all* rows
# lets information about the test rows leak into preprocessing. The same random_state and
# test_size on the same number of rows select the same rows for each partition as before.

train_raw, test_raw = train_test_split(housing_df, test_size=0.3, random_state=66)

# Learn the per-column mean / standard deviation from the training partition only ...
split_scaler = StandardScaler().fit(train_raw)

# ... and apply those training statistics to both partitions, keeping index and column labels.
Train = pd.DataFrame(split_scaler.transform(train_raw), index=train_raw.index, columns=train_raw.columns)
Test = pd.DataFrame(split_scaler.transform(test_raw), index=test_raw.index, columns=test_raw.columns)

print('Train Set   :', Train.shape)
print('Test Set :', Test.shape)



Train Set   : (4061, 15)
Test Set : (1741, 15)


### Okay, that concludes this mini exercise in data preprocessing.