# Load Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [2]:
housing = pd.read_csv('housing.csv')
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY


In [3]:
housing.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y = None):
        return self # do nothing
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix]/X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if(self.add_bedrooms_per_room):
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [5]:
num_pipeline = Pipeline([
                            ('imputer', SimpleImputer(strategy = 'median')),
                            ('attribs_adder', CombinedAttributesAdder()),
                            ('std_scaler', StandardScaler())
    
                        ])

In [6]:
num_pipeline

In [7]:
myArray = num_pipeline.fit_transform(housing.drop(columns= 'ocean_proximity'))

In [8]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY


In [9]:
housing.shape

(20640, 10)

In [10]:
housing.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [11]:
df2 = pd.DataFrame(myArray, columns = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'room per household', 'population per household', 'bedrooms per household'])

In [12]:
df2.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,room per household,population per household,bedrooms per household
0,-1.327835,1.052548,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766,2.129631,0.628559,-0.049597,-1.029988
1,-1.322844,1.043185,-0.607019,2.04589,1.357143,0.861439,1.669961,2.332238,1.314156,0.327041,-0.092512,-0.888897
2,-1.332827,1.038503,1.856182,-0.535746,-0.827024,-0.820777,-0.843637,1.782699,1.258693,1.15562,-0.025843,-1.291686
3,-1.337818,1.038503,1.856182,-0.624215,-0.719723,-0.766028,-0.733781,0.932968,1.1651,0.156966,-0.050329,-0.449613
4,-1.337818,1.038503,1.856182,-0.462404,-0.612423,-0.759847,-0.629157,-0.012881,1.1729,0.344711,-0.085616,-0.639087


In [13]:
df2.shape

(20640, 12)

# Column Transformer

In [14]:
housing = pd.read_csv('housing.csv')
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY


In [15]:
# Objectives

#    1. use simple imputer to impute with strategy = median
#    2. add new attributes
#    3. standard scaler on numeric attributes
#    4. one hot encoding on "ocean proximity"
#    5. do not do anything to lattitude and longitude

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy = 'median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())
])

num_attribs = ['housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value']
cat_attribs = ['ocean_proximity']
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
    ("donottouch", 'passthrough', ['longitude','latitude'])
])

In [17]:
full_pipeline

In [18]:
housing_prepared = full_pipeline.fit_transform(housing)

In [19]:
housing_prepared

array([[ 9.82142658e-01, -8.04819097e-01, -9.72476479e-01, ...,
         0.00000000e+00, -1.22230000e+02,  3.78800000e+01],
       [-6.07018913e-01,  2.04589010e+00,  1.35714343e+00, ...,
         0.00000000e+00, -1.22220000e+02,  3.78600000e+01],
       [ 1.85618152e+00, -5.35745886e-01, -8.27024264e-01, ...,
         0.00000000e+00, -1.22240000e+02,  3.78500000e+01],
       ...,
       [-9.24851228e-01, -1.74995261e-01, -1.23607812e-01, ...,
         0.00000000e+00, -1.21220000e+02,  3.94300000e+01],
       [-8.45393149e-01, -3.55599767e-01, -3.04826966e-01, ...,
         0.00000000e+00, -1.21320000e+02,  3.94300000e+01],
       [-1.00430931e+00,  6.84082740e-02,  1.88756782e-01, ...,
         0.00000000e+00, -1.21240000e+02,  3.93700000e+01]])

In [20]:
df3 = pd.DataFrame(housing_prepared)
df3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766,2.129631,-0.870021,-0.323701,0.289736,0.0,0.0,0.0,1.0,0.0,-122.23,37.88
1,-0.607019,2.04589,1.357143,0.861439,1.669961,2.332238,1.314156,-0.260584,0.191818,1.17516,0.0,0.0,0.0,1.0,0.0,-122.22,37.86
2,1.856182,-0.535746,-0.827024,-0.820777,-0.843637,1.782699,1.258693,-0.799033,-0.08362,-0.079253,0.0,0.0,0.0,1.0,0.0,-122.24,37.85
3,1.856182,-0.624215,-0.719723,-0.766028,-0.733781,0.932968,1.1651,-0.776,-0.525073,0.302255,0.0,0.0,0.0,1.0,0.0,-122.25,37.85
4,1.856182,-0.462404,-0.612423,-0.759847,-0.629157,-0.012881,1.1729,-0.774355,-1.098269,1.008474,0.0,0.0,0.0,1.0,0.0,-122.25,37.85


In [21]:
df3.shape

(20640, 17)

In [22]:
housing['ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [23]:
df3 = pd.DataFrame(housing_prepared, columns = ['housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'room per household', 'population per household', 'bedrooms per household', '<1HOcean',
                                               'Inland', 'Near Ocean', 'Near Bay', 'Island', 'longitude', 'latitude'])
df3.head()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,room per household,population per household,bedrooms per household,<1HOcean,Inland,Near Ocean,Near Bay,Island,longitude,latitude
0,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766,2.129631,-0.870021,-0.323701,0.289736,0.0,0.0,0.0,1.0,0.0,-122.23,37.88
1,-0.607019,2.04589,1.357143,0.861439,1.669961,2.332238,1.314156,-0.260584,0.191818,1.17516,0.0,0.0,0.0,1.0,0.0,-122.22,37.86
2,1.856182,-0.535746,-0.827024,-0.820777,-0.843637,1.782699,1.258693,-0.799033,-0.08362,-0.079253,0.0,0.0,0.0,1.0,0.0,-122.24,37.85
3,1.856182,-0.624215,-0.719723,-0.766028,-0.733781,0.932968,1.1651,-0.776,-0.525073,0.302255,0.0,0.0,0.0,1.0,0.0,-122.25,37.85
4,1.856182,-0.462404,-0.612423,-0.759847,-0.629157,-0.012881,1.1729,-0.774355,-1.098269,1.008474,0.0,0.0,0.0,1.0,0.0,-122.25,37.85


# Toy Dataset

In [24]:
import pandas as pd

data = {'Name': ['Alice', 'Bob', 'Charlie', 'Dave', 'Eve', 'Frank', 'Grace', 'Henry', 'Ivy', 'John', 'Kate', 'Liam', 'Mia', 'Noah', 'Olivia'],
        'Age': [24, 32, 45, 18, 27, 39, 52, 30, 20, 42, 29, 25, 33, 37, 21],
        'Gender': ['F', 'M', 'M', 'M', 'F', 'M', 'F', 'M', 'F', 'M', 'F', 'M', 'F', 'M', 'F'],
        'City': ['New York', 'San Francisco', 'London', 'Paris', 'Sydney', 'Tokyo', 'New York', 'San Francisco', 'London', 'Paris', 'Sydney', 'Tokyo', 'New York', 'San Francisco', 'London'],
        'Salary': [50000, 70000, 90000, 30000, 55000, 75000, 95000, 40000, 60000, 80000, 100000, 85000, 65000, 45000, 85000],
        'Marital Status': ['Married', 'Single', 'Married', 'Single', 'Married', 'Single', 'Married', 'Single', 'Single', 'Married', 'Single', 'Single', 'Single', 'Single', 'Married']}

df = pd.DataFrame(data)

df


Unnamed: 0,Name,Age,Gender,City,Salary,Marital Status
0,Alice,24,F,New York,50000,Married
1,Bob,32,M,San Francisco,70000,Single
2,Charlie,45,M,London,90000,Married
3,Dave,18,M,Paris,30000,Single
4,Eve,27,F,Sydney,55000,Married
5,Frank,39,M,Tokyo,75000,Single
6,Grace,52,F,New York,95000,Married
7,Henry,30,M,San Francisco,40000,Single
8,Ivy,20,F,London,60000,Single
9,John,42,M,Paris,80000,Married


In [25]:
# We want to do the following data pre-processing

# 1. Drop the "Name" Column
# 2. Apply Min Max Scaling on "Age" and "Salary"
# 3. Apply One Hot Encoding on Marital Status
# 4. Leave the "Gender" column as it is

# You should first split the dataset into train and test set, apply the fit and transform on the train set 
# and tranform() on the test set

In [26]:
#Let's do train-test split
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(df, test_size = 0.2, random_state = 42)

In [27]:
X_train

Unnamed: 0,Name,Age,Gender,City,Salary,Marital Status
13,Noah,37,M,San Francisco,45000,Single
5,Frank,39,M,Tokyo,75000,Single
8,Ivy,20,F,London,60000,Single
2,Charlie,45,M,London,90000,Married
1,Bob,32,M,San Francisco,70000,Single
14,Olivia,21,F,London,85000,Married
4,Eve,27,F,Sydney,55000,Married
7,Henry,30,M,San Francisco,40000,Single
10,Kate,29,F,Sydney,100000,Single
12,Mia,33,F,New York,65000,Single


In [28]:
X_train['Marital Status'].value_counts()

Single     8
Married    4
Name: Marital Status, dtype: int64

In [29]:
X_test

Unnamed: 0,Name,Age,Gender,City,Salary,Marital Status
9,John,42,M,Paris,80000,Married
11,Liam,25,M,Tokyo,85000,Single
0,Alice,24,F,New York,50000,Married


In [30]:
# Let's create the pipeline
from sklearn.preprocessing import MinMaxScaler
full_pipeline = ColumnTransformer([
    ("dropName", "drop", ['Name']),
    ("mms", MinMaxScaler(), ['Age', 'Salary']),
    ("onehot", OneHotEncoder(), ['Marital Status']),
    ("donottouch", 'passthrough', ['Gender']),
])

In [31]:
full_pipeline

In [32]:
train_set = full_pipeline.fit_transform(X_train)
test_set = full_pipeline.transform(X_test)

In [33]:
train_set

array([[0.5588235294117646, 0.21428571428571425, 0.0, 1.0, 'M'],
       [0.6176470588235293, 0.6428571428571428, 0.0, 1.0, 'M'],
       [0.05882352941176472, 0.42857142857142855, 0.0, 1.0, 'F'],
       [0.7941176470588235, 0.857142857142857, 1.0, 0.0, 'M'],
       [0.4117647058823529, 0.5714285714285714, 0.0, 1.0, 'M'],
       [0.08823529411764708, 0.7857142857142856, 1.0, 0.0, 'F'],
       [0.2647058823529411, 0.35714285714285715, 1.0, 0.0, 'F'],
       [0.3529411764705882, 0.14285714285714285, 0.0, 1.0, 'M'],
       [0.32352941176470584, 1.0, 0.0, 1.0, 'F'],
       [0.4411764705882353, 0.5, 0.0, 1.0, 'F'],
       [0.0, 0.0, 0.0, 1.0, 'M'],
       [0.9999999999999999, 0.9285714285714286, 1.0, 0.0, 'F']],
      dtype=object)

In [34]:
train_df = pd.DataFrame(train_set)
train_df.head()

Unnamed: 0,0,1,2,3,4
0,0.558824,0.214286,0.0,1.0,M
1,0.617647,0.642857,0.0,1.0,M
2,0.058824,0.428571,0.0,1.0,F
3,0.794118,0.857143,1.0,0.0,M
4,0.411765,0.571429,0.0,1.0,M


In [35]:
train_df = pd.DataFrame(train_set, columns = ['Age', 'Salary', 'Married', 'Single', 'Gender'])
train_df


Unnamed: 0,Age,Salary,Married,Single,Gender
0,0.558824,0.214286,0.0,1.0,M
1,0.617647,0.642857,0.0,1.0,M
2,0.058824,0.428571,0.0,1.0,F
3,0.794118,0.857143,1.0,0.0,M
4,0.411765,0.571429,0.0,1.0,M
5,0.088235,0.785714,1.0,0.0,F
6,0.264706,0.357143,1.0,0.0,F
7,0.352941,0.142857,0.0,1.0,M
8,0.323529,1.0,0.0,1.0,F
9,0.441176,0.5,0.0,1.0,F


In [36]:
test_df = pd.DataFrame(test_set, columns = ['Age', 'Salary', 'Married', 'Single', 'Gender'])
test_df


Unnamed: 0,Age,Salary,Married,Single,Gender
0,0.705882,0.714286,1.0,0.0,M
1,0.205882,0.785714,0.0,1.0,M
2,0.176471,0.285714,1.0,0.0,F


In [37]:
# The first row of the train dataset
X_train.head(1)

Unnamed: 0,Name,Age,Gender,City,Salary,Marital Status
13,Noah,37,M,San Francisco,45000,Single


In [38]:
#Let's figure out the scaled age of the first row
(37 - X_train['Age'].min())/(X_train['Age'].max() - X_train['Age'].min())

0.5588235294117647

In [39]:
# Let's verify
train_df.head(1)

Unnamed: 0,Age,Salary,Married,Single,Gender
0,0.558824,0.214286,0.0,1.0,M


In [40]:
# The first row of the test set
X_test.head(1)

Unnamed: 0,Name,Age,Gender,City,Salary,Marital Status
9,John,42,M,Paris,80000,Married


In [41]:
#Let's figure out the scaled age of the first row of the test set
(42 - X_train['Age'].min())/(X_train['Age'].max() - X_train['Age'].min())

0.7058823529411765

In [42]:
# Let's verify
test_df.head(1)

Unnamed: 0,Age,Salary,Married,Single,Gender
0,0.705882,0.714286,1.0,0.0,M
