In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
import os
# Show the contents of the current working directory.
os.listdir()

['.ipynb_checkpoints',
 'Auto MPG - part 1.ipynb',
 'auto-mpg.data',
 'Data pipeline - Part 2.ipynb',
 'data_preparation_part1']

In [3]:
# Column names assigned to the file's fields via `names=` below.
col = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'Model Year', 'Origin',
]

# Space-separated file: '?' is read as NaN, and everything after a tab
# character on a line is treated as a comment and dropped.
df = pd.read_csv(
    'auto-mpg.data',
    names=col,
    sep=' ',
    skipinitialspace=True,
    comment='\t',
    na_values='?',
)
# Work on a copy so the freshly loaded frame `df` stays untouched.
data = df.copy()

# Data Preparation using Sklearn

In [4]:
# Preview the first rows of the working copy.
data.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


# Stratified Sampling — We create homogeneous subgroups called strata from the overall population and sample the right number of instances from each stratum to ensure that the test set is representative of the overall population.

In [5]:
# A single shuffled split that holds out 20% of the rows; the fixed seed
# makes the split reproducible.
strata = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=42,
)
# split() is lazy: this only displays the generator object. The indices
# themselves are produced when the generator is iterated.
strata.split(data, data['Cylinders'])

<generator object BaseShuffleSplit.split at 0x0000026D90C49C80>

In [6]:
# Stratify on Cylinders. With n_splits=1 the generator yields exactly one
# (train, test) pair, so take it directly with next().
train_index, test_index = next(strata.split(data, data['Cylinders']))

# split() returns *positional* row indices, so select with .iloc. Label-based
# .loc only happens to agree while `data` has the default RangeIndex.
train_set = data.iloc[train_index]
test_set = data.iloc[test_index]

# Separating features and target variables

In [7]:
# Predictors: every training column except the MPG target.
data_features = train_set.drop(columns=['MPG'])
# Target kept as a one-column DataFrame (list selector), not a Series.
data_target = train_set.loc[:, ['MPG']]

# 1. Function for preprocessing the Origin column

In [37]:
def preprocess_origin_col(df) :
    """Return a copy of ``df`` whose Origin codes are replaced by labels.

    The integer codes 1, 2 and 3 in the ``Origin`` column are mapped to
    'India', 'USA' and 'Germany' respectively. Any value that is not one
    of these keys becomes NaN (the behaviour of ``Series.map``).

    The input frame is NOT modified. Mapping in place would make the call
    non-idempotent: a second run on the same frame would turn every
    already-mapped label into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``Origin`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, in the same order, and the
        ``Origin`` column mapped to labels.
    """
    # NOTE(review): verify these country labels against the dataset's
    # documentation for the origin codes.
    origin_labels = {1 : 'India', 2 : 'USA', 3 : "Germany"}
    # assign() builds a new DataFrame, leaving the caller's frame untouched.
    return df.assign(Origin = df['Origin'].map(origin_labels))
# Map the Origin codes of the training features to labels and display the result.
data_tr = preprocess_origin_col(data_features)  
data_tr

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,Germany
151,4,79.0,67.0,2000.0,16.0,74,USA
388,4,156.0,92.0,2585.0,14.5,82,India
48,6,250.0,88.0,3139.0,14.5,71,India
114,4,98.0,90.0,2265.0,15.5,73,USA
...,...,...,...,...,...,...,...
147,4,90.0,75.0,2108.0,15.5,74,USA
156,8,400.0,170.0,4668.0,11.5,75,India
395,4,135.0,84.0,2295.0,11.6,82,India
14,4,113.0,95.0,2372.0,15.0,70,Germany


# 2. OneHotEncoding on Origin Column

In [61]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the Origin column on its own. The double brackets pass a
# one-column DataFrame (2-D input) rather than a Series.
ohe = OneHotEncoder()
# The result is only displayed here (a sparse matrix), not stored.
ohe.fit_transform(data_tr[['Origin']])

<318x3 sparse matrix of type '<class 'numpy.float64'>'
	with 318 stored elements in Compressed Sparse Row format>

In [63]:
# Count the missing values in each column.
data_tr.isnull().sum()

Cylinders       0
Displacement    0
Horsepower      4
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

# 3. Handling missing values with SimpleImputer

In [41]:
# Numeric-only training features: every column except Origin.
num_data = data_features.drop(columns=['Origin'])
num_data

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year
145,4,83.0,61.0,2003.0,19.0,74
151,4,79.0,67.0,2000.0,16.0,74
388,4,156.0,92.0,2585.0,14.5,82
48,6,250.0,88.0,3139.0,14.5,71
114,4,98.0,90.0,2265.0,15.5,73
...,...,...,...,...,...,...
147,4,90.0,75.0,2108.0,15.5,74
156,8,400.0,170.0,4668.0,11.5,75
395,4,135.0,84.0,2295.0,11.6,82
14,4,113.0,95.0,2372.0,15.0,70


In [42]:
from sklearn.impute import SimpleImputer
# Imputer that fills missing entries with the column median.
imputer = SimpleImputer(strategy='median')
# fit() only learns the per-column medians; the data is not changed here.
imputer.fit(num_data)

SimpleImputer(strategy='median')

In [43]:
# Per-column medians learned by fit(), in the column order of num_data.
imputer.statistics_ 

array([   4. ,  146. ,   92. , 2844. ,   15.5,   76. ])

In [44]:
# Fill the missing values using the learned medians.
# NOTE: the result is indexed below as a plain 2-D array (arr[:, i]),
# not as a DataFrame.
num_data_tr = imputer.transform(num_data)

# 4. Creating a custom attribute adder class

To make changes to datasets and create new variables, sklearn offers the BaseEstimator and TransformerMixin base classes. By subclassing them, we can define our own transformer that adds new features.

We have created a class to add two new features as found in the EDA step above:

acc_on_power — Acceleration divided by Horsepower
acc_on_cyl — Acceleration divided by the number of Cylinders

In [20]:
# Preview the numeric features.
# NOTE(review): the saved output below still lists an Origin column even
# though num_data is built by dropping Origin; it looks stale (the execution
# counts are out of order). Re-run the notebook top to bottom to refresh it.
num_data.head()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2


In [45]:
from sklearn.base import BaseEstimator, TransformerMixin
# Column positions inside the numeric feature array, following the column
# order of num_data: Cylinders=0, Horsepower=2, Acceleration=4.
# CustomAttrAdder.transform reads these module-level values.
acc_i = 4
cyl_i = 0
hp_i =  2

class CustomAttrAdder(BaseEstimator, TransformerMixin) :
    """Transformer that appends ratio features to a 2-D numeric array.

    Always appends acceleration / cylinders as the last column. When
    ``acc_on_power`` is true, acceleration / horsepower is inserted just
    before it. The column positions come from the module-level indices
    ``acc_i``, ``cyl_i`` and ``hp_i``.
    """

    def __init__(self, acc_on_power = True) :
        # Stored under the same name as the constructor argument.
        self.acc_on_power = acc_on_power

    def fit(self, num_data_tr, y = None) :
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, num_data_tr) :
        """Return ``num_data_tr`` with the ratio column(s) appended on the right.

        ``num_data_tr`` is indexed as ``arr[:, i]``, so it must be a 2-D
        array; the new columns are added with ``np.c_``.
        """
        acceleration = num_data_tr[:, acc_i]
        per_cylinder = acceleration / num_data_tr[:, cyl_i]
        if not self.acc_on_power :
            return np.c_[num_data_tr, per_cylinder]
        per_horsepower = acceleration / num_data_tr[:, hp_i]
        return np.c_[num_data_tr, per_horsepower, per_cylinder]
    
# Try the transformer directly on the imputed array. transform() is called
# without fit() because fit() does nothing for this class.
attr_addr = CustomAttrAdder(acc_on_power= True)
num_data_extra_attr = attr_addr.transform(num_data_tr)

In [46]:
# First row: the six original numeric values followed by the two ratio features.
num_data_extra_attr[0]

array([4.0000000e+00, 8.3000000e+01, 6.1000000e+01, 2.0030000e+03,
       1.9000000e+01, 7.4000000e+01, 3.1147541e-01, 4.7500000e+00])

# 5. Scaling the data using StandardScaler

In [47]:
# Scale the data using StandardScaler
from sklearn.preprocessing import StandardScaler  
scalar = StandardScaler()
# Fit and transform in one call; the scaled array is only displayed, not stored.
scalar.fit_transform(num_data_extra_attr)

array([[-0.85657842, -1.07804475, -1.15192977, ..., -0.54436373,
         1.70952741,  1.29565517],
       [-0.85657842, -1.1174582 , -0.9900351 , ..., -0.54436373,
         0.79867454,  0.666186  ],
       [-0.85657842, -0.3587492 , -0.31547399, ...,  1.63652025,
        -0.21906787,  0.35145142],
       ...,
       [-0.85657842, -0.56566984, -0.53133355, ...,  1.63652025,
        -0.46365334, -0.25703544],
       [-0.85657842, -0.78244384, -0.23452666, ..., -1.63480572,
        -0.21548258,  0.45636295],
       [ 0.32260746, -0.45728283,  0.44003446, ...,  1.36390976,
        -0.75313354, -0.76061078]])

# Creating a pipeline of tasks

In [51]:
# Pipeline utility from sklearn
from sklearn.pipeline import Pipeline

# Numeric preprocessing, applied in this order:
#   1. imputer - fill missing values with the column median
#   2. attr    - append the ratio features from CustomAttrAdder
#   3. scale   - standardize with StandardScaler
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('attr', CustomAttrAdder()),
    ('scale', StandardScaler()),
])

In [54]:
# Fit the numeric pipeline on num_data and transform it in one call.
# NOTE: this rebinds num_data_tr, which earlier held the imputer-only output.
num_data_tr = num_pipeline.fit_transform(num_data)
num_data_tr[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517])

# Transforming Numerical and Categorical attributes

In [65]:
# Apply separate preprocessing to the numeric and categorical columns
from sklearn.compose import ColumnTransformer

num_col = list(num_data)  # iterating a DataFrame yields its column names
cat_col = ['Origin']

# Complete preprocessing: the numeric columns go through num_pipeline and
# Origin is one-hot encoded; the outputs are concatenated in that order.
final_pipeline = ColumnTransformer([
        ('num', num_pipeline, num_col),
        ('cat', OneHotEncoder(), cat_col)
        ])

# Fit on data_tr, the frame returned by preprocess_origin_col, so the
# encoder's input does not depend on whether data_features itself was
# modified by that call.
prepared_data = final_pipeline.fit_transform(data_tr)

In [67]:
# First prepared row: the scaled numeric features followed by the one-hot Origin columns.
prepared_data[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517,  1.        ,  0.        ,
        0.        ])

In [68]:
import pickle
# Persist the prepared feature array. The with-block guarantees the file
# handle is flushed and closed, even if pickle.dump raises.
with open('feature_data_array.pkl', 'wb') as pkl_file:
    pickle.dump(prepared_data, pkl_file)

In [69]:
# List the working directory again; the saved output shows the new pickle file.
os.listdir()

['.ipynb_checkpoints',
 'Auto MPG - part 1.ipynb',
 'auto-mpg.data',
 'Data pipeline - Part 2.ipynb',
 'data_preparation_part1',
 'feature_data_array.pkl']