In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit

import warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_root = '/kaggle/input'
for folder, _, entries in os.walk(input_root):
    for entry in entries:
        print(os.path.join(folder, entry))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Column names supplied explicitly via `names=` when reading the file.
cols = ['MPG','Cylinders','Displacement','Horsepower','Weight','Acceleration','Model Year','Origin']

# "?" is read as a missing value; a tab starts a comment, so the rest of each
# line after the first tab is ignored. Fields are separated by runs of spaces.
df = pd.read_csv(
    '../input/autompgdata/auto-mpg.data',
    names=cols,
    na_values="?",
    comment='\t',
    sep=" ",
    skipinitialspace=True,
)

# Work on a copy so the raw frame `df` stays untouched.
data = df.copy()

# One stratified 80/20 split keyed on the Cylinders column.
# split() yields *positional* row indices, so rows are selected with .iloc
# rather than the label-based .loc (the two only coincide for a default RangeIndex).
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data["Cylinders"]):
    strat_train_set = data.iloc[train_index]
    strat_test_set = data.iloc[test_index]

In [None]:
# Separate the predictors from the MPG target on the training split.
data = strat_train_set.drop(columns='MPG')
data_labels = strat_train_set.loc[:, 'MPG'].copy()

In [None]:
def preprocess_origin_cols(df):
    """Replace the numeric codes in the ``Origin`` column with text labels.

    Codes 1, 2 and 3 become "India", "USA" and "Germany". Any other value,
    including a label written by an earlier call, is left as it is, so
    running this again on the same frame does not wipe the column to NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``Origin`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; its ``Origin`` column is overwritten in place.
    """
    origin_labels = {1: "India", 2: "USA", 3: "Germany"}
    # Fall back to the value itself when it is not a known code; a plain
    # dict-based map would turn every unmatched entry into NaN.
    df['Origin'] = df['Origin'].map(lambda code: origin_labels.get(code, code))
    return df
# Label the Origin codes of the training features; the function writes the
# column back onto the frame it receives, so `data` itself is updated too.
data_tr = preprocess_origin_cols(data)
data_tr.head(n=5)

In [None]:
# Keep Origin as a one-column DataFrame (list selector) rather than a Series.
data_cat = data_tr.loc[:, ["Origin"]]
data_cat.head(n=5)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Learn the Origin categories, then encode the same column as one-hot vectors.
cat_encoder = OneHotEncoder()
cat_encoder.fit(data_cat)
data_cat_1hot = cat_encoder.transform(data_cat)
data_cat_1hot

In [None]:
# Show the first five encoded rows as a dense array.
data_cat_1hot[:5].toarray()

In [None]:
# Take every column except the last one as the numeric feature set.
num_data = data[data.columns[:-1]]
num_data.info()

In [None]:
from sklearn.impute import SimpleImputer

# Fit a median imputer on the numeric columns and display the fitted estimator.
imputer = SimpleImputer(strategy="median").fit(num_data)
imputer

In [None]:
# Fill value learned for each column during fit (the median, per the strategy chosen above).
imputer.statistics_

In [None]:
# Apply the fitted imputer to the numeric columns; the result is kept in X and displayed.
X = imputer.transform(num_data)
X

In [None]:
# Wrap the imputed values in a DataFrame again, reusing the original
# column names and row index of the numeric features.
data_tr = pd.DataFrame(
    data=X,
    index=num_data.index,
    columns=num_data.columns,
)
data_tr.info()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions read by CustomAttrAdder. They index into the numeric feature
# columns in notebook order: Cylinders, Displacement, Horsepower, Weight,
# Acceleration, Model Year.
acc_ix, hpower_ix, cyl_ix = 4,2,0

class CustomAttrAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio features to a 2-D numeric matrix.

    It always appends ``X[:, acc_ix] / X[:, cyl_ix]``. When ``acc_on_power``
    is true it also appends ``X[:, acc_ix] / X[:, hpower_ix]``, placed before
    the cylinder ratio.

    Parameters
    ----------
    acc_on_power : bool, default=True
        Whether to add the acceleration-over-horsepower column.
    """

    def __init__(self, acc_on_power=True):
        self.acc_on_power = acc_on_power

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio column(s) appended on the right.

        ``X`` may be a NumPy array or anything ``np.asarray`` can convert to
        a 2-D array, such as a DataFrame, so callers need not pass ``.values``.
        """
        # Positional `X[:, i]` indexing is only valid on an ndarray.
        X = np.asarray(X)
        acc_on_cyl = X[:, acc_ix] / X[:, cyl_ix]
        if self.acc_on_power:
            acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
            return np.c_[X, acc_on_power, acc_on_cyl]
        return np.c_[X, acc_on_cyl]

# Try the transformer on the imputed numeric values and inspect the first row.
attr_adder = CustomAttrAdder(acc_on_power=True)
data_tr_extra_attrs = attr_adder.transform(data_tr.to_numpy())
data_tr_extra_attrs[0]

In [None]:
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler

# Re-select the numeric columns of the imputed frame by dtype.
numerics = ['float64', 'int64']
num_data = data_tr.select_dtypes(include=numerics)

# Numeric preprocessing chain: median imputation, extra ratio attributes,
# then standard scaling.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('attrs_adder', CustomAttrAdder()),
    ('std_scalar', StandardScaler()),
])

# Fit the chain on the imputed frame and look at the first transformed row.
num_data_tr = num_pipeline.fit_transform(data_tr)
num_data_tr[0]

In [None]:
from sklearn.compose import ColumnTransformer

# Column groups routed to each branch of the combined transformer.
num_attrs = num_data.columns.tolist()
cat_attrs = ["Origin"]

# Numeric columns go through num_pipeline; Origin is one-hot encoded.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attrs),
    ("cat", OneHotEncoder(), cat_attrs),
])

prepared_data = full_pipeline.fit_transform(data)
prepared_data