In [1]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

In [2]:
# load the data
dataset_path = tf.keras.utils.get_file(
    'auto-mpg.data',('http://archive.ics.uci.edu/ml/machine-learning'
                    '-databases/auto-mpg/auto-mpg.data'))

In [3]:
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'ModelYear','Origin']

In [4]:
df = pd.read_csv(dataset_path, names=column_names, na_values='?',
                comment='\t',sep=' ',skipinitialspace=True)

In [5]:
df.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,ModelYear,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [6]:
df.isna().sum()

MPG             0
Cylinders       0
Displacement    0
Horsepower      6
Weight          0
Acceleration    0
ModelYear       0
Origin          0
dtype: int64

In [7]:
df = df.dropna()
df = df.reset_index(drop=True)

In [8]:
import sklearn
import sklearn.model_selection

In [9]:
df_train, df_test = sklearn.model_selection.train_test_split(
df, train_size=0.8)

In [10]:
train_stats = df_train.describe().transpose()
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MPG,313.0,23.514058,7.786798,9.0,17.0,23.0,29.0,44.6
Cylinders,313.0,5.42492,1.677993,3.0,4.0,4.0,6.0,8.0
Displacement,313.0,192.253994,103.777005,68.0,104.0,146.0,258.0,455.0
Horsepower,313.0,104.140575,38.280837,46.0,75.0,92.0,125.0,225.0
Weight,313.0,2973.067093,852.806587,1613.0,2220.0,2795.0,3613.0,4997.0
Acceleration,313.0,15.583706,2.733826,8.0,13.9,15.5,17.2,24.8
ModelYear,313.0,75.907348,3.735791,70.0,73.0,76.0,79.0,82.0
Origin,313.0,1.578275,0.801306,1.0,1.0,1.0,2.0,3.0


In [11]:
numeric_column_names = ['Cylinders','Displacement','Horsepower','Weight',
                       'Acceleration']
df_train_norm, df_test_norm = df_train.copy(), df_test.copy()

In [14]:
for col_name in numeric_column_names:
    mean = train_stats.loc[col_name,'mean']
    std =  train_stats.loc[col_name, 'std']
    df_train_norm.loc[:,col_name]=(df_train_norm.loc[:,col_name]-mean)/std
    df_test_norm.loc[:,col_name]= (df_test_norm.loc[:, col_name]-mean)/std
df_train_norm.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,ModelYear,Origin
370,37.0,-3.739052,-1.86197,-2.745098,-3.487519,-5.350267,82,3
96,18.0,-3.028739,-1.849528,-2.71985,-3.486012,-5.577728,73,1
150,19.0,-3.028739,-1.849528,-2.726674,-3.485815,-5.644628,75,1
250,19.2,-3.028739,-1.848971,-2.71985,-3.485442,-5.216466,78,1
113,26.0,-3.739052,-1.86132,-2.730086,-3.487189,-5.711529,73,2


In [15]:
numeric_features = []
for col_names in numeric_column_names:
    numeric_features.append(tf.feature_column.numeric_column(key=col_name))

In [16]:
numeric_features

[NumericColumn(key='Acceleration', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Acceleration', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Acceleration', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Acceleration', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='Acceleration', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

In [17]:
df.ModelYear.value_counts()

73    40
78    36
76    34
75    30
82    30
70    29
79    29
72    28
77    28
81    28
71    27
80    27
74    26
Name: ModelYear, dtype: int64

In [18]:
feature_year = tf.feature_column.numeric_column(key='ModelYear')
bucketized_features=[]
bucketized_features.append(
    tf.feature_column.bucketized_column(
        source_column=feature_year, boundaries=[73,76,79]))

In [19]:
df.Origin.value_counts()

1    245
3     79
2     68
Name: Origin, dtype: int64

In [21]:
bucketized_features

[BucketizedColumn(source_column=NumericColumn(key='ModelYear', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(73, 76, 79))]

In [None]:
tf.cast()