#### Copyright 2018 Google LLC.

In [None]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#Intro to Modeling


**Learning Objectives:**
* Become familiar with pandas for handling small datasets
* Use the tf.Estimator and Feature Column API to experiment with feature transformations
* Use visualizations and run experiments to understand the value of feature transformations

Please **make a copy** of this Colab notebook before starting this lab. To do so, choose **File**->**Save a copy in Drive**.

## Setup

Let's start by importing our dependencies.

## Pandas, a helpful data analysis library for in-memory dataset

We use a package called [Pandas](http://pandas.pydata.org/) for reading in our data, exploring our data and doing some basic processing. It is really helpful for datasets that fit in memory! And it has some nice integrations, as you will see.

First we set up some options to control how items are displayed and the maximum number of rows to show when displaying a table.  Feel free to change this setup to whatever you'd like.

### Load the dataset with pandas
The car data set we will be using in this lab is provided as a comma separated file without a header row.  In order for each column to have a meaningful header name we must provide it.  We get the information about the columns from the [Automobile Data Set](https://archive.ics.uci.edu/ml/datasets/automobile).

We will use the features of the car, to try to predict its price.


This is a really small dataset! Only 205 examples.

For simplicity in this codelab, we do not split the data further into training and validation. But you MUST do this on real datasets, or else you will overfit to your single dataset.

## Task 0: Use pandas to explore and prepare the data

- Use Pandas to inspect the data and manually curate a list of numeric_feature_names and categorical_feature_names.




**Step 1: Loading and Inspecting the Data**

In [1]:
# Loading all the Libraries

import pandas as pd
import numpy as np

# Defining the Column as per the specified order
feature_names = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base',
    'length', 'width', 'height', 'weight', 'engine-type', 'num-cylinders',
    'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio',
    'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price'
]

# Loading the Dataset
data = pd.read_csv('https://storage.googleapis.com/mledu-datasets/cars_data.csv',
                        sep=',', names=feature_names, header=None, encoding='latin-1')



# Assigning Column Names
data.columns = feature_names

# Displaying Basic information about the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-doors          205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  weight             205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-cylinders      205 non-null    object 
 16  engine-size        205 non

**Step 2: Identifying Numeric and Categorical Features**<br>

In [2]:
# Displaying the number of unique values in each column for categorical features identification
unique_values = data.nunique()
print(unique_values)

symboling              6
normalized-losses     52
make                  22
fuel-type              2
aspiration             2
num-doors              3
body-style             5
drive-wheels           3
engine-location        2
wheel-base            53
length                75
width                 44
height                49
weight               171
engine-type            7
num-cylinders          7
engine-size           44
fuel-system            8
bore                  39
stroke                37
compression-ratio     32
horsepower            60
peak-rpm              24
city-mpg              29
highway-mpg           30
price                187
dtype: int64


**Step 3: Curating Lists of Numeric and Categorical Features**

In [3]:
# List of numeric features
numeric_feature_names = ['symboling', 'wheel-base', 'length', 'width', 'height', 'weight', 'engine-size', 'compression-ratio','city-mpg', 'highway-mpg']

# List of features that are numeric but represented as object types
object_numeric_features = ['bore', 'stroke', 'horsepower', 'peak-rpm', 'price']

# List of Categorical features
categorical_feature_names = ['normalized-losses' 'make', 'fuel-type', 'aspiration', 'num-doors', 'body-style', 'drive-wheels', 'engine-location', 'engine-type', 'num-cylinders', 'fuel-system']

**Step 4: Handling Placeholder values**


In [4]:
import numpy as np

# Replacing '?' with NaN in columns that should be numeric
data[object_numeric_features] = data[object_numeric_features].replace('?', np.nan)

# Converting them to numeric types
for col in object_numeric_features:
    data[col] = pd.to_numeric(data[col], errors='coerce')

**Step 5: Verifying Data Types and Checking for missing values**

In [5]:
# Confirming the numeric columns are correctly typed
data.info()

# Displaying missing values in each column to prepare data for cleaning
missing_values = data.isnull().sum()
print(missing_values)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-doors          205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  weight             205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-cylinders      205 non-null    object 
 16  engine-size        205 non

## Task 1: Make your best model with numeric features. No normalization allowed.

Modify the model provided below to achieve the lowest eval loss. You may want to change various hyperparameters:
- learning rate
- choice of optimizer
- hidden layer dimensions -- make sure your choice here makes sense given the number of training examples
- batch size
- num training steps
- (anything else you can think of changing)

Do not use the `normalizer_fn` arg on `numeric_column`.

**Step 1: Importing All Necessary Libraries**

In [6]:
import tensorflow as tf
from tensorflow import feature_column
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from sklearn.model_selection import train_test_split
import math
import pandas as pd
import numpy as np


**Step 2: Preparing the Data**

In [7]:
# Defining the numeric features
numeric_features = ['symboling', 'wheel-base', 'length', 'width', 'height', 'weight', 'engine-size', 'compression-ratio','city-mpg', 'highway-mpg',
                    'bore', 'stroke', 'horsepower', 'peak-rpm', 'price']
# Dropping rows with missing values in the numeric features for simplicity
data = data.dropna(subset=numeric_features)

# Splitting data into features and labels (Assuming 'price' as the label)
X = data[numeric_features].drop(columns=['price'])
y = data['price']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Step 3: Creating Numeric Feature Columns**

In [8]:

# Creating Tensorflow numeric feature columns
feature_columns = [feature_column.numeric_column(key=col) for col in X.columns]

Instructions for updating:
Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.


**Step 4: Building and Compiling the Model**

In [9]:
# Defining a model function that allows for hyperparameter tuning
from tensorflow.keras import layers, optimizers, Model
tf.config.run_functions_eagerly(True)

def build_model(learning_rate=0.001, optimizer='adam', hidden_units=[64, 32], dropout_rate=0.2):

  input_layers = {
      col.key: keras.Input(shape=(1,), name=col.key)  # Assuming numeric columns with shape (1,)
      for col in feature_columns
  }

  feature_layer = layers.concatenate([input_layers[col.key] for col in feature_columns])


  #Building the model

  model = tf.keras.Sequential()
  model.add(layers.Dense(hidden_units[0], activation='relu', input_shape=(feature_layer.shape[1],)))

  x=layers.Dense(hidden_units[1], activation='relu')(feature_layer)
  for units in hidden_units:
    model.add(layers.Dense(units, activation='relu'))
    model.add(layers.Dropout(dropout_rate))
  model.add(layers.Dense(1)) #Output layer

  #Choosing optimizer
  if optimizer == 'adam':
    opt = Adam(learning_rate=learning_rate)
  elif optimizer == 'sgd':
    opt = SGD(learning_rate=learning_rate)
  elif optimizer == 'rmsprop':
    opt = RMSprop(learning_rate=learning_rate)
  else:
    opt = Adam(learning_rate=learning_rate) #Default

    # Compiling the model
  model.compile(optimizer=opt, loss='mse', metrics=['mae'])
  return model



**Step 5: Training the Model with Different Hyperparameters**

In [10]:
# Changed Hyperparameters
learning_rate = 0.001
optimizer = 'adam'
hidden_units = [64, 32]
batch_size = 32
epochs = 100
# Building and Training the model
model = build_model(learning_rate=learning_rate, optimizer=optimizer, hidden_units=hidden_units)

history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2, verbose=1)


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 85ms/step - loss: 201180144.0000 - mae: 12500.9307 - val_loss: 191383712.0000 - val_mae: 11873.8633
Epoch 2/100




[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - loss: 171732848.0000 - mae: 11305.9004 - val_loss: 162507328.0000 - val_mae: 10623.3066
Epoch 3/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - loss: 145873840.0000 - mae: 9969.8555 - val_loss: 137332432.0000 - val_mae: 9413.9824
Epoch 4/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - loss: 126335384.0000 - mae: 8897.0254 - val_loss: 113681024.0000 - val_mae: 8105.4351
Epoch 5/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - loss: 122342584.0000 - mae: 8123.4731 - val_loss: 91135856.0000 - val_mae: 6684.9258
Epoch 6/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - loss: 86623856.0000 - mae: 6616.4521 - val_loss: 72116000.0000 - val_mae: 5510.7358
Epoch 7/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - loss: 62371976.0000 - mae: 5506.7485 - val_loss: 57366656.0000 - va

**Step 6: Evaluating the Model**

In [11]:
# Evaluating the model on the test set
loss, mae = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Mae: {mae}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 48846868.0000 - mae: 4346.7129
Test Loss: 45746128.0, Test Mae: 4135.638671875




**Step 7: Getting the best model**

In [12]:
learning_rates = [0.001, 0.01, 0.0001]
optimizers = ['adam', 'sgd', 'rmsprop']
hidden_layers_configs = [[64, 32], [128, 64, 32], [256, 128, 64]]
batch_sizes = [16, 32, 64]
epochs = 50

# Track the best configuration
best_loss = float('inf')
best_config = None

# Experiment with all configurations
for lr in learning_rates:
    for opt in optimizers:
        for hidden_layers in hidden_layers_configs:
            for batch_size in batch_sizes:
                # Build and train the model
                model = build_model(learning_rate=lr, optimizer=opt, hidden_units=hidden_layers)
                history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=0)

                # Evaluate on test set
                loss, mae = model.evaluate(X_test, y_test, verbose=0)

                # Track the best model based on the lowest loss
                if loss < best_loss:
                    best_loss = loss
                    best_config = {
                        'learning_rate': lr,
                        'optimizer': opt,
                        'hidden_layers': hidden_layers,
                        'batch_size': batch_size,
                        'epochs': epochs,
                        'test_loss': loss,
                        'test_mae': mae
                    }

# Display the best configuration and its performance
best_config

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **

{'learning_rate': 0.01,
 'optimizer': 'rmsprop',
 'hidden_layers': [128, 64, 32],
 'batch_size': 16,
 'epochs': 50,
 'test_loss': 33393382.0,
 'test_mae': 4757.13818359375}

### Visualize your model's predictions

After you have a trained model, it may be helpful to understand how your model's inference differs from the actual data.

This helper function `scatter_plot_inference` does that for you. Real data is in grey. Your model's predictions are in orange.


In [None]:
from matplotlib import pyplot as plt


def scatter_plot_inference_grid(est, x_df, feature_names):
  """Plots the predictions of the model against each feature.

  Args:
    est: The trained tf.Estimator.
    x_df: The pandas dataframe with the input data (used to create
      predict_input_fn).
    feature_names: An iterable of string feature names to plot.
  """
  def scatter_plot_inference(axis,
                             x_axis_feature_name,
                             y_axis_feature_name,
                             predictions):
    """Generate one subplot."""
    # Plot the real data in grey.
    y_axis_feature_name = 'price'
    axis.set_ylabel(y_axis_feature_name)
    axis.set_xlabel(x_axis_feature_name)
    axis.scatter(car_data[x_axis_feature_name],
                 car_data[y_axis_feature_name],
                 c='grey')

    # Plot the predicted data in orange.
    axis.scatter(car_data[x_axis_feature_name], predictions, c='orange')

  predict_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=x_df,
    batch_size=batch_size,
    shuffle=False)

  predictions = [
    x['predictions'][0]
    for x in est.predict(predict_input_fn)
  ]

  num_cols = 3
  num_rows = int(math.ceil(len(feature_names)/float(num_cols)))
  f, axarr = plt.subplots(num_rows, num_cols)
  size = 4.5
  f.set_size_inches(num_cols*size, num_rows*size)

  for i, feature_name in enumerate(numeric_feature_names):
    axis = axarr[int(i/num_cols), i%num_cols]
    scatter_plot_inference(axis, feature_name, 'price', predictions)
  plt.show()

scatter_plot_inference_grid(est, x_df, numeric_feature_names)

## Task 2: Take your best numeric model from earlier. Add normalization.

### Add normalization to your best numeric model from earlier

- You decide what type of normalization to add, and for which features
- You will need to use the `normalizer_fn` arg on [`numeric_column`](https://g3doc.corp.google.com/learning/brain/public/g3doc/api_docs/python/tf/feature_column/numeric_column.md?cl=head)
    - An example of a silly normalizer_fn that shifts inputs down by 1, and then negates the value:
    
         normalizer_fn = lambda x: tf.neg(tf.subtract(x, 1))

- You may find these pandas functions helpful:
    - dataframe.mean()['your_feature_name']
    - dataframe.std()['your_feature_name']
- You will need to retune the hyperparameters from earlier.


**Does normalization improve model quality on this dataset? Why or why not?**

**Step 1:Calculating Mean and Standard Deviation to Each Numeric Feature**

In [13]:
# Calculating mean and standard deviation
feature_means = X_train.mean()
feature_stds = X_train.std()

**Step 2: Defining Normalized Numeric Columns**

In [14]:
# Creating normalized numeric columns using mean and standard deviation
def create_normalized_numeric_column(feature_name):
  mean = feature_means[feature_name]
  std = feature_stds[feature_name]
  return feature_column.numeric_column(
      key=feature_name,
      normalizer_fn=lambda x: (x - mean) / std
  )

# Applying normalization to individual numeric column
normalized_feature_columns = [create_normalized_numeric_column(col) for col in X.columns]

**Step 3: Building the Model with Normalized Features**

In [15]:
# Building normalized model
def build_normalized_model(learning_rate, optimizer, hidden_units, dropout_rate=0.2):
  input_layers = {
      col.key: keras.Input(shape=(1,), name=col.key)  # Assuming numeric columns with shape (1,)
      for col in feature_columns
  }

  feature_layer = layers.concatenate([input_layers[col.key] for col in feature_columns])
  #Building the model
  model = tf.keras.Sequential()
  model.add(layers.Dense(hidden_units[0], activation='relu', input_shape=(feature_layer.shape[1],)))
  for units in hidden_units:
    model.add(layers.Dense(units, activation='relu'))
    model.add(layers.Dropout(dropout_rate))
  model.add(layers.Dense(1)) #Output layer

  #Choosing optimizer
  if optimizer == 'adam':
    opt = Adam(learning_rate=learning_rate)
  elif optimizer == 'sgd':
    opt = SGD(learning_rate=learning_rate)
  elif optimizer == 'rmsprop':
    opt = RMSprop(learning_rate=learning_rate)
  else:
    opt = Adam(learning_rate=learning_rate) #Default

    # Compiling the model
  model.compile(optimizer=opt, loss='mse', metrics=['mae'])
  return model


**Step 4: Retuning With Best Numeric Model HyperParameter and Evaluating Models Performance**


In [16]:
learning_rate = 0.01
optimizer = 'adam'
hidden_layers =[256, 128, 64]
batch_size = 16
epochs = 50

# Building and Training the model
model = build_normalized_model(learning_rate=learning_rate, optimizer=optimizer, hidden_units=hidden_layers)
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2, verbose=1)

# Evaluating the model with normalized features
loss, mae = model.evaluate(X_test, y_test)
print(f"Test Loss with Normalization: {loss}, Test MAE with Normalization: {mae}")


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 113ms/step - loss: 109151696.0000 - mae: 8510.4863 - val_loss: 43905196.0000 - val_mae: 5470.3701
Epoch 2/50




[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 116ms/step - loss: 44888392.0000 - mae: 5137.2051 - val_loss: 41193808.0000 - val_mae: 5367.8584
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 94ms/step - loss: 31029988.0000 - mae: 4318.6162 - val_loss: 32485040.0000 - val_mae: 4353.0488
Epoch 4/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 83ms/step - loss: 30673604.0000 - mae: 3914.7625 - val_loss: 32510098.0000 - val_mae: 3391.1411
Epoch 5/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 86ms/step - loss: 17052636.0000 - mae: 3021.9109 - val_loss: 31541732.0000 - val_mae: 3539.3311
Epoch 6/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 83ms/step - loss: 21154184.0000 - mae: 3378.6064 - val_loss: 32798192.0000 - val_mae: 3266.4451
Epoch 7/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 95ms/step - loss: 21718452.0000 - mae: 3193.9246 - val_loss: 36339492.0000 - val_mae: 4169.9



**Analysis:**<br>
Based on the model's performance, it can be drawn out that, normalization can only improve the quality of model if feature scale varies widely or data contains any outliers. However, if the data is already scaled in the same way, there could be minimal effect made by normalization

In [None]:
# This 1D visualization of each numeric feature might inform your normalization
# decisions.
for feature_name in numeric_feature_names:
  car_data.hist(column=feature_name)

## Task 3: Make your best model using only categorical features

- Look at the possible feature columns for categorical features. They begin with `categorical_column_with_` in go/tf-ops.
- You may find `dataframe[categorical_feature_names].unique()` helpful.


**Step 1: Identifying Categorical Features**

In [17]:
categorical_feature_names = ['normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-doors', 'body-style', 'drive-wheels', 'engine-location', 'engine-type', 'num-cylinders', 'fuel-system']

# Checking for unique values
for feature in categorical_feature_names:
  print(f"{feature}: {data[feature].unique()}")

normalized-losses: ['?' '164' '158' '192' '188' '121' '98' '81' '118' '148' '110' '145' '137'
 '101' '78' '106' '85' '107' '104' '113' '129' '115' '93' '142' '161'
 '153' '125' '128' '122' '103' '168' '108' '194' '231' '119' '154' '74'
 '186' '150' '83' '102' '89' '87' '77' '91' '134' '65' '197' '90' '94'
 '256' '95']
make: ['alfa-romero' 'audi' 'bmw' 'chevrolet' 'dodge' 'honda' 'isuzu' 'jaguar'
 'mazda' 'mercedes-benz' 'mercury' 'mitsubishi' 'nissan' 'peugot'
 'plymouth' 'porsche' 'saab' 'subaru' 'toyota' 'volkswagen' 'volvo']
fuel-type: ['gas' 'diesel']
aspiration: ['std' 'turbo']
num-doors: ['two' 'four' '?']
body-style: ['convertible' 'hatchback' 'sedan' 'wagon' 'hardtop']
drive-wheels: ['rwd' 'fwd' '4wd']
engine-location: ['front' 'rear']
engine-type: ['dohc' 'ohcv' 'ohc' 'l' 'ohcf']
num-cylinders: ['four' 'six' 'five' 'three' 'twelve' 'eight']
fuel-system: ['mpfi' '2bbl' 'mfi' '1bbl' 'spfi' 'idi' 'spdi']


**Step 2: Creating Categorical Feature Columns

In [1]:
from tensorflow.feature_column import categorical_column_with_vocabulary_list, embedding_column, indicator_column

categorical_columns = []

# defining columns with one-hot encoding
for feature in categorical_feature_names:
  unique_values = data[feature].unique()
  cat_column = categorical_column_with_vocabulary_list(key=feature, vocabulary_list=unique_values)

  # Using embedding for features with multiple unique values
  if len(unique_values) > 10:
    cat_column = embedding_column(cat_column, dimension=min(len(unique_values)//2,8)) #size of dimension based on unique value
  else:
    cat_column = indicator_column(cat_column) #single hot encoding for lesser categories

  categorical_columns.append(cat_column)


NameError: name 'categorical_feature_names' is not defined

**Step 3: Building the Model with Categorical Features**

In [55]:
!pip install --upgrade tensorflow


Collecting tensorflow
  Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tensorboard<2.19,>=2.18 (from tensorflow)
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting keras>=3.5.0 (from tensorflow)
  Downloading keras-3.6.0-py3-none-any.whl.metadata (5.8 kB)
Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (615.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m615.3/615.3 MB[0m [31m869.6 kB/s[0m eta [36m0:00:00[0m
[?25hDownloading keras-3.6.0-py3-none-any.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorboard-2.18.0-py3-none-any.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m94.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorboard, keras, tensorflow
  

In [52]:
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam, SGD, RMSprop

def build_categorical_model(learning_rate, optimizer_name, hidden_units, dropout_rate=0.2):

  feature_layer = tf.keras.layers.DenseFeatures(categorical_columns)
  model = tf.keras.Sequential([feature_layer])

  for units in hidden_units:
    model.add(layers.Dense(units, activation='relu'))
    model.add(layers.Dropout(dropout_rate))


    model.add(layers.Dense(1))




    # Choose the optimizer
  if optimizer_name == 'adam':
      optimizer = Adam(learning_rate=learning_rate)
  elif optimizer_name == 'sgd':
      optimizer = SGD(learning_rate=learning_rate)
  elif optimizer_name == 'rmsprop':
      optimizer = RMSprop(learning_rate=learning_rate)
  else:
      raise ValueError(f"Unsupported optimizer name")

  model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
  return model

Step 4: Training the model with Categorical Features Only


In [54]:
# Hypermeters from best numeric model, and making some adjustments

learning_rate = 0.01
optimizer = 'adam'
hidden_layers = [256, 128, 64]
batch_size = 16
epochs = 50

# Building and Training the model
model = build_categorical_model(learning_rate=learning_rate, optimizer_name=optimizer, hidden_units=hidden_layers)
history = model.fit(X_train[categorical_feature_names], y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=1)

AttributeError: module 'keras._tf_keras.keras.layers' has no attribute 'DenseFeatures'

## Task 4: Using all the features, make the best model that you can make

With all the features combined, your model should perform better than your earlier models using numerical and categorical models alone. Tune your model until that is the case.

In [None]:
## Your code goes here