## Plan of attack:
1. Clean the data - take mean/median or delete null - try 2 iterations
2. Separate data into train & test sets
3. use linear regression / related model based based fit functions to check if anything makes sense
4. Levy PCA on data to reduce dimensionality
5. Perform trainings on different models from given data
6. select the best model & re-iterate for different parameters

In [1]:
import pandas as pd
import tensorflow.compat.v2.feature_column as fc
import tensorflow as tf

In [2]:
# Read df
car_excel = pd.read_csv('CarPrice_Assignment.csv')
# Replace missing values with mean values  
column_means = car_excel.mean() # TODO: solve type error
car_excel = car_excel.fillna(column_means)
car_excel.pop('car_ID') # ID Irrelevant to pricing 
car_excel.head()

  column_means = car_excel.mean() # TODO: solve type error


Unnamed: 0,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,...,130.0,mpfi,3.47,2.68,9.0,111,5000,21.0,27,13495.0
1,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,...,130.0,mpfi,3.47,2.68,9.0,111,5000,21.0,27,16500.0
2,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,...,152.0,mpfi,2.68,3.47,9.0,154,5000,19.0,26,16500.0
3,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,...,109.0,mpfi,3.19,3.4,10.0,102,5500,24.0,30,13950.0
4,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,...,136.0,mpfi,3.19,3.4,8.0,115,5500,18.0,22,17450.0


In [3]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(car_excel, test_size=0.15)
train.head()

Unnamed: 0,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
94,1,nissan leaf,gas,std,two,sedan,fwd,front,94.5,165.3,...,97.0,2bbl,3.15,3.29,9.4,69,5200,31.0,37,7299.0
174,-1,toyota celica gt,diesel,turbo,four,sedan,fwd,front,102.4,175.6,...,110.0,idi,3.27,3.35,22.5,73,4500,30.0,33,10698.0
23,1,dodge d200,gas,turbo,two,hatchback,fwd,front,99.00102,157.3,...,98.0,mpfi,3.03,3.39,7.6,102,5500,24.994924,30,7957.0
88,-1,mitsubishi mirage g4,gas,std,four,sedan,fwd,front,96.3,172.4,...,110.0,spdi,3.17,3.46,7.5,116,5500,23.0,30,9279.0
135,2,saab 99gle,gas,std,four,sedan,fwd,front,99.1,186.6,...,121.0,mpfi,3.54,3.07,9.3,110,5250,21.0,28,15510.0


In [4]:
y_train = train.pop('price')
y_test = test.pop('price')

# y_train.head()
train.head()

Unnamed: 0,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,...,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
94,1,nissan leaf,gas,std,two,sedan,fwd,front,94.5,165.3,...,four,97.0,2bbl,3.15,3.29,9.4,69,5200,31.0,37
174,-1,toyota celica gt,diesel,turbo,four,sedan,fwd,front,102.4,175.6,...,four,110.0,idi,3.27,3.35,22.5,73,4500,30.0,33
23,1,dodge d200,gas,turbo,two,hatchback,fwd,front,99.00102,157.3,...,four,98.0,mpfi,3.03,3.39,7.6,102,5500,24.994924,30
88,-1,mitsubishi mirage g4,gas,std,four,sedan,fwd,front,96.3,172.4,...,four,110.0,spdi,3.17,3.46,7.5,116,5500,23.0,30
135,2,saab 99gle,gas,std,four,sedan,fwd,front,99.1,186.6,...,four,121.0,mpfi,3.54,3.07,9.3,110,5250,21.0,28


In [6]:
# Separate train data into categorical & numerical

CATEGORICAL_COLUMNS = ['symboling','CarName','fueltype','aspiration','doornumber','carbody','drivewheel',
'enginelocation','enginetype','cylindernumber','fuelsystem']

NUMERIC_COLUMNS = ['wheelbase','fare', 'carlength','carwidth','carheight','curbweight','enginesize','boreratio','stroke','compressionratio','horsepower',
'peakrpm',	'citympg',	'highwaympg']

feature_columns = []
for feature_name in CATEGORICAL_COLUMNS:
    vocabulary = train[feature_name].unique()  # gets a list of all unique values from given feature column
    feature_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary))

for feature_name in NUMERIC_COLUMNS:
    feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.float32))

print(feature_columns)

[VocabularyListCategoricalColumn(key='symboling', vocabulary_list=(1, -1, 2, 3, 0, -2), dtype=tf.int64, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='CarName', vocabulary_list=('nissan leaf', 'toyota celica gt', 'dodge d200', 'mitsubishi mirage g4', 'saab 99gle', 'vw dasher', 'renault 12tl', 'chevrolet monte carlo', 'jaguar xf', 'nissan note', 'dodge challenger se', 'maxda rx3', 'volkswagen dasher', 'plymouth cricket', 'volvo 145e (sw)', 'audi 100ls', 'vokswagen rabbit', 'subaru dl', 'dodge monaco (sw)', 'vw rabbit', 'alfa-romero stelvio', 'plymouth valiant', 'subaru trezia', 'nissan clipper', 'peugeot 604sl', 'volkswagen super beetle', 'volkswagen 411 (sw)', 'volvo 144ea', 'mazda rx-7 gs', 'subaru r1', 'nissan fuga', 'peugeot 505s turbo diesel', 'toyota corolla', 'volvo diesel', 'plymouth fury iii', 'toyota celica gt liftback', 'honda civic', 'mazda rx-4', 'audi 5000s (diesel)', 'mazda 626', 'chevrolet impala', 'toyouta tercel', 'honda prelude', 'mazda glc

In [7]:
def make_input_fn(data_df, label_df, num_epochs=20, shuffle=True, batch_size=32):
    def input_function():  # inner function, this will be returned
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))  # create tf.data.Dataset object with data and its label
        if shuffle:
            ds = ds.shuffle(1000)  # randomize order of data
        ds = ds.batch(batch_size).repeat(num_epochs)  # split dataset into batches of 32 and repeat process for number of epochs
        return ds  # return a batch of the dataset
    return input_function  # return a function object for use

train_input_fn = make_input_fn(train, y_train)  # here we will call the input_function that was returned to us to get a dataset object we can feed to the model
eval_input_fn = make_input_fn(test, y_test, num_epochs=1, shuffle=False)

In [8]:
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)
# Create a linear estimtor by passing the feature columns we created earlier
linear_est.train(input_fn=train_input_fn)
result = linear_est.evaluate(eval_input_fn)

clear_output()
print(result['accuracy'])

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\Batman\\AppData\\Local\\Temp\\tmpguou926r', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
Use Variable.read_

  self.bias = self.add_variable(


ValueError: in user code:

    File "D:\ProgramData\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\canned\linear.py", line 1668, in call  *
        return self.layer(features)
    File "D:\ProgramData\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\canned\linear.py", line 1496, in call  *
        weighted_sum = fc_v2._create_weighted_sum(  # pylint: disable=protected-access

    ValueError: Feature fare is not in features dictionary.
