<a href="https://colab.research.google.com/github/XavierCarrera/neural-network/blob/main/Used_Car_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Exploration

In [3]:
import pandas as pd
import numpy as np

In [4]:
cars = pd.read_csv('/content/drive/My Drive/Colab Notebooks/db/craiglist_cars-1_3c787db8-43e4-4b25-811b-b6cbe93cb9bf.csv')
cars.head(3)

Unnamed: 0,year,manufacturer,condition,cylinders,fuel,title_status,transmission,drive,size,type,paint_color,price
0,2009,chevrolet,good,8 cylinders,gas,clean,automatic,rwd,full-size,SUV,white,9000
1,2002,gmc,good,8 cylinders,gas,clean,automatic,4wd,,pickup,white,6000
2,2007,pontiac,excellent,4 cylinders,gas,clean,automatic,fwd,compact,convertible,red,7000


In [5]:
print(cars.shape)
100*cars.isnull().sum()/cars.shape[0]

(434542, 12)


year             0.000000
manufacturer     3.646138
condition       36.860649
cylinders       28.306355
fuel             0.839505
title_status     0.662997
transmission     0.989087
drive           13.280189
size            58.133621
type            11.566661
paint_color     19.734571
price            0.000000
dtype: float64

In [6]:
types = pd.DataFrame(cars.dtypes)
print(types.groupby(0).size())

0
int64      2
object    10
dtype: int64


In [7]:
categorical = types.index[types[0] == 'O'].values
for line in categorical:
 print("Variable "+ line +" contains:",str(len(cars[line].unique()))+" distinct values")

Variable manufacturer contains: 43 distinct values
Variable condition contains: 7 distinct values
Variable cylinders contains: 9 distinct values
Variable fuel contains: 6 distinct values
Variable title_status contains: 7 distinct values
Variable transmission contains: 4 distinct values
Variable drive contains: 4 distinct values
Variable size contains: 5 distinct values
Variable type contains: 14 distinct values
Variable paint_color contains: 13 distinct values


# Data Engineering

In [8]:
cars["fuel"] = cars["fuel"].fillna(cars["fuel"].mode()[0])

In [9]:
print(cars.shape)
100*cars.isnull().sum()/cars.shape[0]

(434542, 12)


year             0.000000
manufacturer     3.646138
condition       36.860649
cylinders       28.306355
fuel             0.000000
title_status     0.662997
transmission     0.989087
drive           13.280189
size            58.133621
type            11.566661
paint_color     19.734571
price            0.000000
dtype: float64

In [10]:
cars["title_status"] = cars["title_status"].fillna(cars["title_status"].mode()[0])
cars["transmission"] = cars["transmission"].fillna(cars["transmission"].mode()[0])
cars["manufacturer"] = cars["manufacturer"].fillna(cars["manufacturer"].mode()[0])

100*cars.isnull().sum()/cars.shape[0]

year             0.000000
manufacturer     0.000000
condition       36.860649
cylinders       28.306355
fuel             0.000000
title_status     0.000000
transmission     0.000000
drive           13.280189
size            58.133621
type            11.566661
paint_color     19.734571
price            0.000000
dtype: float64

In [12]:
df= cars.copy()
for col in categorical:
    df = pd.concat([df, (pd.get_dummies(df[col])).astype(int)], axis=1)
    df.drop(columns=[col],inplace=True)

df.drop('other', axis=1, inplace=True)
print(df.shape)
df.head(3)

(434542, 100)


Unnamed: 0,year,price,acura,alfa-romeo,aston-martin,audi,bmw,buick,cadillac,chevrolet,chrysler,datsun,dodge,ferrari,fiat,ford,gmc,harley-davidson,hennessey,honda,hyundai,infiniti,jaguar,jeep,kia,land rover,lexus,lincoln,mazda,mercedes-benz,mercury,mini,mitsubishi,morgan,nissan,pontiac,porche,ram,rover,saturn,...,hybrid,clean,lien,missing,parts only,rebuilt,salvage,automatic,manual,4wd,fwd,rwd,compact,full-size,mid-size,sub-compact,SUV,bus,convertible,coupe,hatchback,mini-van,offroad,pickup,sedan,truck,van,wagon,black,blue,brown,custom,green,grey,orange,purple,red,silver,white,yellow
0,2009,9000,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,2002,6000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,2007,7000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [13]:
types = pd.DataFrame(df.dtypes)
print("Tipos de variables",types.groupby(0).size())

Tipos de variables 0
int64    100
dtype: int64


In [14]:
numeric_columns = list(set(types.index[types[0] =="int64"].values) - set(["price"]))
consolidated_variables = df[numeric_columns]
target = df["price"] 

consolidated_variables.head(3)

Unnamed: 0,hennessey,6 cylinders,harley-davidson,pickup,truck,sub-compact,jaguar,infiniti,gas,good,offroad,lincoln,5 cylinders,SUV,manual,lien,datsun,mitsubishi,white,rover,red,fwd,kia,nissan,volvo,compact,hyundai,orange,mid-size,salvage,salvage.1,buick,chrysler,coupe,blue,yellow,clean,grey,purple,hybrid,...,lexus,full-size,wagon,mazda,like new,fair,ford,gmc,volkswagen,toyota,new,ram,convertible,aston-martin,morgan,bus,year,pontiac,subaru,missing,custom,audi,mini,4 cylinders,green,chevrolet,cadillac,fiat,saturn,3 cylinders,land rover,automatic,electric,brown,mini-van,mercury,alfa-romeo,rebuilt,excellent,silver
0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2009,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2002,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2007,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0


# Model Training

In [23]:
from sklearn.model_selection import train_test_split
x_train,x_test, y_train,y_test = train_test_split(consolidated_variables,target,test_size=0.2, random_state=2020)
x_train,x_val, y_train,y_val = train_test_split(x_train,y_train,test_size=0.1, random_state=2020)

In [24]:
y_train = y_train.values.reshape(-1,1)
y_test = y_test.values.reshape(-1,1)
y_val = y_val.values.reshape(-1,1)

In [25]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

scaler1 = StandardScaler()
scaler1.fit(y_train)
y_train_scaled = scaler1.transform(y_train)
y_val_scaled = scaler1.transform(y_val)
y_test_scaled = scaler1.transform(y_test)

In [26]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import plot_model

model = Sequential()
model.add(Dense(256,input_dim = x_train.shape[1],activation="relu"))
model.add(Dense(128,activation="relu"))
model.add(Dense(128,activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(1,activation = "linear")) 
model.compile(optimizer = "adam",loss="mse",metrics=["mean_absolute_error"])

print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 256)               25600     
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 129       
Total params: 75,137
Trainable params: 75,137
Non-trainable params: 0
_________________________________________________________________
None


In [27]:
model.fit(x_train_scaled,y_train_scaled, validation_data = (x_val_scaled,y_val_scaled),epochs=50, batch_size=1024)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f49fa460630>

# Model Evaluation

In [36]:
result = model.evaluate(x_test_scaled,y_test_scaled)
for i in range(len(model.metrics_names)):
 print("Metric ",model.metrics_names[i],":",
str(round(result[i],2)))

Metric  loss : 0.38
Metric  mean_absolute_error : 0.39


In [37]:
real=pd.DataFrame(y_train)
predic=model.predict(pd.DataFrame(x_train_scaled))
resc_values = scaler1.inverse_transform(predic)
pred_escal =pd.DataFrame(resc_values)

for i in range(0,5):
	print("Real=%s, Prediction=%s" % (real[0][i], pred_escal[0][i]))

Real=18650, Prediction=15583.775
Real=9950, Prediction=8940.825
Real=2000, Prediction=2274.2212
Real=7999, Prediction=5148.329
Real=23999, Prediction=23864.508
