In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.preprocessing import LabelEncoder

In [3]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from keras import Sequential
from keras.layers import Dense, BatchNormalization, Dropout, Activation

In [6]:
from keras.optimizers import Adam

In [7]:
from keras.callbacks import EarlyStopping

In [8]:
from keras.regularizers import l1, l2, l1_l2

In [9]:
train = pd.read_csv('train.csv')

In [10]:
test = pd.read_csv('test.csv')

In [11]:
train.head()

Unnamed: 0,model,year,motor_type,running,wheel,color,type,status,motor_volume,price
0,toyota,2022,petrol,3000 km,left,skyblue,sedan,excellent,2.0,24500
1,mercedes-benz,2014,petrol,132000 km,left,black,sedan,excellent,2.0,25500
2,kia,2018,petrol,95000 miles,left,other,sedan,excellent,2.0,11700
3,mercedes-benz,2002,petrol,137000 miles,left,golden,sedan,excellent,3.2,12000
4,mercedes-benz,2017,petrol,130000 km,left,black,sedan,good,2.0,26000


In [12]:
test.head()

Unnamed: 0,Id,model,year,motor_type,running,wheel,color,type,status,motor_volume
0,0,kia,2020,petrol,24000 km,left,black,sedan,excellent,2.0
1,1,nissan,2017,petrol,85000 miles,left,white,suv,excellent,2.0
2,2,hyundai,2021,petrol,30000 miles,left,white,sedan,excellent,2.0
3,3,kia,2018,petrol,53000 miles,left,silver,sedan,excellent,2.0
4,4,mercedes-benz,2003,petrol,230000 km,left,black,Universal,normal,1.8


In [13]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1642 entries, 0 to 1641
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         1642 non-null   object 
 1   year          1642 non-null   int64  
 2   motor_type    1642 non-null   object 
 3   running       1642 non-null   object 
 4   wheel         1642 non-null   object 
 5   color         1642 non-null   object 
 6   type          1642 non-null   object 
 7   status        1642 non-null   object 
 8   motor_volume  1642 non-null   float64
 9   price         1642 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 128.4+ KB


In [14]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 411 entries, 0 to 410
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Id            411 non-null    int64  
 1   model         411 non-null    object 
 2   year          411 non-null    int64  
 3   motor_type    411 non-null    object 
 4   running       411 non-null    object 
 5   wheel         411 non-null    object 
 6   color         411 non-null    object 
 7   type          411 non-null    object 
 8   status        411 non-null    object 
 9   motor_volume  411 non-null    float64
dtypes: float64(1), int64(2), object(7)
memory usage: 32.2+ KB


In [15]:
test = test.drop('Id', axis = 1)

In [16]:
for i in ['model', 'motor_type', 'wheel', 'color', 'type', 'status']:
    print(i, set(test[i].unique()).difference(set(train[i].unique())))

model set()
motor_type set()
wheel {'right'}
color set()
type set()
status set()


In [17]:
train = train.drop('wheel', axis = 1)
test = test.drop('wheel', axis = 1)

In [18]:
label_encoders = {}

In [19]:
for i in ['model', 'motor_type', 'color', 'type', 'status']:
    label = LabelEncoder()
    train[i] = label.fit_transform(train[i])
    test[i] = label.transform(test[i])
    label_encoders[i] = label

In [20]:
train.running = train.running.apply(lambda p: float(p.split(' ')[0]) if p.split(' ')[-1] == 'km' else 1.6 * float(p.split(' ')[0]))
test.running = test.running.apply(lambda p: float(p.split(' ')[0]) if p.split(' ')[-1] == 'km' else 1.6 * float(p.split(' ')[0]))

In [21]:
scaler = StandardScaler()

In [22]:
X = train.drop('price', axis = 1)

In [23]:
X

Unnamed: 0,model,year,motor_type,running,color,type,status,motor_volume
0,4,2022,3,3000.0,15,5,1,2.0
1,2,2014,3,132000.0,1,5,1,2.0
2,1,2018,3,152000.0,10,5,1,2.0
3,2,2002,3,219200.0,6,5,1,3.2
4,2,2017,3,130000.0,1,5,2,2.0
...,...,...,...,...,...,...,...,...
1637,0,2017,3,192000.0,16,5,2,2.0
1638,4,2014,3,170000.0,1,5,2,2.0
1639,3,2018,3,110240.0,2,6,2,2.0
1640,3,2019,3,49600.0,1,6,1,2.0


In [24]:
y = train.price

In [25]:
y

0       24500
1       25500
2       11700
3       12000
4       26000
        ...  
1637    12400
1638    16500
1639    19500
1640    19500
1641    28500
Name: price, Length: 1642, dtype: int64

In [26]:
Z = test

In [27]:
Z

Unnamed: 0,model,year,motor_type,running,color,type,status,motor_volume
0,1,2020,3,24000.0,1,5,1,2.0
1,3,2017,3,136000.0,16,6,1,2.0
2,0,2021,3,48000.0,16,5,1,2.0
3,1,2018,3,84800.0,14,5,1,2.0
4,2,2003,3,230000.0,1,1,4,1.8
...,...,...,...,...,...,...,...,...
406,3,2021,3,33400.0,1,6,1,2.0
407,0,2017,3,96000.0,10,5,1,2.0
408,2,2012,3,218000.0,16,5,2,2.0
409,1,2020,3,64000.0,13,5,2,2.0


In [28]:
X = scaler.fit_transform(X)
Z = scaler.transform(Z)

In [29]:
X

array([[ 1.52639053,  1.09243073,  0.26699337, ..., -0.03413756,
        -0.45353122, -0.1384163 ],
       [ 0.05769246, -0.12234706,  0.26699337, ..., -0.03413756,
        -0.45353122, -0.1384163 ],
       [-0.67665657,  0.48504183,  0.26699337, ..., -0.03413756,
        -0.45353122, -0.1384163 ],
       ...,
       [ 0.79204149,  0.48504183,  0.26699337, ...,  1.08693988,
         0.97583205, -0.1384163 ],
       [ 0.79204149,  0.63688906,  0.26699337, ...,  1.08693988,
        -0.45353122, -0.1384163 ],
       [ 1.52639053,  1.09243073,  0.26699337, ..., -0.03413756,
        -0.45353122, -0.1384163 ]])

In [30]:
Z

array([[-0.67665657,  0.78873628,  0.26699337, ..., -0.03413756,
        -0.45353122, -0.1384163 ],
       [ 0.79204149,  0.33319461,  0.26699337, ...,  1.08693988,
        -0.45353122, -0.1384163 ],
       [-1.4110056 ,  0.9405835 ,  0.26699337, ..., -0.03413756,
        -0.45353122, -0.1384163 ],
       ...,
       [ 0.05769246, -0.42604151,  0.26699337, ..., -0.03413756,
         0.97583205, -0.1384163 ],
       [-0.67665657,  0.78873628,  0.26699337, ..., -0.03413756,
         0.97583205, -0.1384163 ],
       [-1.4110056 ,  0.33319461,  0.26699337, ..., -0.03413756,
        -0.45353122, -0.1384163 ]])

In [31]:
X.shape

(1642, 8)

In [32]:
Z.shape

(411, 8)

In [33]:
X = X.reshape(-1, 1, 8)
Z = Z.reshape(-1, 1, 8)

In [34]:
X.shape

(1642, 1, 8)

In [35]:
Z.shape

(411, 1, 8)

In [67]:
early_stop = EarlyStopping(monitor = 'val_loss', patience = 100, restore_best_weights=True)

In [68]:
model = Sequential()

model.add(Dense(units = 256, kernel_regularizer = l1_l2(0.005), activation = 'relu', input_shape = (1, 8)))
model.add(Dense(units = 128, kernel_regularizer = l1_l2(0.005), activation = 'relu'))
model.add(Dense(units = 64, kernel_regularizer = l1_l2(0.005), activation = 'relu'))
model.add(Dense(units = 32, kernel_regularizer = l1_l2(0.005), activation = 'relu'))

model.add(Dense(units = 1, activation = 'linear'))

In [69]:
model.compile(optimizer=Adam(learning_rate=0.0005), loss='mae', metrics = ['mae'])

In [70]:
model.fit(X, y, validation_split=0.2, epochs = 1000, batch_size=32, callbacks=[early_stop])

Epoch 1/1000
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 15917.2949 - mae: 15902.3848 - val_loss: 16580.8164 - val_mae: 16568.3027
Epoch 2/1000
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 15966.5908 - mae: 15954.5000 - val_loss: 16529.4180 - val_mae: 16517.6680
Epoch 3/1000
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 15557.4121 - mae: 15545.2461 - val_loss: 16073.5840 - val_mae: 16059.1934
Epoch 4/1000
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 14940.4023 - mae: 14924.7998 - val_loss: 13841.4785 - val_mae: 13821.2930
Epoch 5/1000
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 12029.5840 - mae: 12007.6562 - val_loss: 7421.1665 - val_mae: 7394.0684
Epoch 6/1000
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 5814.4648 - mae: 5785.9858 - val_loss: 3613.5095 - val_mae: 3583.590

<keras.src.callbacks.history.History at 0x285d9acaac0>

In [188]:
preds = model.predict(Z)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 


In [189]:
sub = pd.read_csv('sample_submission.csv')

In [190]:
sub.head()

Unnamed: 0,Id,price
0,0,38
1,1,20549
2,2,3927
3,3,5572
4,4,28598


In [191]:
preds = preds.reshape(411, )

In [192]:
preds.shape

(411,)

In [193]:
sub['price'] = preds

In [194]:
sub.head()

Unnamed: 0,Id,price
0,0,18020.107422
1,1,16292.84375
2,2,24402.878906
3,3,15501.880859
4,4,9931.416016


In [195]:
sub.to_csv('sub_.csv', index=False, index_label=False)