In [1]:
import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the available datasets are visible at the top of the notebook.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

/kaggle/input/playground-series-s4e9/sample_submission.csv
/kaggle/input/playground-series-s4e9/train.csv
/kaggle/input/playground-series-s4e9/test.csv


In [2]:
!pip install -q xgboost
print('done')

done


In [3]:
%%time
import numpy as np, pandas as pd, xgboost as xgb
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

CPU times: user 7.5 s, sys: 1.24 s, total: 8.73 s
Wall time: 17.7 s


In [4]:
# Read the competition's train and test CSVs into DataFrames (train first,
# then test) and preview the first rows of the training set.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e9/train.csv',
        '/kaggle/input/playground-series-s4e9/test.csv',
    )
)
train.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [5]:
# Preview the first five rows of the test set (same columns as train, minus `price`).
test.head(n=5)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,188534,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,188535,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,
3,188536,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes


In [6]:
# Report (rows, columns) for train and test, one per line.
shape_report = f"{train.shape}\n{test.shape}"
print(shape_report)

(188533, 13)
(125690, 12)


In [7]:
# Per-column data-quality summary for the features shared by train and test:
# number of missing values and number of distinct values in each split.
#
# Columns are selected by name from `test.columns`, so the train and test
# figures are aligned by label rather than by relying on `price` being the
# only extra train column and on both frames sharing the same column order.
# `nunique(dropna=False)` counts NaN as a single distinct value; the previous
# `len(set(series))` approach can count every NaN separately in float columns
# because NaN never compares equal to itself.
feature_cols = test.columns.to_list()
info = pd.DataFrame(
    {
        'train_nans': train[feature_cols].isna().sum(),
        'test_nans': test[feature_cols].isna().sum(),
        'diff_train': train[feature_cols].nunique(dropna=False),
        'diff_test': test[feature_cols].nunique(dropna=False),
    },
    index=feature_cols,
)
info

Unnamed: 0,train_nans,test_nans,diff_train,diff_test
id,0,0,188533,125690
brand,0,0,57,55
model,0,0,1897,1891
model_year,0,0,34,36
milage,0,0,6651,5700
fuel_type,5083,3383,8,8
engine,0,0,1117,1117
transmission,0,0,52,52
ext_col,0,0,319,317
int_col,0,0,156,156


In [8]:
# Remove the high-cardinality text columns (`model`, `engine`) from both
# splits, and the row identifier from the training frame. `test` keeps its
# `id` column because it is needed later to build the submission file.
drop_cols = ['model', 'engine']
train = train.drop(columns=drop_cols + ['id'])
test = test.drop(columns=drop_cols)

# Build the test feature matrix by dropping `id` by name instead of by
# position (`iloc[:, 1:]`), so the result does not depend on `id` being the
# first column. `drop` returns a new DataFrame rather than a slice of `test`,
# so later column assignments on X_test neither modify `test` nor raise
# pandas' chained-assignment warning.
X_test = test.drop(columns='id')
print('done')

done


In [9]:
# Integer-encode the categorical columns.
#
# A single encoder per column is fitted on the values of BOTH splits and then
# applied to each, so a given category always maps to the same integer in
# train and in test. Fitting independent encoders on each split (as before)
# assigns codes from each split's own sorted category list; whenever the two
# splits contain different category sets (e.g. 57 brands in train vs 55 in
# test) the same category gets different codes and the model's learned
# mapping is meaningless on the test data.
enc_cols = ['brand', 'fuel_type', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']
for col in enc_cols:
    # Only the feature values are pooled here; the target is never involved.
    encoder = LabelEncoder().fit(pd.concat([train[col], X_test[col]], ignore_index=True))
    train[col] = encoder.transform(train[col])
    X_test[col] = encoder.transform(X_test[col])
print('done')

done


In [10]:
# Standardise the numeric columns.
#
# The scaler is fitted on the training data only and the SAME mean/std are
# then applied to the test data. Fitting a second scaler on the test split
# (as before) standardises each split with its own statistics, so an
# identical raw value (e.g. the same mileage) ends up as a different scaled
# value in train and test.
num_cols = ['model_year', 'milage']
scaler = StandardScaler().fit(train[num_cols])
train[num_cols] = scaler.transform(train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])
print('done')

done


In [11]:
# Instantiate the three classical regressors used in the ensemble.
# A fixed seed is passed to the stochastic estimators so that re-running the
# notebook produces the same fitted models and the same submission.
SEED = 42
reg = LinearRegression()  # deterministic; no seed parameter
gb = xgb.XGBRegressor(random_state=SEED)
forest = RandomForestRegressor(random_state=SEED)
print('done')

done


In [12]:
# Feed-forward regression network: three ReLU hidden layers (256 -> 128 -> 64)
# with 50% dropout after the first two, ending in a single linear output unit.
layer_stack = [
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(1),
]
net = keras.Sequential(layer_stack)

# Train with Adam on mean squared error; MSE is also reported as a metric.
net.compile(loss='mse', optimizer='adam', metrics=['mse'])
print('done')

done


In [15]:
# Fit each classical regressor on the same data: every column except the last
# is a feature, and the last column of `train` is the target.
features, target = train.iloc[:, :-1], train.iloc[:, -1]
for estimator in (reg, gb, forest):
    estimator.fit(features, target)
print('done')

done


In [16]:
# Train the neural network on the same feature/target split as the other
# models (all columns but the last -> last column), silently, for 50 epochs
# with mini-batches of 64 rows.
net.fit(
    x=train.iloc[:, :-1],
    y=train.iloc[:, -1],
    batch_size=64,
    epochs=50,
    verbose=0,
)
print('done')

done


In [17]:
# Predict on the test features with every fitted model, in the order
# linear regression, XGBoost, random forest, then the neural network.
preds1, preds2, preds3 = (
    estimator.predict(X_test) for estimator in (reg, gb, forest)
)
preds4 = net.predict(X_test)
print('done')

[1m3928/3928[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step
done


In [19]:
# Equal-weight ensemble: average the four models' predictions. The network's
# output is flattened to 1-D first so it lines up with the other three.
net_preds_1d = preds4.flatten()
prediction_total = preds1 + preds2 + preds3 + net_preds_1d
preds = prediction_total / 4
print('done')

done


In [20]:
# Assemble the submission table (test ids alongside the ensembled price
# predictions) and write it to disk without the DataFrame index.
submission_columns = {'id': test['id'], 'price': preds}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
print('done')

done
