In [1]:
import pandas as pd
import numpy as np
cars = pd.read_csv('cars.csv')
test = pd.read_csv('test.csv')

In [2]:
print('Test columns:')
print(test.columns)
print()
print('Cars columns:')
print(cars.columns)

Test columns:
Index(['bodyType', 'brand', 'color', 'fuelType', 'modelDate', 'name',
       'numberOfDoors', 'productionDate', 'vehicleConfiguration',
       'vehicleTransmission', 'engineDisplacement', 'enginePower',
       'description', 'mileage', 'Комплектация', 'Привод', 'Руль', 'Состояние',
       'Владельцы', 'ПТС', 'Таможня', 'Владение', 'id'],
      dtype='object')

Cars columns:
Index(['manufacturer_name', 'model_name', 'transmission', 'color',
       'odometer_value', 'year_produced', 'engine_fuel', 'engine_has_gas',
       'engine_type', 'engine_capacity', 'body_type', 'has_warranty', 'state',
       'drivetrain', 'price_usd', 'is_exchangeable', 'location_region',
       'number_of_photos', 'up_counter', 'feature_0', 'feature_1', 'feature_2',
       'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7',
       'feature_8', 'feature_9', 'duration_listed'],
      dtype='object')


In [3]:
color_dict = {
    'чёрный': 'black',
    'белый': 'white',
    'серый': 'gray',
    'серебристый': 'silver',
    'синий': 'blue',
    'коричневый': 'gray',
    'красный': 'red',
    'зелёный': 'green',
    'бежевый': 'brown',
    'голубой': 'blue',
    'пурпурный': 'red',
    'золотистый': 'other',
    'фиолетовый': 'violet',
    'жёлтый': 'yellow',
    'оранжевый': 'orange',
    'розовый': 'other'
}
fuel_dict = {
    'gasoline': 'бензин',
    'diesel': 'дизель',
    'gas': 'газ',
    'hybrid-petrol': 'гибрид',
    'electric': 'электро',
    'hybrid-diesel': 'гибрид'
}
body_dict = {
    'внедорожник 5 дв.': 'suv',
    'седан': 'sedan',
    'лифтбек': 'liftback',
    'хэтчбек 5 дв.': 'hatchback',
    'универсал 5 дв.': 'universal',
    'минивэн': 'minivan',
    'купе': 'coupe',
    'компактвэн': 'minivan',
    'хэтчбек 3 дв.': 'hatchback',
    'пикап двойная кабина': 'pickup',
    'внедорожник 3 дв.': 'suv',
    'купе-хардтоп': 'coupe',
    'кабриолет': 'cabriolet',
    'фургон': 'van',
    'родстер': 'coupe',
    'микровэн': 'minivan',
    'седан-хардтоп': 'sedan',
    'пикап одинарная кабина': 'pickup',
    'пикап полуторная кабина': 'pickup',
    'внедорожник открытый': 'suv',
    'лимузин': 'limousine',
    'тарга': 'cabriolet',
    'седан 2 дв.': 'coupe'
}
transmission_dict = {
    'автоматическая': 'automatic',
    'механическая': 'mechanical',
    'роботизированная': 'automatic',
    'вариатор': 'automatic'
}
def calculate_capacity(hp):
    capacity = str(hp).split()[0]
    if capacity == 'undefined':
        return 0
    return int(float(capacity)//0.2)

cars['engine_fuel'] = cars['engine_fuel'].apply(lambda x:fuel_dict[x])
cars['manufacturer_name'] = cars['manufacturer_name'].apply(lambda x:x.upper())
test['color'] = test['color'].apply(lambda x:color_dict[x])
test['bodyType'] = test['bodyType'].apply(lambda x:body_dict[x])
test['vehicleTransmission'] = test['vehicleTransmission'].apply(lambda x:transmission_dict[x])
test['modelDate'] = test['modelDate'].apply(lambda x:int(x))
test['mileage'] = test['mileage'].apply(lambda x:int(x))
test['engineDisplacement'] = test['engineDisplacement'].apply(calculate_capacity).astype('str')

test = test.rename(columns={'bodyType': 'body_type',
                            'brand': 'manufacturer_name',
                            'mileage': 'odometer_value',
                            'vehicleTransmission': 'transmission',
                            'modelDate': 'year_produced',
                            'fuelType': 'engine_fuel',
                            'engineDisplacement': 'engine_capacity'
})


In [4]:
ITERATIONS = 1500
LR         = 0.1

from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
model = CatBoostRegressor(iterations = ITERATIONS,
                          learning_rate = LR,
                          random_seed = 42,
                          eval_metric='MAPE',
                          custom_metric=['R2', 'MAE']
                         )

In [5]:
X = cars[['body_type',
        'manufacturer_name',
        'transmission',
        'year_produced',
        'engine_fuel',
        'color',
        #'model_name',
        'engine_capacity',
        'odometer_value'
]]
y = cars['price_usd']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42)

In [7]:
model.fit(X_train, y_train,
         cat_features=[0,1,2,3,4,5,6],
         eval_set=(X_test, y_test),
         verbose_eval=100,
         use_best_model=True,
         plot=True
         )

CatBoostError: Invalid type for cat_feature[non-default value idx=0,feature_idx=6]=1.5 : cat_features must be integer or string, real number values and NaN values should be converted to string.

In [None]:
test_pred = test[['body_type',
        'manufacturer_name',
        'transmission',
        'year_produced',
        'engine_fuel',
        'color',
        #'model_name',
        'engine_capacity',
        'odometer_value'
]]
pred = model.predict(test_pred)

In [None]:
pd.Series(pred)

In [None]:
sample_submission = pd.DataFrame()
sample_submission['price'] = pred
sample_submission['price'] = sample_submission['price'].apply(lambda x:x*63,5)
sample_submission.head(10)

In [None]:
sample_submission = sample_submission.reset_index().rename(columns={'index': 'id'})

In [None]:
sample_submission.to_csv(f'submission_v02.csv', index=False)
sample_submission.head()

In [None]:
test_pred.head()

In [None]:
random_car = ['hatchback', 'HONDA', 'automatic', 2012, 'бензин', 'grey', 1.7, 126089]
price = model.predict(random_car)
price*65

In [None]:
test['engineDisplacement'].value_counts()