# Data

In [25]:
import pandas as pd
import numpy as np

from sklearn import neighbors
from catboost import CatBoostRegressor
from sklearn import tree                             

from sklearn.model_selection import train_test_split

In [2]:
%store -r df

preX = pd.concat(
[df[['brand','sell_id','price','engineDisplacement', 'enginePower','mileage','car_age', 'model_product_time']] ,
pd.get_dummies(df.bodyType,prefix='bodyType'),
pd.get_dummies(df.fuelType,prefix='fuelType'),
pd.get_dummies(df.vehicleTransmission,prefix='vehicleTransmission'),
pd.get_dummies(df.Привод,prefix='Привод'),
pd.get_dummies(df.Владельцы,prefix='Владельцы'),
pd.get_dummies(df.Владение,prefix='Владение'),
pd.get_dummies(df.color,prefix='color'),
pd.get_dummies(df.ПТС,prefix='ПТС'),
pd.get_dummies(df.Руль,prefix='Руль'),
pd.get_dummies(df.numberOfDoors,prefix='numberOfDoors'),],axis=1)

preX.sample(5)

test = preX[preX.price==-1]
train = preX[preX.price!=-1]

X=train.drop(['brand','sell_id','price'],axis='columns')
y=train.price

### Метрика
$$\Large MAPE= 100 \% * \frac{1}{n}\sum_{t=1}^{n}\frac{\left | Y_t-\hat{Y_t} \right |}{Y_t}$$

In [3]:
'''Функция вывода MAPE'''
def mape(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Parameters
    ----------
    y_true : array-like
        Actual target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``100 * mean(|y_pred - y_true| / y_true)``.

    Notes
    -----
    Values are compared position by position. A zero in ``y_true`` makes the
    corresponding term infinite (or NaN), as with any percentage error.
    """
    # Convert to plain arrays first: subtracting two pandas Series aligns them
    # by index label, so inputs with different indexes (e.g. a shuffled split
    # versus freshly built predictions) would silently produce NaN.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((predicted - actual) / actual)) * 100

### Сравнение KNN, Decision Tree и CatBoost

In [4]:
# Hold out 20% of the labelled rows for validation; rows are shuffled with a
# fixed seed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

In [29]:
### k-nearest neighbours regression
# Baseline fitted on the raw price using 3 neighbours.
# NOTE(review): the features are not scaled before the distance computation.
knn = neighbors.KNeighborsRegressor(n_neighbors=3).fit(X_train, y_train)
print('MAPE:', mape(y_true=y_test, y_pred=knn.predict(X_test)))

42.47134987692746


In [5]:
### CatBoost regression
# The model is fitted on log(price), so its predictions are mapped back with
# exp before being scored against the raw hold-out prices.
model = CatBoostRegressor(iterations=10000, random_seed=42, silent=True)
model.fit(X_train.values, np.log(y_train.values))

print('MAPE:', mape(y_true=y_test, y_pred=np.exp(model.predict(X_test.values))))

MAPE: 15.51307854541574


In [6]:
### Decision Tree regression
# random_state is pinned (same seed as the split and CatBoost cells): scikit-learn
# permutes the candidate features at every split, so without a seed ties between
# equally good splits can be broken differently from run to run.
# NOTE(review): this tree is fitted on the raw price while the CatBoost cell fits
# log(price) — keep that in mind when comparing the two MAPE values.
model = tree.DecisionTreeRegressor(max_depth=16, max_features=None, random_state=42)
model.fit(X_train, y_train)

print('MAPE:', mape(y_test, model.predict(X_test)))

MAPE: 19.353506235490155


CatBoost показал себя лучше простого решающего дерева, можно сделать первый коммит

In [7]:
'''SUBMISSION_1'''
# Refit CatBoost on every labelled row, again with a log(price) target.
model = CatBoostRegressor(iterations=10000, random_seed=42, silent=True)
model.fit(X.values, np.log(y.values))

# Predictions are mapped back from log space and truncated to integers.
# NOTE(review): `price` gets a default 0..n-1 index and pd.concat(axis=1) pairs
# it with test.sell_id by index label, so this presumably relies on `test`
# being indexed 0..n-1 as well — confirm.
price = pd.Series(
    np.exp(model.predict(test.drop(columns=['brand', 'sell_id', 'price']).values)),
    name='price',
).astype('int')
pd.concat([test.sell_id, price], axis=1).to_csv('submission_1.csv', index=False)
print('KAGGLE score: 15.81082')

KAGGLE score: 15.81082


# Далее модели обучались сразу для предсказаний на Kaggle

In [8]:
'''                 Подготовка датасетов по маркам                          '''
###############################################################################
# Brands that get their own decision tree.
BRANDS = ['SKODA', 'AUDI', 'HONDA', 'VOLVO', 'BMW', 'NISSAN', 'INFINITI',
          'MERCEDES', 'TOYOTA', 'LEXUS', 'VOLKSWAGEN', 'MITSUBISHI']

# One training frame per brand: features plus `price`, without the brand label
# and the listing id.
brand_train = {
    brand: (train[train.brand == brand]
            .drop(['brand', 'sell_id'], axis='columns')
            .reset_index(drop=True))
    for brand in BRANDS
}


'''                 Обучение деревьев по бренду                             '''
###############################################################################
def fit_brand_tree(brand_frame):
    """Fit an unrestricted-depth decision tree on log(price) for one brand.

    Parameters
    ----------
    brand_frame : pandas.DataFrame
        Training rows of a single brand; must contain a ``price`` column, all
        remaining columns are used as features.

    Returns
    -------
    sklearn.tree.DecisionTreeRegressor
        The fitted tree; its predictions are in log-price space.
    """
    # Seeded so that ties between equally good splits are broken the same way
    # on every run (same seed as the other models in this notebook).
    brand_tree = tree.DecisionTreeRegressor(
        max_depth=None, max_features=None, random_state=42)
    brand_tree.fit(brand_frame.drop(['price'], axis='columns'),
                   np.log(brand_frame.price))
    return brand_tree


brand_models = {brand: fit_brand_tree(frame) for brand, frame in brand_train.items()}


'''             Предсказания деревьев по бренду                             '''
###############################################################################
for_pred = test.drop(['brand','sell_id','price'],axis='columns')

# Brand of every test row, taken positionally so the result does not depend on
# how `test` happens to be indexed.
test_brands = test.brand.values

# A row of an unlisted brand would otherwise be skipped silently and leave the
# prediction vector shorter than `test`.
unknown_brands = [b for b in pd.unique(test_brands) if b not in brand_models]
if unknown_brands:
    raise ValueError(f'No per-brand model for test brands: {unknown_brands}')

# Log-price prediction for each test row, in the row order of `test`.
preds = np.empty(len(test), dtype=float)
for brand, brand_tree in brand_models.items():
    brand_rows = test_brands == brand
    if brand_rows.any():  # predict() rejects an empty feature matrix
        preds[brand_rows] = brand_tree.predict(for_pred.loc[brand_rows])

big_pred = np.exp(preds) # array of per-brand tree predictions, back in price space

Как совмещать предсказания?
- Арифметическим средним: два или более предсказаний складываются, после чего сумма делится на их количество.

In [9]:
# Nested averaging weights the three values 1/4, 1/4 and 1/2; the flat mean weights each 1/3.
( ((12+7)/2 )+3)/2, (12+7+3)/3 # example of regulating how much each prediction contributes

(6.25, 7.333333333333333)

In [10]:
'''SUBMISSION 2 - совмещение CatBoost и Decision Trees'''
# Equal-weight average of the CatBoost prices and the per-brand tree prices.
# Unlike the later submissions, this blend is written without an integer cast.
pd.concat([test.sell_id, (price + big_pred) / 2], axis=1).to_csv(
    'submission_2.csv', index=False)
print('KAGGLE score: 12.82608')

KAGGLE score: 12.82608


### Время появиться lil_trees.
Он обучался отдельно на моделях авто

In [11]:
# Predictions of the separately trained lil_trees model, read from a local pickle.
# NOTE(review): unpickling can execute arbitrary code — only load files you produced yourself.
lil_pred = pd.read_pickle('lil_pred.pickle') # these are its predictions

In [12]:
# The larger models fill in for the small one on cars whose model is missing
# from its data: those entries are overwritten (in place) with the mean of the
# per-brand tree and CatBoost predictions.
# The compared literal equals exp(-1) — presumably a -1 placeholder that went
# through exp in the lil_trees notebook; TODO confirm.
# NOTE(review): the index label is also used as a position into `big_pred`,
# which assumes `lil_pred` is indexed 0..n-1.
for row_label in lil_pred.loc[lil_pred == 0.36787944117144233].index:
    lil_pred.at[row_label] = (big_pred[row_label] + price[row_label]) / 2

In [13]:
'''SUBMISSION 3'''
# Equal-weight mean of all three predictors (1/3 each), truncated to integers.
prediction = ((lil_pred + big_pred + price) / 3).astype('int').rename('price')

pd.concat([test.sell_id, prediction], axis=1).to_csv('submission_3.csv', index=False)

In [14]:
'''SUBMISSION 4'''
# Equal-weight mean of lil_pred and the per-brand trees, truncated to integers.
prediction = ((lil_pred + big_pred) / 2).astype('int').rename('price')

pd.concat([test.sell_id, prediction], axis=1).to_csv('submission_4.csv', index=False)

In [15]:
'''SUBMISSION 5'''
# Nested average: CatBoost and the per-brand trees are averaged first, then
# averaged with lil_pred — effective weights 1/4, 1/4 and 1/2.
prediction = (((price + big_pred) / 2 + lil_pred) / 2).astype('int').rename('price')

pd.concat([test.sell_id, prediction], axis=1).to_csv('submission_5.csv', index=False)

In [16]:
'''SUBMISSION 6'''
# Equal-weight mean of CatBoost and lil_pred, truncated to integers.
prediction = ((price + lil_pred) / 2).astype('int').rename('price')

pd.concat([test.sell_id, prediction], axis=1).to_csv('submission_6.csv', index=False)

In [17]:
'''SUBMISSION 7'''
# Nested average that favours lil_pred: effective weights are 1/4 for the
# per-brand trees and 3/4 for lil_pred.
prediction = (((big_pred + lil_pred) / 2 + lil_pred) / 2).astype('int').rename('price')

pd.concat([test.sell_id, prediction], axis=1).to_csv('submission_7.csv', index=False)

In [18]:
'''SUBMISSION 8'''
# lil_pred on its own (after the sentinel fill), truncated to integers.
prediction = lil_pred.astype('int').rename('price')

pd.concat([test.sell_id, prediction], axis=1).to_csv('submission_8.csv', index=False)

In [21]:
'''SUBMISSION 9'''
# The per-brand tree predictions on their own, truncated to integers.
prediction = pd.Series(big_pred, name='price').astype('int')

pd.concat([test.sell_id, prediction], axis=1).to_csv('submission_9.csv', index=False)

In [22]:
'''SUBMISSION 10'''
# Nested average that favours the per-brand trees: effective weights are 3/4
# for big_pred and 1/4 for lil_pred.
prediction = (((big_pred + lil_pred) / 2 + big_pred) / 2).astype('int').rename('price')

pd.concat([test.sell_id, prediction], axis=1).to_csv('submission_10.csv', index=False)

Возможно, лучше всего заполнять пропуски в lil_pred именно предсказаниями big_pred.

In [23]:
# Start again from the stored lil_trees predictions.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
lil_pred = pd.read_pickle('lil_pred.pickle')

# This time the placeholder entries (the literal equals exp(-1)) are replaced
# by the per-brand tree prediction alone.
# NOTE(review): the index label doubles as a position into `big_pred`, which
# assumes `lil_pred` is indexed 0..n-1.
for row_label in lil_pred.loc[lil_pred == 0.36787944117144233].index:
    lil_pred.at[row_label] = big_pred[row_label]


'''SUBMISSION 11'''
# Equal-weight mean of the refilled lil_pred and the per-brand trees.
prediction = ((lil_pred + big_pred) / 2).astype('int').rename('price')

pd.concat([test.sell_id, prediction], axis=1).to_csv('submission_11.csv', index=False)

Нет, это маловажно

Если спросите, почему я не обучал перспективный CatBoost на разделенных по брендам или моделям данных, то я отвечу, что не успел посмотреть на данную комбинацию.