In [6]:
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [8]:
# Read the five input tables in one pass; the order of the paths matches the
# order of the names on the left-hand side.
train, test, rides, fixes, drivers = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/car_train.csv',
        '../input/car_test.csv',
        '../input/rides_info.csv',
        '../input/fix_info.csv',
        '../input/driver_info.csv',
    )
]


In [9]:
# Pick one user per car: the user on the last row listed for that car in
# `rides`. keep='last' depends on the row order of `rides` -- presumably
# chronological; TODO confirm.
sub_rides = rides[['car_id', 'user_id']].drop_duplicates(subset=['car_id'], keep='last')

# Join on the single key 'car_id' (it used to be listed twice in `on`).
# The guards keep this cell re-runnable: merging a second time would create
# user_id_x / user_id_y instead of user_id and break the later join on user_id.
# validate='many_to_one' makes pandas raise if sub_rides ever contains the same
# car_id twice, instead of silently duplicating rows of train/test.
if 'user_id' not in train.columns:
    train = pd.merge(train, sub_rides, on='car_id', how='left', validate='many_to_one')
if 'user_id' not in test.columns:
    test = pd.merge(test, sub_rides, on='car_id', how='left', validate='many_to_one')

In [10]:
# Columns that are excluded from the model inputs.
features2drop = [
    'car_id', 'target_class', 'user_id', 'ride_id', 'ride_date',
    'refueling', 'ride_cost', 'ride_duration', 'distance',
]
target = ['target_reg']
cat_features = ['car_type', 'fuel_type', 'model']

# Everything in train that is neither a target nor explicitly dropped,
# in the original column order; numeric features are the non-categorical rest.
filtered_features = train.columns[~train.columns.isin(target + features2drop)].tolist()
num_features = [name for name in filtered_features if name not in cat_features]


print('cat_features :', len(cat_features), cat_features)
print('num_features :', len(num_features), num_features)
print('targets', target)

cat_features : 3 ['car_type', 'fuel_type', 'model']
num_features : 4 ['car_rating', 'year_to_start', 'riders', 'year_to_work']
targets ['target_reg']


In [11]:
# Per-car aggregates of `rides`: one output row per car_id.
# 'nunique' is passed by name rather than through a named lambda, so the
# aggregation no longer depends on a throwaway module-level helper and pandas
# can dispatch to its own grouped implementation.
# The former mid-cell `rides_df_gr.head(2)` was dropped: it was not the last
# expression of the cell, so its result was computed and discarded.
rides_df_gr = rides.groupby('car_id', as_index=False).agg(
    mean_rating=('rating', 'mean'),
    distance_sum=('distance', 'sum'),
    rating_min=('rating', 'min'),
    speed_max=('speed_max', 'max'),
    user_ride_quality_median=('user_ride_quality', 'median'),
    user_uniq=('user_id', 'nunique'),
)

def add_featues(df):
    """Left-join the per-car ride aggregates (``rides_df_gr``) onto ``df``.

    The join key is ``car_id``. If ``df`` already has a ``mean_rating``
    column the aggregates are taken to be present and ``df`` itself is
    returned untouched, so calling this twice does not duplicate columns.
    """
    aggregates_present = 'mean_rating' in df.columns
    if aggregates_present:
        return df

    return df.merge(rides_df_gr, how='left', on='car_id')
    
    
# Attach the ride aggregates to both frames.
train, test = add_featues(train), add_featues(test)

# Recompute the feature lists now that train has gained columns.
filtered_features = train.columns[~train.columns.isin(target + features2drop)].tolist()
num_features = [name for name in filtered_features if name not in cat_features]

In [12]:
# Per-car means of destroy_degree and work_duration from `fixes`:
# one output row per car_id.
# The former mid-cell `fixes_df_gr.head(2)` was dropped: it was not the last
# expression of the cell, so its result was computed and discarded.
fixes_df_gr = fixes.groupby('car_id', as_index=False).agg(
    mean_destroy_degree=('destroy_degree', 'mean'),
    mean_work_duration=('work_duration', 'mean'),
)

def addd_featues(df):
    """Left-join the per-car fix aggregates (``fixes_df_gr``) onto ``df``.

    The join key is ``car_id``. If ``df`` already has a
    ``mean_destroy_degree`` column, ``df`` itself is returned untouched so
    that repeated calls do not duplicate columns.
    """
    aggregates_present = 'mean_destroy_degree' in df.columns
    if aggregates_present:
        return df

    return df.merge(fixes_df_gr, how='left', on='car_id')
    
    
# Attach the fix aggregates to both frames.
train, test = addd_featues(train), addd_featues(test)

# Recompute the feature lists now that train has gained columns.
filtered_features = train.columns[~train.columns.isin(target + features2drop)].tolist()
num_features = [name for name in filtered_features if name not in cat_features]

In [13]:
# Per-user means of four driver columns from `drivers`: one output row per user_id.
# The former mid-cell `drivers_df_gr.head(2)` was dropped: it was not the last
# expression of the cell, so its result was computed and discarded.
drivers_df_gr = drivers.groupby('user_id', as_index=False).agg(
    mean_user_time_accident=('user_time_accident', 'mean'),
    mean_sex=('sex', 'mean'),
    mean_user_rides=('user_rides', 'mean'),
    mean_user_rating=('user_rating', 'mean'),
)

def add_featues(df):
    """Left-join the per-user driver aggregates (``drivers_df_gr``) onto ``df``.

    The join key is ``user_id``. If ``df`` already has a
    ``mean_user_time_accident`` column, ``df`` itself is returned untouched
    so that repeated calls do not duplicate columns.

    NOTE(review): this reuses the name of the ride-aggregate helper defined
    in an earlier cell; once this cell has run, ``add_featues`` refers to
    this driver version.
    """
    aggregates_present = 'mean_user_time_accident' in df.columns
    if aggregates_present:
        return df

    return df.merge(drivers_df_gr, how='left', on='user_id')
    
    
# Attach the driver aggregates (joined via user_id) to both frames.
train, test = add_featues(train), add_featues(test)

In [14]:
def _dist_per_time(df):
    """Return ``distance_sum / (year_to_work - year_to_start)`` for each row of ``df``.

    Rows where the two year columns are equal get NaN: a zero denominator
    would otherwise yield +/-inf for any non-zero distance.
    """
    span = (df['year_to_work'] - df['year_to_start']).replace(0, np.nan)
    return df['distance_sum'] / span


# Same formula for both frames, computed by one helper instead of a copy-pasted expression.
train['distpertime'] = _dist_per_time(train)
test['distpertime'] = _dist_per_time(test)

In [15]:
# Final feature lists after all joins and the derived distpertime column.
filtered_features = train.columns[~train.columns.isin(target + features2drop)].tolist()
num_features = [name for name in filtered_features if name not in cat_features]

print('num_features :', len(num_features), num_features)
print('cat_features :', len(cat_features), cat_features)
print('targets', target)

num_features : 17 ['car_rating', 'year_to_start', 'riders', 'year_to_work', 'mean_rating', 'distance_sum', 'rating_min', 'speed_max', 'user_ride_quality_median', 'user_uniq', 'mean_destroy_degree', 'mean_work_duration', 'mean_user_time_accident', 'mean_sex', 'mean_user_rides', 'mean_user_rating', 'distpertime']
cat_features : 3 ['car_type', 'fuel_type', 'model']
targets ['target_reg']


In [18]:
# Model inputs and the regression target.
X, y = train[filtered_features], train['target_reg']

# Seeded 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


# CatBoost pools for the fit part and the hold-out part.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

model = CatBoostRegressor(
    cat_features=cat_features,
    thread_count=-1,
    random_seed=42,
)

# NOTE(review): test_pool is used here for early stopping and best-iteration
# selection, and the same hold-out rows are scored for RMSE in the following
# cell, so that RMSE is not an independent estimate.
model.fit(
    train_pool,
    eval_set=test_pool,
    use_best_model=True,
    early_stopping_rounds=100,
    verbose=100,
    plot=False,
)

# Object importance of the training rows with respect to the hold-out pool.
# idxs / scores are not read again in the visible part of the notebook.
idxs, scores = model.get_object_importance(
    test_pool,
    train_pool,
    thread_count=-1,
    importance_values_sign="Positive",
)

Learning rate set to 0.056174
0:	learn: 16.8266722	test: 17.7075663	best: 17.7075663 (0)	total: 30.8ms	remaining: 30.7s
100:	learn: 8.2837355	test: 10.0572329	best: 10.0572329 (100)	total: 3.27s	remaining: 29.1s
200:	learn: 7.2696832	test: 9.9386588	best: 9.9223614 (188)	total: 6.13s	remaining: 24.4s
300:	learn: 6.3744269	test: 9.9421949	best: 9.9112137 (223)	total: 9.08s	remaining: 21.1s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 9.911213699
bestIteration = 223

Shrink model to first 224 iterations.


In [19]:
# Score the hold-out part: RMSE is the square root of the mean squared error.
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f'\n RMSE: {rmse}')


 RMSE: 9.911213745524025


In [20]:
# Predict on the test frame with the same feature columns used for training,
# store the predictions on `test`, and write the two-column result file.
test_predictions = model.predict(test[filtered_features])
test['target_reg'] = test_predictions
test.loc[:, ['car_id', 'target_reg']].to_csv('output.csv', index=False)
print('CSV file saved!')

CSV file saved!
