In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Do not truncate wide frames: show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [2]:
# Labelled training set (includes `target_reg` and `target_class`), read from GitHub.
url1 = 'https://raw.githubusercontent.com/a-milenkin/Competitive_Data_Science/main/data/car_train.csv'
df = pd.read_csv(url1)

In [3]:
# Preview the first rows of the training frame.
df.head()

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_reg,target_class
0,y13744087j,Kia Rio X-line,economy,petrol,3.78,2015,76163,2021,9492.96,another_bug
1,O41613818T,VW Polo VI,economy,petrol,3.9,2015,78218,2021,2656.23,electro_bug
2,d-2109686j,Renault Sandero,standart,petrol,6.3,2012,23340,2017,1526.11,gear_stick
3,u29695600e,Mercedes-Benz GLC,business,petrol,4.04,2011,1263,2020,1338.0,engine_fuel
4,N-8915870N,Renault Sandero,standart,petrol,4.7,2012,26428,2017,825.72,engine_fuel


# Start with basic dataframe to get Baseline

Define a function that prepares the dataset

In [4]:
def prep(df):
    """Build the baseline feature matrix from a raw car dataframe.

    Keeps the basic car attributes and one-hot encodes ``car_type`` and
    ``fuel_type``. The ``car_type_business`` and ``fuel_type_electro``
    indicator columns are removed when they exist, so a frame that happens to
    contain no business or electro cars no longer raises ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Frame with at least the columns ``model``, ``car_type``,
        ``fuel_type``, ``car_rating``, ``year_to_start``, ``riders`` and
        ``year_to_work``. Any other columns are ignored.

    Returns
    -------
    pandas.DataFrame or int
        A new frame (the input is not modified) holding the five basic columns
        followed by the remaining indicator columns. ``0`` is returned when
        ``df`` is ``None``; this is kept for backward compatibility.
    """
    if df is None:
        return 0

    base_cols = ['model', 'car_rating', 'year_to_start', 'riders', 'year_to_work']
    temp = df[base_cols].copy()

    # One-hot encode car_type and fuel_type. The dtype is pinned to uint8 so the
    # result does not depend on the pandas version (pandas >= 2 defaults to bool).
    ohe = pd.get_dummies(df[['car_type', 'fuel_type']], dtype='uint8')
    # Drop one level per feature; tolerate frames where that level never occurs.
    ohe = ohe.drop(columns=['car_type_business', 'fuel_type_electro'], errors='ignore')

    return pd.concat([temp, ohe], axis=1)

In [5]:
from sklearn.model_selection import train_test_split

Make a standard train/test split

In [6]:
# Baseline features from `prep`; the label is the `target_class` column.
X, y = prep(df), df['target_class']

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

Let's use the default CatBoost

In [7]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

In [8]:
# Columns CatBoost should treat as categorical.
cat_cols = ['model', 'year_to_start', 'year_to_work']

# Train with early stopping on the hold-out split, logging every 20 iterations.
ctb = CatBoostClassifier(iterations=300)
ctb = ctb.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    cat_features=cat_cols,
    early_stopping_rounds=20,
    verbose=20,
)

Learning rate set to 0.173181
0:	learn: 2.1934731	test: 2.1958543	best: 2.1958543 (0)	total: 157ms	remaining: 47.1s
20:	learn: 2.0692537	test: 2.1918308	best: 2.1894681 (14)	total: 1.21s	remaining: 16.1s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 2.189468085
bestIteration = 14

Shrink model to first 15 iterations.


In [9]:
# Score the baseline model on the hold-out split.
y_pred = ctb.predict(X_test)

print(f'Accuracy score {accuracy_score(y_test, y_pred)}')

Accuracy score 0.1111111111111111


# Let's validate

In [10]:
# Test set (same car attributes, no target columns), read from GitHub.
url2 = 'https://raw.githubusercontent.com/a-milenkin/Competitive_Data_Science/main/data/car_test.csv'
df_val = pd.read_csv(url2)

In [11]:
# Preview the first rows of the test frame.
df_val.head()

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work
0,P17494612l,Skoda Rapid,economy,petrol,4.8,2013,42269,2019
1,N-1530212S,Renault Sandero,standart,petrol,4.32,2015,90014,2016
2,B-1154399t,Smart ForTwo,economy,petrol,4.46,2015,82684,2017
3,F12725233R,Smart ForFour,economy,petrol,2.8,2014,68833,2021
4,l-1139189J,Skoda Rapid,economy,petrol,6.56,2013,42442,2021


In [12]:
# Predict the test set with the baseline model.
# Note: this adds a `target_class` column to `df_val` in place.
df_val['target_class'] = ctb.predict(prep(df_val))

# Keep only the two columns written to the submission file.
result = df_val[['car_id','target_class']]

In [13]:
# `to_csv` does not create missing directories, so make sure `subs/` exists first.
Path('subs').mkdir(parents=True, exist_ok=True)
result.to_csv('subs/01_answer_basic.csv', index=False)

# Adding additional data

In [14]:
# Ride-level log used to enrich the car data.
# NOTE(review): unlike the train/test sets, this is read from a local relative path,
# so the notebook only runs where `data/rides_info.csv` exists — confirm this is intended.
url1_2 = 'data/rides_info.csv'
df_rides = pd.read_csv(url1_2)

In [15]:
# Preview the first rows of the ride log.
df_rides.head()

Unnamed: 0,user_id,car_id,ride_id,ride_date,rating,ride_duration,ride_cost,speed_avg,speed_max,stop_times,distance,refueling,user_ride_quality,deviation_normal
0,o52317055h,A-1049127W,b1v,2020-01-01,4.95,21,268,36,113.548538,0,514.24692,0,1.11526,2.909
1,H41298704y,A-1049127W,T1U,2020-01-01,6.91,8,59,36,93.0,1,197.520662,0,1.650465,4.133
2,v88009926E,A-1049127W,g1p,2020-01-02,6.01,20,315,61,81.959675,0,1276.328206,0,2.599112,2.461
3,t14229455i,A-1049127W,S1c,2020-01-02,0.26,19,205,32,128.0,0,535.680831,0,3.216255,0.909
4,W17067612E,A-1049127W,X1b,2020-01-03,1.21,56,554,38,90.0,1,1729.143367,0,2.71655,-1.822


In [16]:
# Aggregate the ride log into one row of summary statistics per car.
# Naming caveat: `user_rate_*` come from the `rating` column, while `rating_*`
# come from `user_ride_quality`.
df_rides_agg = (
    df_rides
    .groupby('car_id', as_index=False)
    .agg(
        sum_distance=pd.NamedAgg(column='distance', aggfunc='sum'),
        speed_max=pd.NamedAgg(column='speed_max', aggfunc='max'),
        user_rate_mean=pd.NamedAgg(column='rating', aggfunc='mean'),
        user_rate_min=pd.NamedAgg(column='rating', aggfunc='min'),
        speed_avg_max=pd.NamedAgg(column='speed_avg', aggfunc='max'),
        speed_avg_mean=pd.NamedAgg(column='speed_avg', aggfunc='mean'),
        rating_mean=pd.NamedAgg(column='user_ride_quality', aggfunc='mean'),
        rating_min=pd.NamedAgg(column='user_ride_quality', aggfunc='min'),
        total_avg_speed=pd.NamedAgg(column='speed_avg', aggfunc='sum'),
        total_max_speed=pd.NamedAgg(column='speed_max', aggfunc='sum'),
    )
)

In [17]:
# Attach the per-car ride aggregates to the training frame.
# Aggregate columns left over from a previous run of this cell are dropped first,
# so re-running it does not produce duplicated `_x` / `_y` columns.
df = pd.merge(
    df.drop(columns=df_rides_agg.columns.difference(['car_id']), errors='ignore'),
    df_rides_agg,
    on='car_id',
)
df.head()

Unnamed: 0,car_id,model,car_type,fuel_type,car_rating,year_to_start,riders,year_to_work,target_reg,target_class,sum_distance,speed_max,user_rate_mean,user_rate_min,speed_avg_max,speed_avg_mean,rating_mean,rating_min,total_avg_speed,total_max_speed
0,y13744087j,Kia Rio X-line,economy,petrol,3.78,2015,76163,2021,9492.96,another_bug,12141310.0,180.855726,4.737759,0.1,77,44.66092,-0.90119,-10.501738,7771,14734.090155
1,O41613818T,VW Polo VI,economy,petrol,3.9,2015,78218,2021,2656.23,electro_bug,18039090.0,187.862734,4.480517,0.0,88,49.862069,14.018105,0.437053,8676,15568.550271
2,d-2109686j,Renault Sandero,standart,petrol,6.3,2012,23340,2017,1526.11,gear_stick,15883660.0,102.382857,4.768391,0.1,73,44.005747,0.722771,-12.535368,7657,11740.406294
3,u29695600e,Mercedes-Benz GLC,business,petrol,4.04,2011,1263,2020,1338.0,engine_fuel,16518830.0,172.793237,3.88092,0.1,88,49.344828,-4.29037,-10.723544,8586,15079.073004
4,N-8915870N,Renault Sandero,standart,petrol,4.7,2012,26428,2017,825.72,engine_fuel,13983170.0,203.462289,4.181149,0.1,89,50.603448,-13.465342,-25.742137,8805,15009.883456


# Let's train CatBoost on the updated dataframe

Define a function that prepares the dataframe

To reduce the dataframe's memory footprint, let's use the following types:
* `int8`  -	Byte (-128 to 127)
* `int16` -	Integer (-32768 to 32767)
* `int32` -	Integer (-2147483648 to 2147483647)
* `int64` -	Integer (-9223372036854775808 to 9223372036854775807)
* `float16` - Half precision float: sign bit, 5 bits exponent, 10 bits mantissa
* `float32`	- Single precision float: sign bit, 8 bits exponent, 23 bits mantissa
* `float64`	- Double precision float: sign bit, 11 bits exponent, 52 bits mantissa

In [18]:
def prep_1(df):
    """Build the enriched feature matrix (car attributes plus ride aggregates).

    Steps: drop identifier/target columns when present, one-hot encode
    ``car_type`` and ``fuel_type``, drop rows containing missing values, and
    cast selected columns to smaller dtypes.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Frame containing at least ``model``, ``car_type``, ``fuel_type``,
        ``car_rating``, ``year_to_start``, ``riders``, ``year_to_work``,
        ``sum_distance`` and ``speed_max``. ``car_id``, ``target_class`` and
        ``target_reg`` are removed if they exist.

    Returns
    -------
    pandas.DataFrame or int
        A new frame; the input is not modified. Rows with missing values are
        removed, so the result can have fewer rows than ``df`` — align targets
        by the returned index. ``0`` is returned when ``df`` is ``None``; this
        is kept for backward compatibility.
    """
    if df is None:
        return 0

    # Delete identifier/target columns; a test frame may not have the targets.
    temp = df.drop(columns=['car_id', 'target_class', 'target_reg'], errors='ignore')

    # One-hot encode car_type and fuel_type. The dtype is pinned to uint8 so the
    # result does not depend on the pandas version (pandas >= 2 defaults to bool).
    ohe = pd.get_dummies(df[['car_type', 'fuel_type']], dtype='uint8')
    # Drop one level per feature; tolerate frames where that level never occurs.
    ohe = ohe.drop(columns=['car_type_business', 'fuel_type_electro'], errors='ignore')
    temp = pd.concat([temp, ohe], axis=1).drop(columns=['car_type', 'fuel_type'])

    temp = temp.dropna()

    # Optimize dtypes. The float -> int casts truncate the fractional part.
    temp = temp.astype({
        'model': 'category',
        'car_rating': 'float16',
        'year_to_start': 'int16',
        'riders': 'int32',
        'year_to_work': 'int16',
        'sum_distance': 'int32',
        'speed_max': 'int16',
    })

    return temp

Make a standard train/test split

In [19]:
X = prep_1(df)
# `prep_1` removes rows with missing values, so select the targets by X's index
# to keep features and labels aligned row for row.
y = df.loc[X.index, 'target_class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

Use the default CatBoost

In [20]:
# Columns CatBoost should treat as categorical: the model name, both year
# columns and the one-hot indicator columns.
cat_cols = [
    'model',
    'year_to_start', 'year_to_work',
    'car_type_economy', 'car_type_premium', 'car_type_standart',
    'fuel_type_petrol',
]

# Train with early stopping on the hold-out split, logging every 20 iterations.
ctb = CatBoostClassifier(iterations=300)
ctb = ctb.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    cat_features=cat_cols,
    early_stopping_rounds=20,
    verbose=20,
)

Learning rate set to 0.173181
0:	learn: 1.5052117	test: 1.4863670	best: 1.4863670 (0)	total: 59ms	remaining: 17.6s
20:	learn: 0.3442367	test: 0.3899284	best: 0.3899284 (20)	total: 1.48s	remaining: 19.6s
40:	learn: 0.2766159	test: 0.3482224	best: 0.3482224 (40)	total: 2.83s	remaining: 17.9s
60:	learn: 0.2197693	test: 0.3171467	best: 0.3171467 (60)	total: 4.29s	remaining: 16.8s
80:	learn: 0.1911101	test: 0.3099999	best: 0.3096557 (73)	total: 5.7s	remaining: 15.4s
100:	learn: 0.1624790	test: 0.2944123	best: 0.2944123 (100)	total: 7.12s	remaining: 14s
120:	learn: 0.1443655	test: 0.2888865	best: 0.2886692 (119)	total: 8.55s	remaining: 12.7s
140:	learn: 0.1289645	test: 0.2870533	best: 0.2861430 (135)	total: 9.98s	remaining: 11.3s
160:	learn: 0.1132316	test: 0.2832533	best: 0.2832533 (160)	total: 11.4s	remaining: 9.86s
180:	learn: 0.1032326	test: 0.2811823	best: 0.2808394 (174)	total: 12.8s	remaining: 8.44s
200:	learn: 0.0942988	test: 0.2789796	best: 0.2788061 (199)	total: 14.3s	remaining: 7.

In [21]:
# Score the model trained on the enriched features on the hold-out split.
y_pred = ctb.predict(X_test)

print(f'Accuracy score {accuracy_score(y_test, y_pred)}')

Accuracy score 0.8824786324786325


In [22]:
# Add the per-car ride aggregates to the test frame.
# Aggregate columns left over from a previous run of this cell are dropped first,
# so re-running it does not produce duplicated `_x` / `_y` columns.
df_val = pd.merge(
    df_val.drop(columns=df_rides_agg.columns.difference(['car_id']), errors='ignore'),
    df_rides_agg,
    on='car_id',
)

In [23]:
# Predict the test set; `prep_1` removes the `target_class` column added earlier.
result = ctb.predict(prep_1(df_val))

In [24]:
# Overwrite the earlier baseline predictions with the new ones.
df_val['target_class'] = result

# `to_csv` does not create missing directories, so make sure `subs/` exists first.
Path('subs').mkdir(parents=True, exist_ok=True)
df_val[['car_id','target_class']].to_csv('subs/02_answer_add_data.csv', index=False)

Validated accuracy score: 0.888

# Setting validation

We'll use `KFold` validation with 5 folds 

In [25]:
from sklearn.model_selection import KFold

In [26]:
n_splits = 5
ctbs, scores = [], []

# Plain shuffled K-fold: one model per fold, scored on that fold's held-out part.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
for num, (train_index, test_index) in enumerate(kf.split(X)):

    # Positional slicing of features and labels for this fold
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # The held-out part is also the eval set that drives early stopping
    ctb = CatBoostClassifier(iterations=300)
    ctb.fit(
        X_train,
        y_train,
        eval_set=(X_test, y_test),
        cat_features=cat_cols,
        early_stopping_rounds=20,
        use_best_model=True,
        verbose=False,
    )
    ctbs.append(ctb)

    # Only the first column of the prediction array is compared with the labels
    y_pred = ctb.predict(X_test)
    score = round(np.mean(y_pred[:, 0] == y_test.to_numpy()), 3)
    scores.append(score)
    print(f'Fold {num}, Accuracy {score}')

print(f'Overall mean accuracy {np.mean(scores).round(3)}, Std: {np.std(scores).round(3)}')

Fold 0, Accuracy 0.889
Fold 1, Accuracy 0.868
Fold 2, Accuracy 0.854
Fold 3, Accuracy 0.884
Fold 4, Accuracy 0.884
Overall mean accuracy 0.876, Std: 0.013


Let's try `StratifiedKFold` with 5 folds.

In [27]:
from sklearn.model_selection import StratifiedKFold

In [28]:
ctbs, scores = [], []

# Stratified shuffled K-fold: the split also receives `y`.
# The models collected in `ctbs` are reused for the final test prediction.
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
for num, (train_index, test_index) in enumerate(kf.split(X, y)):

    # Positional slicing of features and labels for this fold
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # The held-out part is also the eval set that drives early stopping
    ctb = CatBoostClassifier(iterations=300)
    ctb.fit(
        X_train,
        y_train,
        eval_set=(X_test, y_test),
        cat_features=cat_cols,
        early_stopping_rounds=20,
        use_best_model=True,
        verbose=False,
    )
    ctbs.append(ctb)

    # Only the first column of the prediction array is compared with the labels
    y_pred = ctb.predict(X_test)
    score = round(np.mean(y_pred[:, 0] == y_test.to_numpy()), 3)
    scores.append(score)
    print(f'Fold {num}, Accuracy {score}')

print(f'Overall mean accuracy {np.mean(scores).round(3)}, Std: {np.std(scores).round(3)}')

Fold 0, Accuracy 0.895
Fold 1, Accuracy 0.902
Fold 2, Accuracy 0.893
Fold 3, Accuracy 0.882
Fold 4, Accuracy 0.863
Overall mean accuracy 0.887, Std: 0.014


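Stratified K-fold gives a slightly higher mean accuracy than ordinary K-fold (0.887 vs 0.876), although the difference is within one standard deviation of the fold scores.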

# Make prediction for test

In [29]:
# Ensemble the fold models: sum their class probabilities and pick, for each
# row, the class with the highest total.

# Prepare the test features once instead of once per model.
X_val = prep_1(df_val)

# Take the label order from the fitted models rather than from variables
# leaked out of the cross-validation loop.
classes = np.asarray(ctbs[0].classes_)

# Matrix of shape (n_rows, n_classes) accumulating the probabilities.
y_pred = np.zeros((X_val.shape[0], len(classes)))
for ctb in ctbs:
    # Summing columns is only meaningful if every model uses the same label order.
    if list(ctb.classes_) != list(classes):
        raise ValueError('Fold models were trained on different sets of classes')
    y_pred += ctb.predict_proba(X_val)

# Assign by index so the labels stay aligned even if `prep_1` dropped rows
# (such rows get a missing prediction instead of shifting the others).
df_val['target_class'] = pd.Series(classes[y_pred.argmax(axis=1)], index=X_val.index)

In [30]:
# `to_csv` does not create missing directories, so make sure `subs/` exists first.
Path('subs').mkdir(parents=True, exist_ok=True)
df_val[['car_id','target_class']].to_csv('subs/03_answer_validated.csv', index=False)

Validated accuracy score: 0.898
<br>Accuracy increased compared with the previous submission (0.888).