In [1]:
import pandas as pd

df_train = pd.read_csv('../data/test_data_stg.csv', delimiter='|')
df_test = pd.read_csv('../data/train_data_stg.csv', delimiter='|')

In [2]:
df = pd.concat([df_train, df_test], ignore_index=True)

df

Unnamed: 0,rank,t,p
0,25,32625522.00,25408.0
1,30,9678145.70,68264.0
2,53,3453252.19,3437.0
3,65,2727246.97,12729.0
4,66,1161085.64,2417.0
...,...,...,...
324600,324599,26416.67,400.0
324601,324600,161716.50,177.0
324602,324602,235911.66,179.0
324603,324604,81000.00,4737.0


In [3]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(df, test_size=0.25, random_state=42)

In [4]:
train_data = train_data.drop(['rank'], axis=1)
test_data = test_data.drop(['rank'], axis=1)

In [5]:
import numpy as np

train_data = np.log(train_data)
test_data = np.log(test_data)

## Ydf 

In [6]:
import ydf

gbtf = ydf.GradientBoostedTreesLearner(label='p', task=ydf.Task.REGRESSION, num_trees=1000).train(train_data)

prediction = gbtf.predict(test_data)

Train model on 243453 examples


Model trained in 0:00:10.302938


In [7]:
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

print("Mean Absolute Error (MAE): %.2f" % mean_absolute_error(test_data[['p']], prediction))
print("Root Mean Squarred Error (RMSE): %.2f" % root_mean_squared_error(test_data[['p']], prediction))
print("R2 Score %.2f" % r2_score(test_data[['p']], prediction))

Mean Absolute Error (MAE): 1.11
Root Mean Squarred Error (RMSE): 1.46
R2 Score 0.45


## CatBoost

In [10]:
from catboost import CatBoostRegressor

cbr = CatBoostRegressor(
    loss_function='RMSE',
    iterations=1000
    
)
cbr.fit(train_data[['t']], train_data['p'])

prediction = cbr.predict(test_data[['t']])

Learning rate set to 0.097551
0:	learn: 1.8909965	total: 16.4ms	remaining: 16.4s
1:	learn: 1.8215970	total: 30.6ms	remaining: 15.3s
2:	learn: 1.7621070	total: 44ms	remaining: 14.6s
3:	learn: 1.7121713	total: 56.1ms	remaining: 14s
4:	learn: 1.6694213	total: 66.1ms	remaining: 13.2s
5:	learn: 1.6342365	total: 78.8ms	remaining: 13.1s
6:	learn: 1.6047657	total: 90ms	remaining: 12.8s
7:	learn: 1.5797139	total: 100ms	remaining: 12.4s
8:	learn: 1.5588976	total: 109ms	remaining: 12s
9:	learn: 1.5413538	total: 119ms	remaining: 11.8s
10:	learn: 1.5268196	total: 132ms	remaining: 11.9s
11:	learn: 1.5149260	total: 141ms	remaining: 11.6s
12:	learn: 1.5047698	total: 150ms	remaining: 11.4s
13:	learn: 1.4964531	total: 159ms	remaining: 11.2s
14:	learn: 1.4898124	total: 170ms	remaining: 11.2s
15:	learn: 1.4843076	total: 181ms	remaining: 11.1s
16:	learn: 1.4797430	total: 190ms	remaining: 11s
17:	learn: 1.4758362	total: 199ms	remaining: 10.9s
18:	learn: 1.4724794	total: 208ms	remaining: 10.8s
19:	learn: 1.4

In [9]:
print("Mean Absolute Error (MAE): %.2f" % mean_absolute_error(test_data[['p']], prediction))
print("Root Mean Squarred Error (RMSE): %.2f" % root_mean_squared_error(test_data[['p']], prediction))
print("R2 Score %.2f" % r2_score(test_data[['p']], prediction))

Mean Absolute Error (MAE): 1.11
Root Mean Squarred Error (RMSE): 1.46
R2 Score 0.45


## Gradient Boost

In [13]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(
    max_depth=4, 
    learning_rate=0.01, 
    n_estimators=10000,
    random_state=42,
    n_iter_no_change=50
    )
gbrt.fit(train_data[['t']], train_data['p'])

prediction = gbrt.predict(test_data[['t']])

In [14]:
print("Mean Absolute Error (MAE): %.2f" % mean_absolute_error(test_data[['p']], prediction))
print("Root Mean Squarred Error (RMSE): %.2f" % root_mean_squared_error(test_data[['p']], prediction))
print("R2 Score %.2f" % r2_score(test_data[['p']], prediction))

Mean Absolute Error (MAE): 1.11
Root Mean Squarred Error (RMSE): 1.46
R2 Score 0.45


## Ada Boost

In [15]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

ada_reg = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=3), n_estimators=500,
    learning_rate=0.01, random_state=42
)
ada_reg.fit(train_data[['t']], train_data['p'])

prediction = ada_reg.predict(test_data[['t']])

In [16]:
print("Mean Absolute Error (MAE): %.2f" % mean_absolute_error(test_data[['p']], prediction))
print("Root Mean Squarred Error (RMSE): %.2f" % root_mean_squared_error(test_data[['p']], prediction))
print("R2 Score %.2f" % r2_score(test_data[['p']], prediction))

Mean Absolute Error (MAE): 1.12
Root Mean Squarred Error (RMSE): 1.47
R2 Score 0.44
