In [18]:
import pandas as pd
import numpy as np
from itertools import combinations
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

In [19]:
df = pd.read_csv('./train_merc.csv')
y = df.y
df.drop(['ID', 'y'], axis = 1, inplace=True)
cat_features_ids = np.where(df.apply(pd.Series.nunique) < 30000)[0].tolist()

In [20]:
pred = [10,10,10,10,10,10,10,10,10,10]
y_real = [10,10,10,10,10,10,10,10,10,100]
print(np.sqrt(mean_squared_error(pred, y_real)))

pred = [25,25,25,25,25,25,25,25,25,25]
y_real = [10,10,10,10,10,10,10,10,10,100]
print(np.sqrt(mean_squared_error(pred, y_real)))

28.460498941515414
27.65863337187866


In [21]:
train, test, y_train, y_test = train_test_split(df, y, test_size = 0.1)
clf = CatBoostRegressor(learning_rate=0.1, iterations=100, random_seed=42, logging_level='Silent')
clf.fit(train, y_train, cat_features=cat_features_ids)
prediction = clf.predict(test)
print('RMSE score:', np.sqrt(mean_squared_error(y_test, prediction)))
print('RMSLE score:', np.sqrt(mean_squared_error(np.log1p(y_test), np.log1p(prediction))))

RMSE score: 7.373856282754891
RMSLE score: 0.06776094386349317


In [23]:
df = pd.read_csv('./train_sample.csv.zip')
y = df.is_attributed
df.drop(['click_time', 'attributed_time', 'is_attributed'], axis = 1, inplace=True)
cat_features_ids = np.where(df.apply(pd.Series.nunique) < 30000)[0].tolist()

In [24]:
y_positive = np.ones_like(y)
y_negative = np.zeros_like(y)
print('\t\t $a(x)$ = 1 \t\t\n')
print('Accuracy all positive:', accuracy_score(y, y_positive))
print('Recall all positive:', recall_score(y, y_positive))
print('Precision all positive:', precision_score(y, y_positive))
print('F1 score all positive:', f1_score(y, y_positive))
print('Roc auc score all positive:', roc_auc_score(y, y_positive))
print('\n\n')
print('\t\t $a(x)$ = 0 \t\t\n')
print('Accuracy all negative:', accuracy_score(y, y_negative))
print('Recall all negative:', recall_score(y, y_negative))
print('Precision all negative:', precision_score(y, y_negative))
print('F1 score all negative:', f1_score(y, y_negative))
print('Roc auc score all positive:', roc_auc_score(y, y_negative))

print('\n\n')
print('\t\t Catboost \t\t\n')
train, test, y_train, y_test = train_test_split(df, y, test_size = 0.1)

clf = CatBoostClassifier(learning_rate=0.1, iterations=100, random_seed=42, 
                         eval_metric='AUC', logging_level='Silent', l2_leaf_reg=3, 
                         model_size_reg = 3)
clf.fit(train, y_train, cat_features=cat_features_ids)
prediction = clf.predict_proba(test)

print('Accuracy using Catboost:', accuracy_score(y_test, prediction[:, 1] > 0.5))
print('Recall using Catboost:', recall_score(y_test, prediction[:, 1] > 0.5))
print('Precision using Catboost:', precision_score(y_test, prediction[:, 1] > 0.5))
print('F1 score using Catboost:', f1_score(y_test, prediction[:, 1] > 0.5))
print('Roc auc score using Catboost:', roc_auc_score(y_test, prediction[:, 1]))

		 $a(x)$ = 1 		

Accuracy all positive: 0.00227
Recall all positive: 1.0
Precision all positive: 0.00227
F1 score all positive: 0.004529717541181518
Roc auc score all positive: 0.5



		 $a(x)$ = 0 		

Accuracy all negative: 0.99773
Recall all negative: 0.0
Precision all negative: 0.0
F1 score all negative: 0.0
Roc auc score all positive: 0.5



		 Catboost 		

Accuracy using Catboost: 0.9986
Recall using Catboost: 0.391304347826087
Precision using Catboost: 1.0
F1 score using Catboost: 0.5625
Roc auc score using Catboost: 0.9189287535244104
