# Colab setting

In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd ./drive/Othercomputers/MacBook/Earth/module/dd_earthquake/book

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/Othercomputers/MacBook/Earth/module/dd_earthquake/book


In [2]:
%ls

bench_mark.ipynb  Cat_colab.ipynb             learn_categorical_plots.ipynb
[0m[01;34mcatboost_info[0m/    colab_lgbt_multigrid.ipynb  Light_GBM.ipynb
Catboost.ipynb    edm_data.ipynb


In [3]:
!pip install CatBoost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Import

In [4]:
import sys
%load_ext autoreload
%autoreload 2
sys.path.append('../')

In [5]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from catboost import CatBoostClassifier

from src import common

# data

In [6]:
train_values, train_labels = common.file.read_data('train')

# preprocessing

In [7]:
features_list = ['geo_level_1_id', 
                 'geo_level_2_id', 
                 'geo_level_3_id',
                 'age', 
                 'area_percentage', 
                 'height_percentage', 
                 'foundation_type', 
                 'roof_type', 
                 'ground_floor_type', 
                 'other_floor_type', 
                 'position', 
                 'has_superstructure_mud_mortar_stone', 
                 'has_superstructure_cement_mortar_brick', 
                 'has_superstructure_timber', 
                 'count_families', 
                 'has_secondary_use'
                ]

In [10]:
train_values, train_labels = common.lgbm_preprocessing((train_values, train_labels), mode='train', features_list=features_list)

In [13]:
cat_features = list(train_values.select_dtypes(include='category').columns)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(train_values, train_labels,
                                                        test_size=0.1, random_state=19, stratify=train_labels)

In [15]:
#input best params here from randm.best_params_
model = CatBoostClassifier(eval_metric='TotalF1',random_seed=19, silent=False, task_type='GPU',  
                           cat_features=cat_features, iterations=20000, learning_rate=0.05, 
                           classes_count=3, early_stopping_rounds=1000)
model.fit(X_train, y_train)
preds=model.predict(X_test)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
15000:	learn: 0.7930522	total: 2m 7s	remaining: 42.4s
15001:	learn: 0.7930595	total: 2m 7s	remaining: 42.4s
15002:	learn: 0.7930475	total: 2m 7s	remaining: 42.4s
15003:	learn: 0.7930429	total: 2m 7s	remaining: 42.4s
15004:	learn: 0.7930384	total: 2m 7s	remaining: 42.4s
15005:	learn: 0.7930584	total: 2m 7s	remaining: 42.4s
15006:	learn: 0.7930398	total: 2m 7s	remaining: 42.4s
15007:	learn: 0.7930702	total: 2m 7s	remaining: 42.4s
15008:	learn: 0.7930386	total: 2m 7s	remaining: 42.4s
15009:	learn: 0.7930622	total: 2m 7s	remaining: 42.4s
15010:	learn: 0.7930692	total: 2m 7s	remaining: 42.4s
15011:	learn: 0.7931005	total: 2m 7s	remaining: 42.4s
15012:	learn: 0.7930910	total: 2m 7s	remaining: 42.3s
15013:	learn: 0.7930491	total: 2m 7s	remaining: 42.3s
15014:	learn: 0.7930560	total: 2m 7s	remaining: 42.3s
15015:	learn: 0.7930875	total: 2m 7s	remaining: 42.3s
15016:	learn: 0.7930829	total: 2m 7s	remaining: 42.3s
15017:	learn: 0.7

In [17]:
preds=model.predict(X_test)

#f1 score is the metric used in the competition
accuracy=f1_score(y_test,preds,average='micro')
print("Score on test set")
print("\n ========================================================")
print(accuracy)
print('\n Benchmark model was: 0.5815')


Score on test set

0.7404550861440466

 Benchmark model was: 0.5815


# pred and submit

In [18]:
test_values = common.file.read_data('test')
test_values, _ = common.lgbm_preprocessing(test_values, mode='test', features_list=features_list)
y_test_prob = model.predict(test_values)
y_test = np.argmax(y_test_prob, axis=1)
submission_format = common.file.read_data('submission')
my_submission = pd.DataFrame(data=y_test+1,
                             columns=submission_format.columns,
                             index=submission_format.index)