In [1]:
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split

In [2]:
oft_out_file = "../../../data/CS/feature_encodings/OFT/objects.csv"

In [37]:
df_objects = pd.read_csv(oft_out_file)
df_objects = df_objects.set_index('ocel:oid')
df_objects['ocel:type'] = df_objects['ocel:type'].astype('category')
df_objects['oa10'] = df_objects['oa10'].astype('category')

In [38]:
# LightGBM
feature_names = {"cat": ["oa10"], "num": ["oa1", "oa5"]}
flatten = lambda l: [item for sublist in l for item in sublist]
df_objects

Unnamed: 0_level_0,ocel:type,oa1,oa5,oa10
ocel:oid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KRS-197859,krs,-1.129970,-0.090177,Midden- en kleinbedrijf 2
KRS-344127,krs,0.368881,-1.002100,Midden- en kleinbedrijf 2
KRS-9763017,krs,0.368881,-0.481001,Midden- en kleinbedrijf 1
KRS-1011207,krs,1.867732,-0.481001,Midden- en kleinbedrijf 2
KRS-3890065,krs,1.867732,-0.090177,Midden- en kleinbedrijf 2
...,...,...,...,...
KRV-9749381,krv,0.368881,-0.350727,Midden- en kleinbedrijf 2
KRS-9549051,krs,0.368881,-0.090177,Midden- en kleinbedrijf 2
KRV-5150118,krv,0.368881,-0.090177,Midden- en kleinbedrijf 2
KRV-8949856,krv,0.368881,-0.611276,Grootzakelijk


In [40]:
# make train test split
X, y = (
    df_objects.drop(["ocel:type"], axis=1),
    df_objects.loc[:, "ocel:type"].replace({"krs": 0, "krv": 1, "cv": 2}),
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=0
)

train_data = lgb.Dataset(
    X_train,
    label=y_train,
    feature_name=flatten(feature_names.values()),
    categorical_feature=feature_names["cat"],
)
valid_data = lgb.Dataset(
    X_valid,
    label=y_valid,
    feature_name=flatten(feature_names.values()),
    categorical_feature=feature_names["cat"],
)

In [41]:
params = {
    "objective": "multiclass",
    "num_class": 3,
    "metric": ["multi_logloss"],
}
bst = lgb.train(
    params,
    train_data,
    categorical_feature=feature_names['cat'],
    # num_boost_round=5000,
    valid_sets=[valid_data],
    # callbacks=[lgb.early_stopping(50)],
)



ValueError: Series.dtypes must be int, float or bool

In [19]:
def get_predictions(lst):
    return max(range(len(lst)), key=lst.__getitem__)

y_train_preds = bst.predict(X_train)
y_valid_preds = bst.predict(X_valid)
y_train_preds = np.apply_along_axis(get_predictions, axis=1, arr=y_train_preds)
y_valid_preds = np.apply_along_axis(get_predictions, axis=1, arr=y_valid_preds)

In [20]:

train_mse_loss = metrics.accuracy_score(y_train, y_train_preds)
valid_mse_loss = metrics.accuracy_score(y_valid, y_valid_preds)

print(f"Training accuracy: {train_mse_loss}")
print(f"Validation accuracy: {valid_mse_loss}")

Training accuracy: 0.5999809253585436
Validation accuracy: 0.6005436078393973


In [21]:
metrics.confusion_matrix(y_valid,y_valid_preds)

array([[10731,  1480,     0],
       [ 6461,  1863,     0],
       [  304,   132,     0]])

In [26]:
df_objects['oa10'].value_counts()

Particulieren                 50007
Midden- en kleinbedrijf 2     31013
Grootzakelijk                  8727
Midden- en kleinbedrijf 1      6364
Private Banking 2              3622
Private Banking 1              2791
Grootzakelijk Grootbedrijf      761
Bijzonder Beheer Bedrijven       72
Name: oa10, dtype: int64