In [1]:
import numpy as np
import pandas as pd
import catboost as cbt
from sklearn.metrics import accuracy_score

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

%pwd

'/Users/dongzheng/Codes/PythonWorkspace/CSE5005-Project/catboost_model'

In [2]:
def gen_xy(sequence):
    """
    Returns
    ---
    x: (num_samples, 5)
    y: (num_samples,) 1-d vec for labels
    """

    x, y = [], []
    for i in range(len(sequence) - 2):
        x.append(sequence[i : i + 2])
        y.append(sequence[i + 1])

    x = np.array(x)
    x = x.reshape(x.shape[0], x.shape[1] * x.shape[2])
    x = x[:, :5]
    y = np.array(y)

    if len(y.shape) > 1:
        y = y[:, -1]

    return x, y

sequence = pd.read_pickle("../data/data_100000_distr.pkl")[
    ["from_user_id", "to_user_id", "label"]
].values

x, y=gen_xy(sequence)

x.shape
y.shape

(99998, 5)

(99998,)

In [3]:
train_size=0.7
val_size=0.1
cat_feat_index=[0, 1, 2, 3, 4]

split1 = int(len(x) * train_size)
split2 = int(len(sequence) * (train_size + val_size))

x_train, y_train = x[:split1], y[:split1]
x_val, y_val = x[split1:split2], y[split1:split2]
x_test, y_test = x[split2:], y[split2:]

print(f"Trainset:\tx-{x_train.shape}\ty-{y_train.shape}")
print(f"Valset:  \tx-{x_val.shape}  \ty-{y_val.shape}")
print(f"Testset:\tx-{x_test.shape}\ty-{y_test.shape}")

trainset=cbt.Pool(data=x_train, label=y_train, cat_features=cat_feat_index)
valset=cbt.Pool(data=x_val, label=y_val, cat_features=cat_feat_index)
testset=cbt.Pool(data=x_test, label=y_test, cat_features=cat_feat_index)

Trainset:	x-(69998, 5)	y-(69998,)
Valset:  	x-(10002, 5)  	y-(10002,)
Testset:	x-(19998, 5)	y-(19998,)


In [70]:
cbt_params = {
    "iterations": 5000,
    "early_stopping_rounds": 100,
    "learning_rate": 0.05,
    "random_seed": 510,
    "loss_function": "CrossEntropy",
    
    "od_type": "Iter",
    
    "max_depth": 6,
    # "l2_leaf_reg": 100,
    # "subsample": 0.8,
    
    # "min_data_in_leaf": 10,
    # "leaf_estimation_method": "Exact",
    # "grow_policy": "Depthwise",
    # "bootstrap_type": "Poisson",
    # "max_leaves": 30,
    
    # "thread_count": 24,
    "verbose": 100,
    # "task_type": "GPU"
}

In [71]:
model=cbt.CatBoostClassifier(**cbt_params)

model.fit(trainset, eval_set=valset)

0:	learn: 0.6748238	test: 0.6714616	best: 0.6714616 (0)	total: 21.5ms	remaining: 1m 47s
100:	learn: 0.4657882	test: 0.4145051	best: 0.4145051 (100)	total: 1.71s	remaining: 1m 22s
200:	learn: 0.4578758	test: 0.4081163	best: 0.4081163 (200)	total: 3.37s	remaining: 1m 20s
300:	learn: 0.4534900	test: 0.4062383	best: 0.4062340 (299)	total: 4.92s	remaining: 1m 16s
400:	learn: 0.4500548	test: 0.4054174	best: 0.4054112 (396)	total: 6.44s	remaining: 1m 13s
500:	learn: 0.4470739	test: 0.4048188	best: 0.4048188 (500)	total: 7.95s	remaining: 1m 11s
600:	learn: 0.4442287	test: 0.4044686	best: 0.4044618 (593)	total: 9.45s	remaining: 1m 9s
700:	learn: 0.4417441	test: 0.4042827	best: 0.4042724 (693)	total: 11s	remaining: 1m 7s
800:	learn: 0.4393144	test: 0.4041263	best: 0.4040939 (785)	total: 12.4s	remaining: 1m 5s
900:	learn: 0.4371743	test: 0.4041350	best: 0.4040672 (876)	total: 13.9s	remaining: 1m 3s
1000:	learn: 0.4350787	test: 0.4042275	best: 0.4040469 (930)	total: 15.4s	remaining: 1m 1s
Stopped 

<catboost.core.CatBoostClassifier at 0x146f6fca0>

In [72]:
y_train_pred=model.predict(x_train, verbose=True)
y_val_pred=model.predict(x_val, verbose=True)
y_pred=model.predict(x_test, verbose=True)

accuracy_score(y_train, y_train_pred)
accuracy_score(y_val, y_val_pred)
accuracy_score(y_test, y_pred)

0.8858538815394725

0.8095380923815237

0.7532253225322533

In [6]:
from sklearn.svm import SVC

svc=SVC()
svc.fit(x_train, y_train)

y_pred=svc.predict(x_test)
accuracy_score(y_test, y_pred)

0.4963496349634963