In [4]:
import pandas as pd
from catboost import Pool, CatBoostClassifier
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split 

%matplotlib inline

In [11]:
data_path = "./data"
file_data = "train_all.csv"
file_data = os.path.join(data_path, file_data)

In [12]:
data = pd.read_csv(file_data)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497819 entries, 0 to 497818
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   pair_id       497819 non-null  int64 
 1   name_1        497819 non-null  object
 2   name_2        497819 non-null  object
 3   is_duplicate  497819 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 15.2+ MB


In [13]:
data.head()

Unnamed: 0,pair_id,name_1,name_2,is_duplicate
0,1,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
1,2,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
2,3,"Rishichem Distributors Pvt., Ltd.",Dsa,0
3,4,Powermax Rubber Factory,Co. One,0
4,5,Tress A/S,Longyou Industries Park Zhejiang,0


In [14]:
data.is_duplicate.value_counts()

0    494161
1      3658
Name: is_duplicate, dtype: int64

In [15]:
data["text"] = data["name_1"] + data["name_2"]
data.head()

Unnamed: 0,pair_id,name_1,name_2,is_duplicate,text
0,1,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0,Iko Industries Ltd.Enormous Industrial Trade P...
1,2,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0,Apcotex Industries Ltd.Technocraft Industries ...
2,3,"Rishichem Distributors Pvt., Ltd.",Dsa,0,"Rishichem Distributors Pvt., Ltd.Dsa"
3,4,Powermax Rubber Factory,Co. One,0,Powermax Rubber FactoryCo. One
4,5,Tress A/S,Longyou Industries Park Zhejiang,0,Tress A/SLongyou Industries Park Zhejiang


In [16]:
df_train, df_test = train_test_split(
    data,
    test_size=0.2, 
    stratify=data["is_duplicate"],
    random_state=42
)


In [17]:
df_train.shape, df_test.shape

((398255, 5), (99564, 5))

In [18]:
df_train.is_duplicate.value_counts(), df_test.is_duplicate.value_counts()

(0    395329
 1      2926
 Name: is_duplicate, dtype: int64,
 0    98832
 1      732
 Name: is_duplicate, dtype: int64)

In [23]:
cols = ["text", "is_duplicate"]
df_train.to_csv(os.path.join(data_path, "train_for_catboost.csv"), index=0, columns=cols)
df_test.to_csv(os.path.join(data_path, "test_for_catboost.csv"), index=0, columns=cols)

### Обучим

In [24]:
train = pd.read_csv(os.path.join(data_path, "train_for_catboost.csv"))
train.head()

Unnamed: 0,text,is_duplicate
0,Buddha Metal IndustriesChin Tung Chemical Indu...,0
1,"Tianjin Kuo Cheng Rubber Industry Co., Ltd.R.I...",0
2,"DsiKut Shing Chemical Trading Co., Ltd., Dongg...",0
3,"Edwards India Pvt., Ltd.H & V Advanced Materia...",0
4,Pam Sp AShanghai Jun Yi Imp. & Exp. . Trade Co...,0


In [25]:
def fit_catboost(
    X_train, 
    X_test, 
    y_train, 
    y_test, 
    catboost_params = {},
    verbose = 100
):
    learn_pool = Pool(
        X_train, 
        y_train, 
        text_features=["text"], 
        feature_names=["text"]
    )
    test_pool = Pool(
        X_test, 
        y_test, 
        text_features=["text"],
        feature_names=["text"]
    )
    catboost_default_params = {
        'iterations': 500,
        'learning_rate': 0.05,
        'eval_metric': 'F1',
        'task_type': 'GPU',
        'use_best_model': True
    }
    catboost_default_params.update(catboost_params)
    
    model = CatBoostClassifier(**catboost_default_params)
    model.fit(learn_pool, eval_set=test_pool, verbose=verbose)
    return model



In [27]:
X_train, X_val, y_train, y_val = train_test_split(
    train[["text"]],
    train["is_duplicate"],
    test_size=0.3, 
    stratify=train["is_duplicate"],
    random_state=42
)
cat_boost_model = fit_catboost(X_train, X_val, y_train, y_val)

0:	learn: 0.4365420	test: 0.5831533	best: 0.5831533 (0)	total: 20.4ms	remaining: 10.2s
100:	learn: 0.5170396	test: 0.5908063	best: 0.5908063 (99)	total: 1.25s	remaining: 4.94s
200:	learn: 0.5802025	test: 0.6590422	best: 0.6590422 (200)	total: 2.46s	remaining: 3.65s
300:	learn: 0.6019981	test: 0.6839161	best: 0.6839161 (298)	total: 3.67s	remaining: 2.42s
400:	learn: 0.6315789	test: 0.7019231	best: 0.7019231 (398)	total: 4.94s	remaining: 1.22s
499:	learn: 0.6496577	test: 0.7028886	best: 0.7051546 (486)	total: 6.14s	remaining: 0us
bestTest = 0.7051546392
bestIteration = 486
Shrink model to first 487 iterations.


In [28]:
test = pd.read_csv(os.path.join(data_path, "test_for_catboost.csv"))
test.head()

Unnamed: 0,text,is_duplicate
0,ALPHA TRADINGPİKASAN,0
1,"I B InternationalFootwear Co., Ltd., Dongguan ...",0
2,Mti LlcMarmot Mountain Llc,0
3,Rubber & Plastic Inc.Nad International S A De C V,0
4,V Logistics Inc.Crossmotion Logistics S.A. De ...,0


In [30]:
X_test, y_test = test[["text"]], test["is_duplicate"]
X_test.head()

Unnamed: 0,text
0,ALPHA TRADINGPİKASAN
1,"I B InternationalFootwear Co., Ltd., Dongguan ..."
2,Mti LlcMarmot Mountain Llc
3,Rubber & Plastic Inc.Nad International S A De C V
4,V Logistics Inc.Crossmotion Logistics S.A. De ...


In [32]:
import sklearn.metrics as sk_met # для оценки модели 

y_pred = cat_boost_model.predict(X_test)
print(
    'F1-score на тестовой выборке: {:.3f} \n'
    .format(
        sk_met.f1_score(
            y_test, 
            y_pred, 
            average = 'macro')
    )
)

F1-score на тестовой выборке: 0.858 



In [34]:
y_test.sum()

732

In [35]:
y_pred.sum()

481

In [36]:
y_test.shape

(99564,)

In [38]:

sum(y_test == y_pred)

99221

In [39]:
y_test.shape[0] - sum(y_test == y_pred)

343

In [40]:
y_test.sum() - y_pred.sum()

251