In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
from sklearn.preprocessing import OneHotEncoder

In [3]:
from sentence_transformers import SentenceTransformer,util

In [72]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings("ignore")

In [90]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [5]:
data = pd.read_csv("./Data/12345DDF_CLEAN_TEXT_TOP_10_HISTORICAL_TICKET_oldJSON.csv")

In [6]:
data.shape

(39767, 14)

In [7]:
data.columns

Index(['Unnamed: 0', 'CD_TICKET_CODE', 'CD_REPAIRER_LANGUAGE_ISO_CODE',
       'DS_REPAIRER_LANGUAGE_TICKET_ADDITIONAL_DESC',
       'ID_HISTORICAL_TICKET_ID', 'Qualification (complete)', 'Pole', 'FQT',
       'clean_text_1', 'pole', 'quali', 'fqt', 'clean_text_2', 'clean_text_3'],
      dtype='object')

In [8]:
data.isnull().sum()

Unnamed: 0                                      0
CD_TICKET_CODE                                  0
CD_REPAIRER_LANGUAGE_ISO_CODE                   0
DS_REPAIRER_LANGUAGE_TICKET_ADDITIONAL_DESC     0
ID_HISTORICAL_TICKET_ID                         0
Qualification (complete)                        0
Pole                                            0
FQT                                             0
clean_text_1                                   21
pole                                            0
quali                                           0
fqt                                             0
clean_text_2                                   37
clean_text_3                                   37
dtype: int64

In [9]:
data.dropna(inplace=True)

In [10]:
data.shape

(39730, 14)

In [11]:
data.columns

Index(['Unnamed: 0', 'CD_TICKET_CODE', 'CD_REPAIRER_LANGUAGE_ISO_CODE',
       'DS_REPAIRER_LANGUAGE_TICKET_ADDITIONAL_DESC',
       'ID_HISTORICAL_TICKET_ID', 'Qualification (complete)', 'Pole', 'FQT',
       'clean_text_1', 'pole', 'quali', 'fqt', 'clean_text_2', 'clean_text_3'],
      dtype='object')

In [12]:
data_required = data[['CD_TICKET_CODE','pole','quali','fqt','clean_text_3']]

### OHE for categorical columns

In [22]:
enc = OneHotEncoder()

In [24]:
data_required.nunique()

CD_TICKET_CODE    39730
pole                  7
quali               159
fqt                  77
clean_text_3      36993
dtype: int64

In [28]:
pole_encoding = enc.fit_transform(data_required[['pole']]).toarray()

In [31]:
enc.categories_

[array(['chbo', 'diag', 'doc', 'eecv', 'gdep', 'ghec', 'sb'], dtype=object)]

In [32]:
enc2 = OneHotEncoder()

In [33]:
quali_encoding = enc2.fit_transform(data_required[['quali']]).toarray()

In [34]:
enc3 = OneHotEncoder()

In [35]:
fqt_encoding = enc3.fit_transform(data_required[['fqt']]).toarray()

In [41]:
cat_array = np.concatenate((pole_encoding, quali_encoding,fqt_encoding), axis=1)

In [42]:
cat_array.shape

(39730, 243)

In [44]:
cat_cols_encodeslst = []

In [45]:
cat_cols_encodeslst.extend(['P_'+str(i) for i in range(7)])
cat_cols_encodeslst.extend(['Q_'+str(i) for i in range(159)])
cat_cols_encodeslst.extend(['F_'+str(i) for i in range(77)])

In [46]:
len(cat_cols_encodeslst)

243

In [47]:
emb_cat = pd.DataFrame(cat_array, columns=cat_cols_encodeslst)

In [49]:
emb_cat['CD_TICKET_CODE']=data_required['CD_TICKET_CODE'].copy()

In [51]:
#emb_cat.to_csv("./Embeddings/emb_categorical.csv")

### Model 1 : paraphrase-multilingual-MiniLM-L12-v2

In [4]:
model1 = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

In [17]:
corpus1 = data_required['clean_text_3'].to_list()

In [52]:
embedding1 = model1.encode(corpus1,show_progress_bar=True)

Batches:   0%|          | 0/1242 [00:00<?, ?it/s]

In [53]:
emb_text1 = pd.DataFrame(embedding1,columns=['S_'+str(i) for i in range(384)])

In [54]:
emb_text1['CD_TICKET_CODE']=data_required['CD_TICKET_CODE'].copy()

In [56]:
emb_text1.to_csv("./Embeddings/emb_text1.csv")

In [57]:
emb_text1

Unnamed: 0,S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_8,S_9,...,S_375,S_376,S_377,S_378,S_379,S_380,S_381,S_382,S_383,CD_TICKET_CODE
0,0.038719,0.166989,-0.020895,-0.020033,0.018764,-0.121065,0.322609,0.060488,-0.013489,0.039179,...,-0.072981,0.088391,0.049950,-0.215446,0.172824,0.047992,-0.110565,0.157034,0.143518,15127260.0
1,-0.116909,0.141321,-0.120040,-0.142872,0.010789,-0.043393,0.372332,0.068656,-0.166772,-0.006856,...,0.027697,0.160696,-0.156869,0.045663,0.291735,-0.077159,0.233958,0.012415,0.019386,13512177.0
2,-0.131309,0.197272,-0.175723,-0.011681,0.172575,-0.045742,0.213583,0.011284,0.084631,-0.183520,...,0.016953,-0.040434,-0.086093,-0.092723,0.097305,-0.114028,0.224396,0.140403,0.083872,14243438.0
3,-0.081782,0.102429,-0.015618,-0.129171,-0.043245,-0.042198,-0.054128,0.210862,-0.100120,0.105066,...,-0.327077,-0.175971,0.185393,0.053562,-0.003288,-0.106276,0.336823,0.070777,-0.082403,12387103.0
4,-0.098286,0.283100,0.007408,-0.125432,-0.087529,-0.193577,-0.000470,0.177315,0.066710,0.071587,...,-0.261716,-0.178521,-0.177009,0.051879,0.069604,-0.186850,0.177883,-0.044516,0.102267,12983505.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39725,-0.054549,0.170576,-0.178535,-0.190375,-0.251395,-0.100667,-0.362783,-0.048697,-0.291900,-0.193179,...,0.084983,-0.103559,-0.108693,-0.037309,-0.051086,-0.300938,0.206633,-0.323588,0.105633,10247209.0
39726,-0.280866,0.155575,-0.012592,-0.184807,-0.092238,-0.129203,-0.107878,0.062089,-0.055971,-0.028855,...,-0.250107,-0.023748,-0.067684,-0.130449,0.291489,-0.203542,0.111510,0.082959,0.124909,13003151.0
39727,-0.149251,0.191638,0.062686,-0.157634,-0.000242,-0.101876,0.198197,0.088623,-0.126824,0.071157,...,-0.133725,-0.067622,-0.023484,-0.012686,0.074284,-0.044050,0.125224,0.119888,0.139422,13147462.0
39728,-0.225650,0.266555,0.131707,0.149876,-0.016064,-0.181253,0.212904,0.173323,-0.086320,-0.022643,...,-0.128420,0.069382,0.058816,-0.051238,0.077373,0.029482,-0.181806,0.164547,0.205540,13081002.0


### Model 2 : xlm-roberta-large

In [5]:
model2 = SentenceTransformer('xlm-roberta-large')

No sentence-transformers model found with name C:\Users\v.sai.teja.kukunuri/.cache\torch\sentence_transformers\xlm-roberta-large. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at C:\Users\v.sai.teja.kukunuri/.cache\torch\sentence_transformers\xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [94]:
corpus2 = data_required['clean_text_3'].to_list()

In [95]:
embedding2 = model2.encode(corpus2,show_progress_bar=True)

Batches:   0%|          | 0/1242 [00:00<?, ?it/s]

In [96]:
embedding2.shape

(39730, 1024)

In [98]:
emb_text2 = pd.DataFrame(embedding2,columns=['S_'+str(i) for i in range(1024)])

In [99]:
emb_text2['CD_TICKET_CODE']=data_required['CD_TICKET_CODE'].copy()

In [100]:
emb_text2.to_csv("./Embeddings/emb_text2.csv")

### Model 3 : multilingual-e5-large

In [6]:
model3 = SentenceTransformer('intfloat/multilingual-e5-large')

In [116]:
corpus3 = data_required['clean_text_3'].to_list()

In [117]:
embedding3 = model3.encode(corpus3,show_progress_bar=True)

Batches:   0%|          | 0/1242 [00:00<?, ?it/s]

In [118]:
embedding3.shape

(39730, 1024)

In [119]:
emb_text3 = pd.DataFrame(embedding3,columns=['S_'+str(i) for i in range(embedding3.shape[1])])

In [120]:
emb_text3['CD_TICKET_CODE']=data_required['CD_TICKET_CODE'].copy()

In [121]:
emb_text3.to_csv("./Embeddings/emb_text3.csv")

### Model 4 : xlm-r-distilroberta-base-paraphrase-v1

In [7]:
model4 = SentenceTransformer('sentence-transformers/xlm-r-distilroberta-base-paraphrase-v1')

In [135]:
corpus4 = data_required['clean_text_3'].to_list()

In [136]:
embedding4 = model4.encode(corpus4,show_progress_bar=True)

Batches:   0%|          | 0/1242 [00:00<?, ?it/s]

In [137]:
embedding4.shape

(39730, 768)

In [138]:
emb_text4 = pd.DataFrame(embedding4,columns=['S_'+str(i) for i in range(embedding4.shape[1])])

In [139]:
emb_text4['CD_TICKET_CODE']=data_required['CD_TICKET_CODE'].copy()

In [140]:
emb_text4.to_csv("./Embeddings/emb_text4.csv")

model1 - 30 mins, model2 - 2hr 42mins, model3 - 2hr 21mins model4 - 1hr 11mins

### Classification

#### catboost

In [41]:
emb_cat = pd.read_csv("./Embeddings/emb_categorical.csv")

In [42]:
emb_text1 = pd.read_csv("./Embeddings/emb_text1.csv")

In [43]:
emb_cat.shape, emb_text1.shape

((39730, 245), (39730, 386))

In [44]:
emb_cat.drop(columns = ['Unnamed: 0'],inplace=True)
emb_text1.drop(columns = ['Unnamed: 0'],inplace=True)

In [45]:
emb_cat.reset_index(drop=True,inplace=True)
emb_text1.reset_index(drop=True,inplace=True)

In [53]:
emb_cat.shape, emb_text1.shape

((39730, 244), (39730, 385))

In [51]:
df1 = pd.concat([emb_text1,emb_cat],axis=1)

In [52]:
df1.shape

(39730, 629)

In [55]:
data.shape

(39730, 14)

In [61]:
final_data = df1.copy()

In [63]:
final_data['ID_HISTORICAL_TICKET_ID'] = data['ID_HISTORICAL_TICKET_ID']

In [65]:
final_data.drop(columns=['CD_TICKET_CODE'],inplace=True)

In [80]:
final_data.isnull().sum()

S_0                         0
S_1                         0
S_2                         0
S_3                         0
S_4                         0
                           ..
F_73                        0
F_74                        0
F_75                        0
F_76                        0
ID_HISTORICAL_TICKET_ID    37
Length: 628, dtype: int64

In [82]:
final_data.dropna(inplace=True)

In [83]:
final_data.shape

(39693, 628)

In [84]:
159+77+7+384+1

628

In [85]:
X1 = final_data.drop(columns = ['ID_HISTORICAL_TICKET_ID'])
y1 = final_data[['ID_HISTORICAL_TICKET_ID']]
print("Shape of X is %s and shape \
    of y is %s" % (X1.shape, y1.shape))

Shape of X is (39693, 627) and shape     of y is (39693, 1)


In [86]:
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(
    X1, y1, test_size=0.25, random_state=42)

In [87]:
X_train1, X_val1, Y_train1, Y_val1 = train_test_split(
    X_train1, Y_train1, test_size=0.25, random_state=42)

In [88]:

params = {
#             'learning_rate': 0.1,
#           'l2_leaf_reg': 3,
#           'iterations': 10,
          'random_seed': 42,
          'verbose':True,
#           'depth': 10,
          'use_best_model': True,
         }

catboost1 = CatBoostClassifier(**params)

In [89]:
catboost1.fit(X_train1, Y_train1, eval_set=(X_val1, Y_val1))

Learning rate set to 0.114896
0:	learn: 2.2755236	test: 2.2774387	best: 2.2774387 (0)	total: 3.21s	remaining: 53m 24s
1:	learn: 2.2541275	test: 2.2582143	best: 2.2582143 (1)	total: 6.3s	remaining: 52m 23s
2:	learn: 2.2375178	test: 2.2434000	best: 2.2434000 (2)	total: 9.55s	remaining: 52m 53s
3:	learn: 2.2244637	test: 2.2318490	best: 2.2318490 (3)	total: 12.6s	remaining: 52m 23s
4:	learn: 2.2136189	test: 2.2227921	best: 2.2227921 (4)	total: 15.8s	remaining: 52m 21s
5:	learn: 2.2046917	test: 2.2154973	best: 2.2154973 (5)	total: 19s	remaining: 52m 35s
6:	learn: 2.1970323	test: 2.2097957	best: 2.2097957 (6)	total: 22.3s	remaining: 52m 43s
7:	learn: 2.1902741	test: 2.2054464	best: 2.2054464 (7)	total: 26.6s	remaining: 55m 1s
8:	learn: 2.1840368	test: 2.2021284	best: 2.2021284 (8)	total: 29.6s	remaining: 54m 20s
9:	learn: 2.1792611	test: 2.1994349	best: 2.1994349 (9)	total: 32.6s	remaining: 53m 49s
10:	learn: 2.1747606	test: 2.1970154	best: 2.1970154 (10)	total: 35.3s	remaining: 52m 50s
11:	

<catboost.core.CatBoostClassifier at 0x1763b4f3820>

In [91]:
def scoring(model, x_train, x_test, y_train, y_test):
    y_pred_test = model.predict(x_test)
    y_pred_train = model.predict(x_train)
    print('TEST RESULT')
    print('Accuracy: %.3f' % accuracy_score(y_test, y_pred_test))
    print('Recall: %.3f' % recall_score(y_test, y_pred_test,average= 'macro'))
    print('Precision: %.3f' % precision_score(y_test, y_pred_test,average= 'macro'))
    print('F1 Score: %.3f' % f1_score(y_test, y_pred_test,average= 'macro'))
    print('TRAIN RESULTS')
    print('Accuracy: %.3f' % accuracy_score(y_train, y_pred_train))
    print('Recall: %.3f' % recall_score(y_train, y_pred_train,average= 'macro'))
    print('Precision: %.3f' % precision_score(y_train, y_pred_train,average= 'macro'))
    print('F1 Score: %.3f' % f1_score(y_train, y_pred_train,average= 'macro'))

In [92]:
scoring(catboost1,X_train1,X_test1,Y_train1,Y_test1)

TEST RESULT
Accuracy: 0.193
Recall: 0.101
Precision: 0.051
F1 Score: 0.045
TRAIN RESULTS
Accuracy: 0.215
Recall: 0.114
Precision: 0.087
F1 Score: 0.061


In [101]:
df2 = pd.concat([emb_text2,emb_cat],axis=1)

In [102]:
final_data2 = df2.copy()

In [104]:
final_data2['ID_HISTORICAL_TICKET_ID'] = data['ID_HISTORICAL_TICKET_ID']

In [105]:
final_data2.drop(columns=['CD_TICKET_CODE'],inplace=True)

In [106]:
final_data2.dropna(inplace=True)

In [107]:
final_data2.shape

(39693, 1268)

In [108]:
159+77+7+1024+1

1268

In [109]:
X2 = final_data2.drop(columns = ['ID_HISTORICAL_TICKET_ID'])
y2 = final_data2[['ID_HISTORICAL_TICKET_ID']]
print("Shape of X is %s and shape \
    of y is %s" % (X2.shape, y2.shape))

Shape of X is (39693, 1267) and shape     of y is (39693, 1)


In [110]:
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(
    X2, y2, test_size=0.25, random_state=42)

In [111]:
X_train2, X_val2, Y_train2, Y_val2 = train_test_split(
    X_train2, Y_train2, test_size=0.25, random_state=42)

In [112]:
params = {
#             'learning_rate': 0.1,
#           'l2_leaf_reg': 3,
#           'iterations': 10,
          'random_seed': 42,
          'verbose':True,
#           'depth': 10,
          'use_best_model': True,
         }

catboost2 = CatBoostClassifier(**params)

In [113]:
catboost2.fit(X_train2, Y_train2, eval_set=(X_val2, Y_val2))

Learning rate set to 0.114896
0:	learn: 2.2755074	test: 2.2773298	best: 2.2773298 (0)	total: 4.37s	remaining: 1h 12m 46s
1:	learn: 2.2545784	test: 2.2581183	best: 2.2581183 (1)	total: 8.01s	remaining: 1h 6m 36s
2:	learn: 2.2377482	test: 2.2434165	best: 2.2434165 (2)	total: 11.8s	remaining: 1h 5m 13s
3:	learn: 2.2245786	test: 2.2319031	best: 2.2319031 (3)	total: 15.4s	remaining: 1h 4m 5s
4:	learn: 2.2137449	test: 2.2228621	best: 2.2228621 (4)	total: 19.3s	remaining: 1h 4m 6s
5:	learn: 2.2056800	test: 2.2156291	best: 2.2156291 (5)	total: 22.9s	remaining: 1h 3m 11s
6:	learn: 2.1980842	test: 2.2100025	best: 2.2100025 (6)	total: 26.7s	remaining: 1h 3m 8s
7:	learn: 2.1919039	test: 2.2056306	best: 2.2056306 (7)	total: 30.4s	remaining: 1h 2m 49s
8:	learn: 2.1867026	test: 2.2024288	best: 2.2024288 (8)	total: 34s	remaining: 1h 2m 27s
9:	learn: 2.1821907	test: 2.1999190	best: 2.1999190 (9)	total: 37.7s	remaining: 1h 2m 11s
10:	learn: 2.1779146	test: 2.1977915	best: 2.1977915 (10)	total: 41.4s	rem

<catboost.core.CatBoostClassifier at 0x1763a0be560>

In [114]:
scoring(catboost2,X_train2,X_test2,Y_train2,Y_test2)

TEST RESULT
Accuracy: 0.194
Recall: 0.101
Precision: 0.055
F1 Score: 0.042
TRAIN RESULTS
Accuracy: 0.209
Recall: 0.109
Precision: 0.091
F1 Score: 0.052


In [122]:
df3 = pd.concat([emb_text3,emb_cat],axis=1)

In [123]:
final_data3 = df3.copy()

In [124]:
final_data3['ID_HISTORICAL_TICKET_ID'] = data['ID_HISTORICAL_TICKET_ID']

In [125]:
final_data3.drop(columns=['CD_TICKET_CODE'],inplace=True)

In [126]:
final_data3.dropna(inplace=True)

In [127]:
final_data3.shape

(39693, 1268)

In [128]:
X3 = final_data3.drop(columns = ['ID_HISTORICAL_TICKET_ID'])
y3 = final_data3[['ID_HISTORICAL_TICKET_ID']]
print("Shape of X is %s and shape \
    of y is %s" % (X3.shape, y3.shape))

Shape of X is (39693, 1267) and shape     of y is (39693, 1)


In [129]:
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(
    X3, y3, test_size=0.25, random_state=42)

In [130]:
X_train3, X_val3, Y_train3, Y_val3 = train_test_split(
    X_train3, Y_train3, test_size=0.25, random_state=42)

In [131]:
params = {
#             'learning_rate': 0.1,
#           'l2_leaf_reg': 3,
#           'iterations': 10,
          'random_seed': 42,
          'verbose':True,
#           'depth': 10,
          'use_best_model': True,
         }

catboost3 = CatBoostClassifier(**params)

In [132]:
catboost3.fit(X_train3, Y_train3, eval_set=(X_val3, Y_val3))

Learning rate set to 0.114896
0:	learn: 2.2753219	test: 2.2777756	best: 2.2777756 (0)	total: 2.11s	remaining: 35m 9s
1:	learn: 2.2539139	test: 2.2584878	best: 2.2584878 (1)	total: 3.94s	remaining: 32m 49s
2:	learn: 2.2368582	test: 2.2435191	best: 2.2435191 (2)	total: 5.79s	remaining: 32m 4s
3:	learn: 2.2235557	test: 2.2319805	best: 2.2319805 (3)	total: 7.64s	remaining: 31m 42s
4:	learn: 2.2124724	test: 2.2228652	best: 2.2228652 (4)	total: 9.58s	remaining: 31m 47s
5:	learn: 2.2034820	test: 2.2157029	best: 2.2157029 (5)	total: 11.3s	remaining: 31m 11s
6:	learn: 2.1963767	test: 2.2101261	best: 2.2101261 (6)	total: 13s	remaining: 30m 46s
7:	learn: 2.1900538	test: 2.2059201	best: 2.2059201 (7)	total: 14.7s	remaining: 30m 22s
8:	learn: 2.1850316	test: 2.2023205	best: 2.2023205 (8)	total: 16.4s	remaining: 30m 9s
9:	learn: 2.1804231	test: 2.1997708	best: 2.1997708 (9)	total: 18.2s	remaining: 30m 2s
10:	learn: 2.1766201	test: 2.1974925	best: 2.1974925 (10)	total: 20.4s	remaining: 30m 37s
11:	le

<catboost.core.CatBoostClassifier at 0x1763a332710>

In [133]:
scoring(catboost3,X_train3,X_test3,Y_train3,Y_test3)

TEST RESULT
Accuracy: 0.193
Recall: 0.101
Precision: 0.054
F1 Score: 0.045
TRAIN RESULTS
Accuracy: 0.219
Recall: 0.116
Precision: 0.097
F1 Score: 0.064


In [141]:
df4 = pd.concat([emb_text4,emb_cat],axis=1)

In [142]:
final_data4 = df4.copy()

In [143]:
final_data4['ID_HISTORICAL_TICKET_ID'] = data['ID_HISTORICAL_TICKET_ID']

In [144]:
final_data4.drop(columns=['CD_TICKET_CODE'],inplace=True)

In [145]:
final_data4.dropna(inplace=True)

In [146]:
final_data4.shape

(39693, 1012)

In [147]:
X4 = final_data4.drop(columns = ['ID_HISTORICAL_TICKET_ID'])
y4 = final_data4[['ID_HISTORICAL_TICKET_ID']]
print("Shape of X is %s and shape \
    of y is %s" % (X4.shape, y4.shape))

Shape of X is (39693, 1011) and shape     of y is (39693, 1)


In [148]:
X_train4, X_test4, Y_train4, Y_test4 = train_test_split(
    X4, y4, test_size=0.25, random_state=42)

In [149]:
X_train4, X_val4, Y_train4, Y_val4 = train_test_split(
    X_train4, Y_train4, test_size=0.25, random_state=42)

In [150]:
params = {
#             'learning_rate': 0.1,
#           'l2_leaf_reg': 3,
#           'iterations': 10,
          'random_seed': 42,
          'verbose':True,
#           'depth': 10,
          'use_best_model': True,
         }

catboost4 = CatBoostClassifier(**params)

In [151]:
catboost4.fit(X_train4, Y_train4, eval_set=(X_val4, Y_val4))

Learning rate set to 0.114896
0:	learn: 2.2754764	test: 2.2776836	best: 2.2776836 (0)	total: 4.3s	remaining: 1h 11m 40s
1:	learn: 2.2541876	test: 2.2581429	best: 2.2581429 (1)	total: 8.78s	remaining: 1h 12m 59s
2:	learn: 2.2373087	test: 2.2430152	best: 2.2430152 (2)	total: 13.1s	remaining: 1h 12m 17s
3:	learn: 2.2238094	test: 2.2313973	best: 2.2313973 (3)	total: 16.2s	remaining: 1h 7m 6s
4:	learn: 2.2130190	test: 2.2224089	best: 2.2224089 (4)	total: 19.1s	remaining: 1h 3m 11s
5:	learn: 2.2037533	test: 2.2151015	best: 2.2151015 (5)	total: 21.9s	remaining: 1h 27s
6:	learn: 2.1961293	test: 2.2094984	best: 2.2094984 (6)	total: 24.9s	remaining: 58m 46s
7:	learn: 2.1898711	test: 2.2051117	best: 2.2051117 (7)	total: 28s	remaining: 57m 54s
8:	learn: 2.1849235	test: 2.2013162	best: 2.2013162 (8)	total: 30.7s	remaining: 56m 17s
9:	learn: 2.1802927	test: 2.1984620	best: 2.1984620 (9)	total: 33.5s	remaining: 55m 18s
10:	learn: 2.1763222	test: 2.1958127	best: 2.1958127 (10)	total: 36.4s	remaining: 

<catboost.core.CatBoostClassifier at 0x1763a331ae0>

In [152]:
scoring(catboost4,X_train4,X_test4,Y_train4,Y_test4)

TEST RESULT
Accuracy: 0.194
Recall: 0.101
Precision: 0.057
F1 Score: 0.045
TRAIN RESULTS
Accuracy: 0.214
Recall: 0.113
Precision: 0.091
F1 Score: 0.061


In [8]:
model1

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [9]:
model2

SentenceTransformer(
  (0): Transformer({'max_seq_length': 514, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [10]:
model3

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [11]:
model4

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)