In [169]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
# Let pandas render up to 100 rows and 100 columns before truncating output.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# breast_w

In [71]:
# Read the breast_w CSV, using its first column as the row index.
data = pd.read_csv(
    filepath_or_buffer='./data/breast_w/clean_breast_w.csv',
    index_col=0,
)

In [72]:
# Preview the first rows of the loaded breast_w frame.
data.head()

Unnamed: 0,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhension,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [73]:
# Turn the '?' placeholder into a real missing value, then drop incomplete rows.
# np.nan is passed explicitly: older pandas releases ignore an explicit
# value=None and forward-fill the matches instead, which would leave no NaN
# for dropna() to remove.
data = data.replace('?', np.nan).dropna()

In [74]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    data,
    random_state=42,
    test_size=0.1,
)

In [75]:
# Split each partition into features (everything but 'class') and the 'class' target.
x_train, y_train = train_df.drop(columns='class'), train_df['class']
x_test, y_test = test_df.drop(columns='class'), test_df['class']

In [76]:
%%time
cat_boost = CatBoostClassifier(verbose=False)
cat_boost = cat_boost.fit(x_train, y_train)

CPU times: total: 13.6 s
Wall time: 1.17 s


In [77]:
# Predict class labels for the held-out breast_w rows.
y_pred = cat_boost.predict(x_test)

In [78]:
# Share of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9855072463768116

In [93]:
# Recall with label 4 treated as the positive class.
recall_score(y_true=y_test, y_pred=y_pred, pos_label=4)

0.9666666666666667

In [95]:
# Precision with label 4 treated as the positive class.
precision_score(y_true=y_test, y_pred=y_pred, pos_label=4)

1.0

In [118]:
# F1 score with label 4 treated as the positive class.
f1_score(y_true=y_test, y_pred=y_pred, pos_label=4)

0.983050847457627

# gender

In [139]:
# Read the gender survey CSV, using its first column as the row index.
data = pd.read_csv(
    filepath_or_buffer='./data/gender/gender.csv',
    index_col=0,
)

In [140]:
# Preview the first rows of the gender frame.
data.head()

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink,Gender
0,Cool,Rock,Vodka,7UP/Sprite,F
1,Neutral,Hip hop,Vodka,Coca Cola/Pepsi,F
2,Warm,Rock,Wine,Coca Cola/Pepsi,F
3,Warm,Folk/Traditional,Whiskey,Fanta,F
4,Cool,Rock,Vodka,Coca Cola/Pepsi,F


In [141]:
# Show column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66 entries, 0 to 65
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Favorite Color        66 non-null     object
 1   Favorite Music Genre  66 non-null     object
 2   Favorite Beverage     66 non-null     object
 3   Favorite Soft Drink   66 non-null     object
 4   Gender                66 non-null     object
dtypes: object(5)
memory usage: 3.1+ KB


In [142]:
# List the distinct values of 'Favorite Color'.
data['Favorite Color'].unique()

array(['Cool', 'Neutral', 'Warm'], dtype=object)

In [143]:
# List the distinct values of 'Favorite Music Genre'.
data['Favorite Music Genre'].unique()

array(['Rock', 'Hip hop', 'Folk/Traditional', 'Jazz/Blues', 'Pop',
       'Electronic', 'R&B and soul'], dtype=object)

In [144]:
# List the distinct values of 'Favorite Beverage'.
data['Favorite Beverage'].unique()

array(['Vodka', 'Wine', 'Whiskey', "Doesn't drink", 'Beer', 'Other'],
      dtype=object)

In [145]:
# List the distinct values of 'Favorite Soft Drink'.
data['Favorite Soft Drink'].unique()

array(['7UP/Sprite', 'Coca Cola/Pepsi', 'Fanta', 'Other'], dtype=object)

In [146]:
# List the distinct labels of the 'Gender' target column.
data['Gender'].unique()

array(['F', 'M'], dtype=object)

In [147]:
# Show the first rows again before splitting (same call as earlier in this section).
data.head()

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink,Gender
0,Cool,Rock,Vodka,7UP/Sprite,F
1,Neutral,Hip hop,Vodka,Coca Cola/Pepsi,F
2,Warm,Rock,Wine,Coca Cola/Pepsi,F
3,Warm,Folk/Traditional,Whiskey,Fanta,F
4,Cool,Rock,Vodka,Coca Cola/Pepsi,F


In [150]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    data,
    random_state=42,
    test_size=0.1,
)

In [151]:
# Positions 0-3 are passed to CatBoost as categorical feature indices when fitting.
cat_indexes = list(range(4))
# Split each partition into features (everything but 'Gender') and the 'Gender' target.
x_train, y_train = train_df.drop(columns='Gender'), train_df['Gender']
x_test, y_test = test_df.drop(columns='Gender'), test_df['Gender']

In [153]:
%%time
cat_boost = CatBoostClassifier(verbose=False)
cat_boost = cat_boost.fit(x_train, y_train, cat_indexes)

Learning rate set to 0.003077
0:	learn: 0.6921041	total: 29.1ms	remaining: 29.1s
1:	learn: 0.6914313	total: 62.1ms	remaining: 31s
2:	learn: 0.6908038	total: 89.9ms	remaining: 29.9s
3:	learn: 0.6900359	total: 118ms	remaining: 29.4s
4:	learn: 0.6890999	total: 152ms	remaining: 30.2s
5:	learn: 0.6885395	total: 198ms	remaining: 32.8s
6:	learn: 0.6874025	total: 231ms	remaining: 32.7s
7:	learn: 0.6866383	total: 264ms	remaining: 32.8s
8:	learn: 0.6861279	total: 281ms	remaining: 30.9s
9:	learn: 0.6848619	total: 314ms	remaining: 31.1s
10:	learn: 0.6846566	total: 331ms	remaining: 29.8s
11:	learn: 0.6837002	total: 365ms	remaining: 30s
12:	learn: 0.6829816	total: 399ms	remaining: 30.3s
13:	learn: 0.6822836	total: 433ms	remaining: 30.5s
14:	learn: 0.6813561	total: 483ms	remaining: 31.7s
15:	learn: 0.6807401	total: 517ms	remaining: 31.8s
16:	learn: 0.6798607	total: 550ms	remaining: 31.8s
17:	learn: 0.6794767	total: 573ms	remaining: 31.3s
18:	learn: 0.6794150	total: 592ms	remaining: 30.6s
19:	learn: 0

167:	learn: 0.5830989	total: 5.37s	remaining: 26.6s
168:	learn: 0.5826496	total: 5.39s	remaining: 26.5s
169:	learn: 0.5820775	total: 5.42s	remaining: 26.5s
170:	learn: 0.5814762	total: 5.47s	remaining: 26.5s
171:	learn: 0.5814760	total: 5.47s	remaining: 26.3s
172:	learn: 0.5808568	total: 5.51s	remaining: 26.3s
173:	learn: 0.5802708	total: 5.54s	remaining: 26.3s
174:	learn: 0.5797376	total: 5.58s	remaining: 26.3s
175:	learn: 0.5796695	total: 5.59s	remaining: 26.2s
176:	learn: 0.5790261	total: 5.62s	remaining: 26.1s
177:	learn: 0.5784504	total: 5.65s	remaining: 26.1s
178:	learn: 0.5783625	total: 5.68s	remaining: 26.1s
179:	learn: 0.5777507	total: 5.72s	remaining: 26s
180:	learn: 0.5776901	total: 5.73s	remaining: 25.9s
181:	learn: 0.5774392	total: 5.75s	remaining: 25.8s
182:	learn: 0.5766403	total: 5.78s	remaining: 25.8s
183:	learn: 0.5757271	total: 5.81s	remaining: 25.8s
184:	learn: 0.5753472	total: 5.85s	remaining: 25.8s
185:	learn: 0.5752142	total: 5.87s	remaining: 25.7s
186:	learn: 0.

330:	learn: 0.5035209	total: 10.5s	remaining: 21.2s
331:	learn: 0.5030253	total: 10.5s	remaining: 21.1s
332:	learn: 0.5026365	total: 10.5s	remaining: 21.1s
333:	learn: 0.5022752	total: 10.6s	remaining: 21.1s
334:	learn: 0.5018842	total: 10.6s	remaining: 21.1s
335:	learn: 0.5013478	total: 10.7s	remaining: 21.1s
336:	learn: 0.5008182	total: 10.7s	remaining: 21s
337:	learn: 0.5005456	total: 10.7s	remaining: 21s
338:	learn: 0.5001532	total: 10.7s	remaining: 21s
339:	learn: 0.4999368	total: 10.8s	remaining: 20.9s
340:	learn: 0.4995364	total: 10.8s	remaining: 20.9s
341:	learn: 0.4989614	total: 10.8s	remaining: 20.9s
342:	learn: 0.4984093	total: 10.9s	remaining: 20.9s
343:	learn: 0.4980832	total: 10.9s	remaining: 20.8s
344:	learn: 0.4977050	total: 10.9s	remaining: 20.8s
345:	learn: 0.4969355	total: 11s	remaining: 20.8s
346:	learn: 0.4965545	total: 11s	remaining: 20.7s
347:	learn: 0.4960757	total: 11.1s	remaining: 20.7s
348:	learn: 0.4956612	total: 11.1s	remaining: 20.7s
349:	learn: 0.4956337	

491:	learn: 0.4379561	total: 15.8s	remaining: 16.3s
492:	learn: 0.4373647	total: 15.8s	remaining: 16.3s
493:	learn: 0.4371346	total: 15.9s	remaining: 16.3s
494:	learn: 0.4366843	total: 15.9s	remaining: 16.2s
495:	learn: 0.4363934	total: 15.9s	remaining: 16.2s
496:	learn: 0.4363359	total: 16s	remaining: 16.1s
497:	learn: 0.4359897	total: 16s	remaining: 16.1s
498:	learn: 0.4354721	total: 16s	remaining: 16.1s
499:	learn: 0.4349756	total: 16.1s	remaining: 16.1s
500:	learn: 0.4349756	total: 16.1s	remaining: 16s
501:	learn: 0.4345022	total: 16.1s	remaining: 16s
502:	learn: 0.4342632	total: 16.2s	remaining: 16s
503:	learn: 0.4338096	total: 16.2s	remaining: 15.9s
504:	learn: 0.4336299	total: 16.2s	remaining: 15.9s
505:	learn: 0.4332157	total: 16.3s	remaining: 15.9s
506:	learn: 0.4327571	total: 16.3s	remaining: 15.8s
507:	learn: 0.4319018	total: 16.3s	remaining: 15.8s
508:	learn: 0.4311513	total: 16.4s	remaining: 15.8s
509:	learn: 0.4306287	total: 16.4s	remaining: 15.8s
510:	learn: 0.4299306	to

655:	learn: 0.3802819	total: 21.2s	remaining: 11.1s
656:	learn: 0.3800229	total: 21.2s	remaining: 11.1s
657:	learn: 0.3796991	total: 21.3s	remaining: 11.1s
658:	learn: 0.3795072	total: 21.3s	remaining: 11s
659:	learn: 0.3789804	total: 21.3s	remaining: 11s
660:	learn: 0.3788366	total: 21.4s	remaining: 11s
661:	learn: 0.3787741	total: 21.4s	remaining: 10.9s
662:	learn: 0.3784942	total: 21.4s	remaining: 10.9s
663:	learn: 0.3782091	total: 21.5s	remaining: 10.9s
664:	learn: 0.3780130	total: 21.5s	remaining: 10.8s
665:	learn: 0.3778053	total: 21.5s	remaining: 10.8s
666:	learn: 0.3774626	total: 21.6s	remaining: 10.8s
667:	learn: 0.3769243	total: 21.6s	remaining: 10.8s
668:	learn: 0.3768262	total: 21.7s	remaining: 10.7s
669:	learn: 0.3761743	total: 21.7s	remaining: 10.7s
670:	learn: 0.3759993	total: 21.8s	remaining: 10.7s
671:	learn: 0.3758747	total: 21.8s	remaining: 10.6s
672:	learn: 0.3753884	total: 21.9s	remaining: 10.6s
673:	learn: 0.3750421	total: 21.9s	remaining: 10.6s
674:	learn: 0.3747

818:	learn: 0.3383500	total: 26.5s	remaining: 5.85s
819:	learn: 0.3382549	total: 26.5s	remaining: 5.82s
820:	learn: 0.3380711	total: 26.5s	remaining: 5.78s
821:	learn: 0.3377581	total: 26.6s	remaining: 5.75s
822:	learn: 0.3372346	total: 26.6s	remaining: 5.72s
823:	learn: 0.3370700	total: 26.6s	remaining: 5.69s
824:	learn: 0.3367319	total: 26.7s	remaining: 5.66s
825:	learn: 0.3364005	total: 26.7s	remaining: 5.62s
826:	learn: 0.3360898	total: 26.7s	remaining: 5.59s
827:	learn: 0.3359631	total: 26.8s	remaining: 5.56s
828:	learn: 0.3354611	total: 26.8s	remaining: 5.53s
829:	learn: 0.3354611	total: 26.8s	remaining: 5.49s
830:	learn: 0.3352501	total: 26.9s	remaining: 5.46s
831:	learn: 0.3349967	total: 26.9s	remaining: 5.43s
832:	learn: 0.3347622	total: 26.9s	remaining: 5.4s
833:	learn: 0.3343527	total: 26.9s	remaining: 5.36s
834:	learn: 0.3340606	total: 27s	remaining: 5.33s
835:	learn: 0.3337953	total: 27s	remaining: 5.3s
836:	learn: 0.3335491	total: 27s	remaining: 5.27s
837:	learn: 0.333259

983:	learn: 0.2981757	total: 31.8s	remaining: 518ms
984:	learn: 0.2981757	total: 31.8s	remaining: 485ms
985:	learn: 0.2979554	total: 31.9s	remaining: 453ms
986:	learn: 0.2978944	total: 31.9s	remaining: 420ms
987:	learn: 0.2978185	total: 31.9s	remaining: 388ms
988:	learn: 0.2976197	total: 32s	remaining: 356ms
989:	learn: 0.2972177	total: 32s	remaining: 323ms
990:	learn: 0.2968543	total: 32s	remaining: 291ms
991:	learn: 0.2965119	total: 32.1s	remaining: 259ms
992:	learn: 0.2963405	total: 32.1s	remaining: 227ms
993:	learn: 0.2959092	total: 32.2s	remaining: 194ms
994:	learn: 0.2957986	total: 32.2s	remaining: 162ms
995:	learn: 0.2956002	total: 32.2s	remaining: 129ms
996:	learn: 0.2953323	total: 32.3s	remaining: 97.1ms
997:	learn: 0.2950865	total: 32.3s	remaining: 64.7ms
998:	learn: 0.2950517	total: 32.3s	remaining: 32.4ms
999:	learn: 0.2948616	total: 32.4s	remaining: 0us
CPU times: total: 1min 3s
Wall time: 32.6 s


In [154]:
# Predict labels for the held-out gender rows.
y_pred = cat_boost.predict(x_test)

In [155]:
# Share of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.42857142857142855

In [158]:
# Recall with 'M' treated as the positive class.
recall_score(y_true=y_test, y_pred=y_pred, pos_label='M')

0.25

In [159]:
# Precision with 'M' treated as the positive class.
precision_score(y_true=y_test, y_pred=y_pred, pos_label='M')

0.5

In [160]:
# F1 score with 'M' treated as the positive class.
f1_score(y_true=y_test, y_pred=y_pred, pos_label='M')

0.3333333333333333

# german

In [161]:
# The file is read as a single text column (no header row); each line is then
# split on spaces into one column per field and the field names are attached.
data = (
    pd.read_csv('./data/german/german.data', header=None)
    .iloc[:, 0]
    .str.split(' ', expand=True)
)
data.columns = 'status_of_existing_checking_account duration_(months) credit_history purpose credit_amount savings_account/bonds present_employment_since installment_rate personal_status_sex other_debtors present_residence property age other_installment_plans housing number_of_existing_credits job no._of_people_being_liable_to_provide_maintenance telephone foreign_worker class'.split(' ')

In [162]:
# Show column dtypes and non-null counts after splitting the raw lines.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                                             Non-Null Count  Dtype 
---  ------                                             --------------  ----- 
 0   status_of_existing_checking_account                1000 non-null   object
 1   duration_(months)                                  1000 non-null   object
 2   credit_history                                     1000 non-null   object
 3   purpose                                            1000 non-null   object
 4   credit_amount                                      1000 non-null   object
 5   savings_account/bonds                              1000 non-null   object
 6   present_employment_since                           1000 non-null   object
 7   installment_rate                                   1000 non-null   object
 8   personal_status_sex                                1000 non-null   object
 9   other_debtors       

In [170]:
# Summarise every column (count / unique / top / freq while all columns are still strings).
data.describe()

Unnamed: 0,status_of_existing_checking_account,duration_(months),credit_history,purpose,credit_amount,savings_account/bonds,present_employment_since,installment_rate,personal_status_sex,other_debtors,present_residence,property,age,other_installment_plans,housing,number_of_existing_credits,job,no._of_people_being_liable_to_provide_maintenance,telephone,foreign_worker,class
count,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000
unique,4,33,5,10,921,5,5,4,4,3,4,4,53,3,3,4,4,2,2,2,2
top,A14,24,A32,A43,1478,A61,A73,4,A93,A101,4,A123,27,A143,A152,1,A173,1,A191,A201,1
freq,394,184,530,280,3,603,339,476,548,907,413,332,51,814,713,633,630,845,596,963,700


In [112]:
# List the distinct values of 'duration_(months)' (still stored as strings at this point).
data['duration_(months)'].unique()

array(['6', '48', '12', '42', '24', '36', '30', '15', '9', '10', '7',
       '60', '18', '45', '11', '27', '8', '54', '20', '14', '33', '21',
       '16', '4', '47', '13', '22', '39', '28', '5', '26', '72', '40'],
      dtype=object)

In [172]:
# Columns holding numeric quantities; despite the variable name they are cast to float.
int_columns = ['duration_(months)', 'credit_amount', 'age']
data = data.astype({column: 'float' for column in int_columns})

In [174]:
# Re-check dtypes to confirm the numeric columns are now float64.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   status_of_existing_checking_account                1000 non-null   object 
 1   duration_(months)                                  1000 non-null   float64
 2   credit_history                                     1000 non-null   object 
 3   purpose                                            1000 non-null   object 
 4   credit_amount                                      1000 non-null   float64
 5   savings_account/bonds                              1000 non-null   object 
 6   present_employment_since                           1000 non-null   object 
 7   installment_rate                                   1000 non-null   object 
 8   personal_status_sex                                1000 non-null   object 
 9   other_deb

In [175]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    data,
    random_state=42,
    test_size=0.1,
)

In [176]:
# Split each partition into features (everything but 'class') and the 'class' target.
x_train, y_train = train_df.drop(columns='class'), train_df['class']
x_test, y_test = test_df.drop(columns='class'), test_df['class']

In [178]:
%%time
cat_columns = [0,2,3,5,6,7,8,9,10,11,13,14,15,16,17,18,19]
cat_boost = CatBoostClassifier(verbose=False)
cat_boost = cat_boost.fit(x_train, y_train, cat_columns)

CPU times: total: 1min 50s
Wall time: 39.1 s


In [179]:
# Predict labels for the held-out german credit rows.
y_pred = cat_boost.predict(x_test)

In [180]:
# Share of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.78

In [183]:
# Recall with the string label '1' treated as the positive class.
recall_score(y_true=y_test, y_pred=y_pred, pos_label='1')

0.8732394366197183

In [184]:
# Precision with the string label '1' treated as the positive class.
precision_score(y_true=y_test, y_pred=y_pred, pos_label='1')

0.8266666666666667

In [185]:
# F1 score with the string label '1' treated as the positive class.
f1_score(y_true=y_test, y_pred=y_pred, pos_label='1')

0.8493150684931506

# Hepatitis

In [190]:
# The file is read as a single text column (no header row); each line is then
# split on commas into one column per field and the field names are attached.
df_hepatitis = pd.read_csv('./data/hepatitis/hepatitis.csv', header=None)
df_hepatitis = df_hepatitis.iloc[:, 0].str.split(',', expand=True)
df_hepatitis.columns = 'class,age,sex,steroid,antivirals,fatigue,malaise,anorexia,big liver,firm liver,palpable spleen,spiders,ascites,varices,bilirubin,phosphate,sgot,albumin,protime,histology'.split(',')
# '?' marks a missing value: drop incomplete rows, renumber the index,
# then give the measurement columns numeric dtypes.
df_hepatitis = (
    df_hepatitis
    .replace('?', np.nan)
    .dropna()
    .reset_index(drop=True)
    .astype({
        'age': 'int',
        'phosphate': 'int',
        'sgot': 'int',
        'albumin': 'float',
        'protime': 'float',
        'bilirubin': 'float',
    })
)
data = df_hepatitis.copy()
# Release the intermediate frame; only `data` is used from here on.
df_hepatitis = None

In [191]:
# Preview the first rows of the cleaned hepatitis frame.
data.head()

Unnamed: 0,class,age,sex,steroid,antivirals,fatigue,malaise,anorexia,big liver,firm liver,palpable spleen,spiders,ascites,varices,bilirubin,phosphate,sgot,albumin,protime,histology
0,2,34,1,2,2,2,2,2,2,2,2,2,2,2,0.9,95,28,4.0,75.0,1
1,2,39,1,1,1,2,2,2,1,1,2,2,2,2,1.3,78,30,4.4,85.0,1
2,2,32,1,2,1,1,2,2,2,1,2,1,2,2,1.0,59,249,3.7,54.0,1
3,2,41,1,2,1,1,2,2,2,1,2,2,2,2,0.9,81,60,3.9,52.0,1
4,2,30,1,2,2,1,2,2,2,1,2,2,2,2,2.2,57,144,4.9,78.0,1


In [193]:
# Show column dtypes and non-null counts after cleaning.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   class            80 non-null     object 
 1   age              80 non-null     int32  
 2   sex              80 non-null     object 
 3   steroid          80 non-null     object 
 4   antivirals       80 non-null     object 
 5   fatigue          80 non-null     object 
 6   malaise          80 non-null     object 
 7   anorexia         80 non-null     object 
 8   big liver        80 non-null     object 
 9   firm liver       80 non-null     object 
 10  palpable spleen  80 non-null     object 
 11  spiders          80 non-null     object 
 12  ascites          80 non-null     object 
 13  varices          80 non-null     object 
 14  bilirubin        80 non-null     float64
 15  phosphate        80 non-null     int32  
 16  sgot             80 non-null     int32  
 17  albumin          8

In [192]:
# Summary statistics for the numeric columns.
data.describe()

Unnamed: 0,age,bilirubin,phosphate,sgot,albumin,protime
count,80.0,80.0,80.0,80.0,80.0,80.0
mean,40.6625,1.22125,102.9125,82.025,3.84375,62.5125
std,11.28003,0.875213,53.684779,71.599974,0.576292,23.427774
min,20.0,0.3,26.0,14.0,2.1,0.0
25%,32.0,0.7,68.25,30.75,3.5,46.0
50%,38.5,1.0,85.0,56.5,4.0,62.0
75%,49.25,1.3,133.5,102.75,4.2,77.25
max,72.0,4.8,280.0,420.0,5.0,100.0


In [194]:
# Numeric measurement columns; despite the variable name they are all cast to float.
int_columns = ['age', 'bilirubin', 'phosphate', 'sgot', 'albumin', 'protime']
data = data.astype({column: 'float' for column in int_columns})

In [200]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    data,
    random_state=42,
    test_size=0.1,
)

In [201]:
# Split each partition into features (everything but 'class') and the 'class' target.
x_train, y_train = train_df.drop(columns='class'), train_df['class']
x_test, y_test = test_df.drop(columns='class'), test_df['class']

In [203]:
# Check feature positions and dtypes; the categorical indices used when fitting refer to these positions.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72 entries, 4 to 51
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              72 non-null     float64
 1   sex              72 non-null     object 
 2   steroid          72 non-null     object 
 3   antivirals       72 non-null     object 
 4   fatigue          72 non-null     object 
 5   malaise          72 non-null     object 
 6   anorexia         72 non-null     object 
 7   big liver        72 non-null     object 
 8   firm liver       72 non-null     object 
 9   palpable spleen  72 non-null     object 
 10  spiders          72 non-null     object 
 11  ascites          72 non-null     object 
 12  varices          72 non-null     object 
 13  bilirubin        72 non-null     float64
 14  phosphate        72 non-null     float64
 15  sgot             72 non-null     float64
 16  albumin          72 non-null     float64
 17  protime          7

In [204]:
%%time
cat_columns = [1,2,3,4,5,6,7,8,9,10,11,12,18]
cat_boost = CatBoostClassifier(verbose=False)
cat_boost = cat_boost.fit(x_train, y_train, cat_columns)

CPU times: total: 14.3 s
Wall time: 1.34 s


In [205]:
# Predict labels for the held-out hepatitis rows.
y_pred = cat_boost.predict(x_test)

In [206]:
# Share of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.875

In [212]:
# Recall with the string label '2' treated as the positive class.
recall_score(y_true=y_test, y_pred=y_pred, pos_label='2')

1.0

In [213]:
# Precision with the string label '2' treated as the positive class.
precision_score(y_true=y_test, y_pred=y_pred, pos_label='2')

0.875

In [211]:
# F1 score with the string label '2' treated as the positive class.
f1_score(y_true=y_test, y_pred=y_pred, pos_label='2')

0.9333333333333333

# mobile

In [214]:
# Read the mobile price training CSV with pandas' default parsing options.
data = pd.read_csv(
    filepath_or_buffer='./data/mobile/train.csv',
)

In [215]:
# Preview the first rows of the mobile frame.
data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0,1


In [216]:
# Summary statistics for the numeric columns.
data.describe()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1238.5185,0.495,1.52225,0.5095,4.3095,0.5215,32.0465,0.50175,140.249,4.5205,9.9165,645.108,1251.5155,2124.213,12.3065,5.767,11.011,0.7615,0.503,0.507,1.5
std,439.418206,0.5001,0.816004,0.500035,4.341444,0.499662,18.145715,0.288416,35.399655,2.287837,6.064315,443.780811,432.199447,1084.732044,4.213245,4.356398,5.463955,0.426273,0.500116,0.500076,1.118314
min,501.0,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,1.0,0.0,0.0,500.0,256.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0
25%,851.75,0.0,0.7,0.0,1.0,0.0,16.0,0.2,109.0,3.0,5.0,282.75,874.75,1207.5,9.0,2.0,6.0,1.0,0.0,0.0,0.75
50%,1226.0,0.0,1.5,1.0,3.0,1.0,32.0,0.5,141.0,4.0,10.0,564.0,1247.0,2146.5,12.0,5.0,11.0,1.0,1.0,1.0,1.5
75%,1615.25,1.0,2.2,1.0,7.0,1.0,48.0,0.8,170.0,7.0,15.0,947.25,1633.0,3064.5,16.0,9.0,16.0,1.0,1.0,1.0,2.25
max,1998.0,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,8.0,20.0,1960.0,1998.0,3998.0,19.0,18.0,20.0,1.0,1.0,1.0,3.0


In [217]:
# Show column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [218]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    data,
    random_state=42,
    test_size=0.1,
)

In [220]:
# Split each partition into features (everything but 'price_range') and the 'price_range' target.
x_train, y_train = train_df.drop(columns='price_range'), train_df['price_range']
x_test, y_test = test_df.drop(columns='price_range'), test_df['price_range']

In [221]:
# Check feature positions and dtypes; the categorical indices used when fitting refer to these positions.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1800 entries, 1872 to 1126
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  1800 non-null   int64  
 1   blue           1800 non-null   int64  
 2   clock_speed    1800 non-null   float64
 3   dual_sim       1800 non-null   int64  
 4   fc             1800 non-null   int64  
 5   four_g         1800 non-null   int64  
 6   int_memory     1800 non-null   int64  
 7   m_dep          1800 non-null   float64
 8   mobile_wt      1800 non-null   int64  
 9   n_cores        1800 non-null   int64  
 10  pc             1800 non-null   int64  
 11  px_height      1800 non-null   int64  
 12  px_width       1800 non-null   int64  
 13  ram            1800 non-null   int64  
 14  sc_h           1800 non-null   int64  
 15  sc_w           1800 non-null   int64  
 16  talk_time      1800 non-null   int64  
 17  three_g        1800 non-null   int64  
 18  touch

In [222]:
%%time
cat_columns = [1,3,5,9,17,18,19]
cat_boost = CatBoostClassifier(verbose=False)
cat_boost = cat_boost.fit(x_train, y_train, cat_columns)

CPU times: total: 1min 29s
Wall time: 40.2 s


In [223]:
# Predict price_range labels for the held-out mobile rows.
y_pred = cat_boost.predict(x_test)

In [224]:
# Share of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.93

# stroke

In [231]:
# Load both stroke CSVs and stack them into one frame.
df_train = pd.read_csv('./data/stroke/train.csv')
df_test = pd.read_csv('./data/stroke/test.csv')
# Repair the malformed '*82' entry wherever it appears in the stacked frame.
df_merged = pd.concat([df_train, df_test]).replace('*82', '82')
# Drop the 'id' column, then drop every row that has a missing value.
data = df_merged.drop(columns='id').dropna()

In [232]:
# Preview the first rows of the cleaned stroke frame.
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Female,31,0,0,No,Govt_job,Rural,70.66,27.2,never smoked,0.0
1,Female,13,0,0,No,children,Rural,85.81,18.6,Unknown,0.0
2,Male,18,0,0,No,Private,Urban,60.56,33.0,never smoked,0.0
3,Female,65,0,0,Yes,Private,Urban,205.77,46.0,formerly smoked,1.0
4,Male,4,0,0,No,children,Rural,90.42,16.2,Unknown,0.0


In [235]:
# Show column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1085 entries, 0 to 1136
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             1085 non-null   object 
 1   age                1085 non-null   object 
 2   hypertension       1085 non-null   int64  
 3   heart_disease      1085 non-null   int64  
 4   ever_married       1085 non-null   object 
 5   work_type          1085 non-null   object 
 6   Residence_type     1085 non-null   object 
 7   avg_glucose_level  1085 non-null   float64
 8   bmi                1085 non-null   float64
 9   smoking_status     1085 non-null   object 
 10  stroke             1085 non-null   float64
dtypes: float64(3), int64(2), object(6)
memory usage: 101.7+ KB


In [236]:
# Summary statistics for the numeric columns.
data.describe()

Unnamed: 0,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,1085.0,1085.0,1085.0,1085.0,1085.0
mean,0.111521,0.064516,106.913945,29.198065,0.106912
std,0.314921,0.245783,47.168513,7.669615,0.309144
min,0.0,0.0,55.27,11.3,0.0
25%,0.0,0.0,77.26,24.1,0.0
50%,0.0,0.0,91.61,28.5,0.0
75%,0.0,0.0,113.47,33.2,0.0
max,1.0,1.0,266.59,64.4,1.0


In [238]:
# Continuous measurement columns; despite the variable name they are cast to float.
int_columns = ['age', 'avg_glucose_level', 'bmi']
data = data.astype({column: 'float' for column in int_columns})

In [239]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    data,
    random_state=42,
    test_size=0.1,
)

In [241]:
# Split each partition into features (everything but 'stroke') and the 'stroke' target.
x_train, y_train = train_df.drop(columns='stroke'), train_df['stroke']
x_test, y_test = test_df.drop(columns='stroke'), test_df['stroke']

In [242]:
# Check feature positions and dtypes; the categorical indices used when fitting refer to these positions.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 976 entries, 319 to 903
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             976 non-null    object 
 1   age                976 non-null    float64
 2   hypertension       976 non-null    int64  
 3   heart_disease      976 non-null    int64  
 4   ever_married       976 non-null    object 
 5   work_type          976 non-null    object 
 6   Residence_type     976 non-null    object 
 7   avg_glucose_level  976 non-null    float64
 8   bmi                976 non-null    float64
 9   smoking_status     976 non-null    object 
dtypes: float64(3), int64(2), object(5)
memory usage: 83.9+ KB


In [243]:
%%time
cat_columns = [0,2,3,4,5,6,9]
cat_boost = CatBoostClassifier(verbose=False)
cat_boost = cat_boost.fit(x_train, y_train, cat_columns)

CPU times: total: 1min 15s
Wall time: 32.4 s


In [244]:
# Predict stroke labels for the held-out rows.
y_pred = cat_boost.predict(x_test)

In [255]:
# Display the raw predicted labels to see how the predictions are distributed.
y_pred

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.])

In [245]:
# Share of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8623853211009175

In [249]:
# Recall with label 0 treated as the positive class.
# NOTE(review): this scores label 0 only; label 1 is not evaluated here.
recall_score(y_true=y_test, y_pred=y_pred, pos_label=0)

0.9791666666666666

In [250]:
# Precision with label 0 treated as the positive class.
# NOTE(review): this scores label 0 only; label 1 is not evaluated here.
precision_score(y_true=y_test, y_pred=y_pred, pos_label=0)

0.8785046728971962

In [251]:
# F1 score with label 0 treated as the positive class.
# NOTE(review): this scores label 0 only; label 1 is not evaluated here.
f1_score(y_true=y_test, y_pred=y_pred, pos_label=0)

0.9261083743842364

# tic tac toe

In [258]:
# Read the tic-tac-toe CSV, using its first column as the row index.
data = pd.read_csv(
    filepath_or_buffer='./data/tic_tac_toe/clean_tic-tac-toe.csv',
    index_col=0,
)

In [259]:
# Show column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 958 entries, 0 to 957
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   top-left       958 non-null    object
 1   top-middle     958 non-null    object
 2   top-right      958 non-null    object
 3   middle-left    958 non-null    object
 4   middle-middle  958 non-null    object
 5   middle-right   958 non-null    object
 6   bottom-left    958 non-null    object
 7   bottom-middle  958 non-null    object
 8   bottom-right   958 non-null    object
 9   class          958 non-null    object
dtypes: object(10)
memory usage: 82.3+ KB


In [260]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    data,
    random_state=42,
    test_size=0.1,
)

In [261]:
# Split each partition into features (everything but 'class') and the 'class' target.
x_train, y_train = train_df.drop(columns='class'), train_df['class']
x_test, y_test = test_df.drop(columns='class'), test_df['class']

In [263]:
%%time
cat_columns = [0,1,2,3,4,5,6,7,8]
cat_boost = CatBoostClassifier(verbose=False)
cat_boost = cat_boost.fit(x_train, y_train, cat_columns)

CPU times: total: 1min 33s
Wall time: 38.7 s


In [264]:
# Predict labels for the held-out tic-tac-toe rows.
y_pred = cat_boost.predict(x_test)

In [265]:
# Share of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

1.0

In [267]:
# Recall with 'positive' treated as the positive class.
recall_score(y_true=y_test, y_pred=y_pred, pos_label='positive')

1.0

In [268]:
# Precision with 'positive' treated as the positive class.
precision_score(y_true=y_test, y_pred=y_pred, pos_label='positive')

1.0

In [269]:
# F1 score with 'positive' treated as the positive class.
f1_score(y_true=y_test, y_pred=y_pred, pos_label='positive')

1.0

# zoo

In [274]:
# Read the zoo CSV, using its first column as the row index.
data = pd.read_csv(
    filepath_or_buffer='./data/zoo/clean_zoo.csv',
    index_col=0,
)

In [275]:
# Show column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101 entries, 0 to 100
Data columns (total 17 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   hair      101 non-null    int64
 1   feathers  101 non-null    int64
 2   eggs      101 non-null    int64
 3   milk      101 non-null    int64
 4   airborne  101 non-null    int64
 5   aquatic   101 non-null    int64
 6   predator  101 non-null    int64
 7   toothed   101 non-null    int64
 8   backbone  101 non-null    int64
 9   breathes  101 non-null    int64
 10  venomous  101 non-null    int64
 11  fins      101 non-null    int64
 12  legs      101 non-null    int64
 13  tail      101 non-null    int64
 14  domestic  101 non-null    int64
 15  catsize   101 non-null    int64
 16  type      101 non-null    int64
dtypes: int64(17)
memory usage: 14.2 KB


In [276]:
# Preview the first rows of the zoo frame.
data.head()

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
0,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


In [283]:
# Summary statistics for every (integer) column.
data.describe()

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
count,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0
mean,0.425743,0.19802,0.584158,0.405941,0.237624,0.356436,0.554455,0.60396,0.821782,0.792079,0.079208,0.168317,2.841584,0.742574,0.128713,0.435644,2.831683
std,0.496921,0.400495,0.495325,0.493522,0.42775,0.481335,0.499505,0.491512,0.384605,0.407844,0.27141,0.376013,2.033385,0.439397,0.336552,0.498314,2.102709
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,4.0,1.0,0.0,0.0,2.0
75%,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,4.0,1.0,0.0,1.0,4.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,7.0


In [277]:
# Show dtypes and non-null counts again (same call as earlier in this section).
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101 entries, 0 to 100
Data columns (total 17 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   hair      101 non-null    int64
 1   feathers  101 non-null    int64
 2   eggs      101 non-null    int64
 3   milk      101 non-null    int64
 4   airborne  101 non-null    int64
 5   aquatic   101 non-null    int64
 6   predator  101 non-null    int64
 7   toothed   101 non-null    int64
 8   backbone  101 non-null    int64
 9   breathes  101 non-null    int64
 10  venomous  101 non-null    int64
 11  fins      101 non-null    int64
 12  legs      101 non-null    int64
 13  tail      101 non-null    int64
 14  domestic  101 non-null    int64
 15  catsize   101 non-null    int64
 16  type      101 non-null    int64
dtypes: int64(17)
memory usage: 14.2 KB


In [288]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    data,
    random_state=42,
    test_size=0.1,
)

In [289]:
# Split each partition into features (everything but 'type') and the 'type' target.
x_train, y_train = train_df.drop(columns='type'), train_df['type']
x_test, y_test = test_df.drop(columns='type'), test_df['type']

In [290]:
# Check feature positions and dtypes; the categorical indices used when fitting refer to these positions.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90 entries, 0 to 93
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   hair      90 non-null     int64
 1   feathers  90 non-null     int64
 2   eggs      90 non-null     int64
 3   milk      90 non-null     int64
 4   airborne  90 non-null     int64
 5   aquatic   90 non-null     int64
 6   predator  90 non-null     int64
 7   toothed   90 non-null     int64
 8   backbone  90 non-null     int64
 9   breathes  90 non-null     int64
 10  venomous  90 non-null     int64
 11  fins      90 non-null     int64
 12  legs      90 non-null     int64
 13  tail      90 non-null     int64
 14  domestic  90 non-null     int64
 15  catsize   90 non-null     int64
dtypes: int64(16)
memory usage: 12.0 KB


In [291]:
%%time
cat_columns = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
cat_boost = CatBoostClassifier(verbose=False)
cat_boost = cat_boost.fit(x_train, y_train, cat_columns)

CPU times: total: 1min 33s
Wall time: 42.6 s


In [292]:
# Predict animal-type labels for the held-out zoo rows.
y_pred = cat_boost.predict(x_test)

In [293]:
# Share of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

1.0