In [3]:
import pandas as pd
import numpy as np
import time
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

# breast_w

In [5]:
data = pd.read_csv('./data/breast_w/clean_breast_w.csv', index_col=0)

data = data.replace('?', None)
data = data.dropna()

# Train Test Split
train_df, test_df = train_test_split(data, test_size=0.1, random_state=42)
x_train = train_df.drop(columns=['class'])
y_train = train_df['class']
x_test = test_df.drop(columns=['class'])
y_test = test_df['class']

cat_boost = CatBoostClassifier(verbose=False)
# Calculating Fit Time
start = time.time()
cat_boost = cat_boost.fit(x_train, y_train)
end = time.time()

# Test Accuracy
y_pred = cat_boost.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)


print('-------------Train-------------')
print(f'Fit Time: {end-start}')

print('-------------Test-------------')
print(f'Test Accuracy:{test_accuracy}')

-------------Train-------------
Fit Time: 0.9127762317657471
-------------Test-------------
Test Accuracy:0.9855072463768116


# gender

In [9]:
data = pd.read_csv('./data/gender/gender.csv', index_col=0)

#Train Test Split
train_df, test_df = train_test_split(data, test_size=0.1, random_state=42)

cat_indexes = [0,1,2,3]
x_train = train_df.drop(columns=['Gender'])
y_train = train_df['Gender']
x_test = test_df.drop(columns=['Gender'])
y_test = test_df['Gender']

cat_boost = CatBoostClassifier(verbose=False)
# Calculating Fit Time
start = time.time()
cat_boost = cat_boost.fit(x_train, y_train,cat_indexes)
end = time.time()

# Test Accucracy
y_pred = cat_boost.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)

print('-------------Train-------------')
print(f'Fit Time: {end-start}')

print('-------------Test-------------')
print(f'Test Accuracy:{test_accuracy}')

-------------Train-------------
Fit Time: 1.9523448944091797
-------------Test-------------
Test Accuracy:0.42857142857142855


# german

In [10]:
data=pd.read_csv('./data/german/german.data',header=None)

# Data Cleaning
data.columns=['status_of_existing_checking_account duration_(months) credit_history purpose credit_amount savings_account/bonds present_employment_since installment_rate personal_status_sex other_debtors present_residence property age other_installment_plans housing number_of_existing_credits job no._of_people_being_liable_to_provide_maintenance telephone foreign_worker class']
data[data.columns[0].split(' ')] = data.iloc[:,0].str.split(' ', expand=True)
data.drop(data.columns[0], axis=1, inplace=True)
int_columns = ['duration_(months)','credit_amount', 'age']
data[int_columns] = data[int_columns].astype('float')

# Train Test Split
train_df, test_df = train_test_split(data, test_size=0.1, random_state=42)
x_train = train_df.drop(columns=['class'])
y_train = train_df['class']
x_test = test_df.drop(columns=['class'])
y_test = test_df['class']

cat_columns = [0,2,3,5,6,7,8,9,10,11,13,14,15,16,17,18,19]
cat_boost = CatBoostClassifier(verbose=False)
start = time.time()
cat_boost = cat_boost.fit(x_train, y_train, cat_columns)
end = time.time()

# Test Accucracy
y_pred = cat_boost.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)

print('-------------Train-------------')
print(f'Fit Time: {end-start}')

print('-------------Test-------------')
print(f'Test Accuracy:{test_accuracy}')


-------------Train-------------
Fit Time: 4.8281309604644775
-------------Test-------------
Test Accuracy:0.78


# Hepatitis

In [11]:
df_hepatitis=pd.read_csv('./data/hepatitis/hepatitis.csv',header=None)

# Data Cleaning
df_hepatitis.columns=['class,age,sex,steroid,antivirals,fatigue,malaise,anorexia,big liver,firm liver,palpable spleen,spiders,ascites,varices,bilirubin,phosphate,sgot,albumin,protime,histology']
df_hepatitis[df_hepatitis.columns[0].split(',')] = df_hepatitis.iloc[:,0].str.split(',', expand=True)
df_hepatitis.drop(df_hepatitis.columns[0],axis=1,inplace=True)
df_hepatitis=df_hepatitis.replace('?',np.nan).dropna().reset_index(drop=True)
data = df_hepatitis.copy()
df_hepatitis=None
int_columns = ['age', 'bilirubin', 'phosphate', 'sgot', 'albumin', 'protime']
data[int_columns] = data[int_columns].astype('float')

# Train Test Split
train_df, test_df = train_test_split(data, test_size=0.1, random_state=42)
x_train = train_df.drop(columns=['class'])
y_train = train_df['class']
x_test = test_df.drop(columns=['class'])
y_test = test_df['class']

#Calculating Fit Time
cat_columns = [1,2,3,4,5,6,7,8,9,10,11,12,18]
cat_boost = CatBoostClassifier(verbose=False)
cat_boost = cat_boost.fit(x_train, y_train, cat_columns)

# Test Accucracy
y_pred = cat_boost.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)

print('-------------Train-------------')
print(f'Fit Time: {end-start}')

print('-------------Test-------------')
print(f'Test Accuracy:{test_accuracy}')

-------------Train-------------
Fit Time: 4.8281309604644775
-------------Test-------------
Test Accuracy:0.875


# mobile

In [15]:
data = pd.read_csv('./data/mobile/train.csv')

#Train Test Split
train_df, test_df = train_test_split(data, test_size=0.1, random_state=42)
x_train = train_df.drop(columns=['price_range'])
y_train = train_df['price_range']
x_test = test_df.drop(columns=['price_range'])
y_test = test_df['price_range']

#Calculating Fit Time
cat_columns = [1,3,5,9,17,18,19]
cat_boost = CatBoostClassifier(verbose=False)
start = time.time()
cat_boost = cat_boost.fit(x_train, y_train, cat_columns)
end = time.time()

# Test Accucracy
y_pred = cat_boost.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)

print('-------------Train-------------')
print(f'Fit Time: {end-start}')

print('-------------Test-------------')
print(f'Test Accuracy:{test_accuracy}')

-------------Train-------------
Fit Time: 4.802430868148804
-------------Test-------------
Test Accuracy:0.93


# stroke

In [16]:
df_train=pd.read_csv('./data/stroke/train.csv')
df_test=pd.read_csv('./data/stroke/test.csv')
df_merged=pd.concat([df_train,df_test])

#Data Cleaning
df_merged=df_merged.replace('*82','82')
data=df_merged.drop(columns=['id']).dropna()

int_columns = ['age', 'avg_glucose_level', 'bmi']
data[int_columns] = data[int_columns].astype('float')

#Train Test Split
train_df, test_df = train_test_split(data, test_size=0.1, random_state=42)
x_train = train_df.drop(columns=['stroke'])
y_train = train_df['stroke']
x_test = test_df.drop(columns=['stroke'])
y_test = test_df['stroke']

#Calculating Fit Time
cat_columns = [0,2,3,4,5,6,9]
cat_boost = CatBoostClassifier(verbose=False)
start = time.time()
cat_boost = cat_boost.fit(x_train, y_train, cat_columns)
end = time.time()

# Test Accucracy
y_pred = cat_boost.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)

print('-------------Train-------------')
print(f'Fit Time: {end-start}')

print('-------------Test-------------')
print(f'Test Accuracy:{test_accuracy}')

-------------Train-------------
Fit Time: 2.8142902851104736
-------------Test-------------
Test Accuracy:0.8623853211009175


# tic tac toe

In [17]:
data = pd.read_csv('./data/tic_tac_toe/clean_tic-tac-toe.csv', index_col=0)

# Train Test Split
train_df, test_df = train_test_split(data, test_size=0.1, random_state=42)
x_train = train_df.drop(columns=['class'])
y_train = train_df['class']
x_test = test_df.drop(columns=['class'])
y_test = test_df['class']

#Calculating Fit Time
cat_columns = [0,1,2,3,4,5,6,7,8]
cat_boost = CatBoostClassifier(verbose=False)
start =time.time()
cat_boost = cat_boost.fit(x_train, y_train, cat_columns)
end = time.time()

# Test Accucracy
y_pred = cat_boost.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)

print('-------------Train-------------')
print(f'Fit Time: {end-start}')

print('-------------Test-------------')
print(f'Test Accuracy:{test_accuracy}')

-------------Train-------------
Fit Time: 4.050205945968628
-------------Test-------------
Test Accuracy:1.0


# zoo

In [18]:
data = pd.read_csv('./data/zoo/clean_zoo.csv', index_col=0)

#Train Test Split
train_df, test_df = train_test_split(data, test_size=0.1, random_state=42)
x_train = train_df.drop(columns=['type'])
y_train = train_df['type']
x_test = test_df.drop(columns=['type'])
y_test = test_df['type']

#Calculating Fit Time
cat_columns = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
cat_boost = CatBoostClassifier(verbose=False)
start = time.time()
cat_boost = cat_boost.fit(x_train, y_train, cat_columns)
end = time.time()

# Test Accucracy
y_pred = cat_boost.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)

print('-------------Train-------------')
print(f'Fit Time: {end-start}')

print('-------------Test-------------')
print(f'Test Accuracy:{test_accuracy}')

-------------Train-------------
Fit Time: 4.8560779094696045
-------------Test-------------
Test Accuracy:1.0
