In [61]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import time
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
import warnings
warnings.filterwarnings("ignore")

## breast_w

In [295]:
df_breast_w=pd.read_csv('./data/breast_w/clean_breast_w.csv',index_col=0)

train_df,test_df=train_test_split(df_breast_w,test_size=0.25,random_state=42)
train_X = train_df[train_df.columns[0:-2]]
train_Y = train_df[train_df.columns[-1]]
test_X = test_df[test_df.columns[0:-2]]
test_Y=test_df[test_df.columns[-1]]


#Calculating Fit Time and Train Accuracy
clf=RandomForestClassifier(max_depth=4,n_jobs=-1,random_state=41)
start=time.time()
clf.fit(train_X,train_Y)
end=time.time()
train_accuracy=accuracy_score(clf.predict(train_X),train_Y)
train_recall=recall_score(clf.predict(train_X),train_Y,pos_label=4)
train_precision=precision_score(clf.predict(train_X),train_Y,pos_label=4)
train_f1=f1_score(clf.predict(train_X),train_Y,pos_label=4)

#Mean Accuracy
test_accuracy=accuracy_score(clf.predict(test_X),test_Y)
test_recall=recall_score(clf.predict(test_X),test_Y,pos_label=4)
test_precision=precision_score(clf.predict(test_X),test_Y,pos_label=4)
test_f1=f1_score(clf.predict(test_X),test_Y,pos_label=4)
print('-------------Train-------------')
print(f'Accuracy: {train_accuracy}, Recall: {train_recall}, Precision: {train_precision}, F1: {train_f1}, Fit Time: {end-start}')
print('-------------Test-------------')
print(f'Test Accuracy:{test_accuracy}, Test Recall:{test_recall}, Test Precision:{test_precision}, Test F1:{test_f1} ')


-------------Train-------------
Accuracy: 0.9765625, Recall: 0.9491525423728814, Precision: 0.9824561403508771, F1: 0.9655172413793103, Fit Time: 0.22424983978271484
-------------Test-------------
Test Accuracy:0.9590643274853801, Test Recall:0.9841269841269841, Test Precision:0.9117647058823529, Test F1:0.9465648854961831 


## Gender

In [287]:
df_gender = pd.read_csv('./data/gender/gender.csv',index_col=0)

#One Hot Encoding
encoded_df = pd.get_dummies(df_gender[df_gender.columns[0:-2]]).merge(df_gender[df_gender.columns[-1]],left_index=True,right_index=True)

#Train test split
train_df,test_df=train_test_split(encoded_df,test_size=0.25,random_state=42)
train_X = train_df[train_df.columns[0:-2]]
train_Y = train_df[train_df.columns[-1]]
test_X = test_df[test_df.columns[0:-2]]
test_Y = test_df[test_df.columns[-1]]

#Calculating Fit Time and Train Accuracy
clf=RandomForestClassifier(max_depth=4,n_jobs=-1,random_state=41)
start=time.time()
clf.fit(train_X,train_Y)
end=time.time()
train_accuracy=accuracy_score(clf.predict(train_X),train_Y)
train_recall=recall_score(clf.predict(train_X),train_Y,pos_label='M')
train_precision=precision_score(clf.predict(train_X),train_Y,pos_label="M")
train_f1=f1_score(clf.predict(train_X),train_Y,pos_label="M")

#Mean Accuracy
test_accuracy=accuracy_score(clf.predict(test_X),test_Y)
test_recall=recall_score(clf.predict(test_X),test_Y,pos_label="M")
test_precision=precision_score(clf.predict(test_X),test_Y,pos_label="M")
test_f1=f1_score(clf.predict(test_X),test_Y,pos_label="M")
print('-------------Train-------------')
print(f'Accuracy: {train_accuracy}, Recall: {train_recall}, Precision: {train_precision}, F1: {train_f1}, Fit Time: {end-start}')
print('-------------Test-------------')
print(f'Accuracy:{test_accuracy}, Recall:{test_recall}, Precision:{test_precision}, F1:{test_f1} ')



-------------Train-------------
Accuracy: 0.8979591836734694, Recall: 0.8571428571428571, Precision: 0.96, F1: 0.9056603773584904, Fit Time: 0.19206023216247559
-------------Test-------------
Accuracy:0.5882352941176471, Recall:0.5555555555555556, Precision:0.625, F1:0.5882352941176471 


## German

In [284]:
df_german = pd.read_csv('./data/german/clean_german.csv',index_col=0)

#One Hot Encoding & Ordinal Encoding
encoded_df = pd.DataFrame()
encoded_df['status_of_existing_checking_account']=df_german['status_of_existing_checking_account'].replace({"A14":0,"A11":1,"A12":2,"A13":3})
encoded_df['duration_(months)']=df_german['duration_(months)'].replace({"0-20":0,"20-40":1,"40-60":2,"60-80":3})
encoded_df['credit_amount']=df_german['credit_amount'].replace({'0-2500':0,'2500-5000':1,'5000-7500':2,'7500-10000':3,'10000-12500':4,'12500-15000':5,'15000-17500':6,'17500-20000':7})
encoded_df['savings_account/bonds']=df_german['savings_account/bonds'].replace({"A65":0,"A61":1,"A62":2,"A63":3,"A64":4})
encoded_df['present_employment_since']=df_german['present_employment_since'].replace({"A71":0,"A72":1,'A73':2,'A74':3,"A75":4})
encoded_df['age']=df_german['age'].replace({"10-20":0,'20-30':1,'30-40':2,'40-50':3,'50-60':4,'60-70':5,'70-80':6})
encoded_df['job']=df_german['job'].replace({'A171':0,"A172":1,'A173':2,'A174':3})
encoded_df= encoded_df.merge(pd.get_dummies(df_german['credit_history']).merge(pd.get_dummies(df_german['purpose']),left_index=True,right_index=True).merge(pd.get_dummies(df_german['personal_status_sex']),left_index=True,right_index=True).merge(pd.get_dummies(df_german['other_debtors']),left_index=True,right_index=True).merge(pd.get_dummies(df_german['property']),left_index=True,right_index=True).merge(pd.get_dummies(df_german['other_installment_plans']),left_index=True,right_index=True).merge(pd.get_dummies(df_german['housing']),left_index=True,right_index=True).merge(pd.get_dummies(df_german['telephone']),left_index=True,right_index=True).merge(pd.get_dummies(df_german['foreign_worker']),left_index=True,right_index=True),left_index=True,right_index=True)
encoded_df['class']=df_german['class']


#Train test split
train_df,test_df=train_test_split(encoded_df,test_size=0.25,random_state=42)
train_X = train_df[train_df.columns[0:-2]]
train_Y = train_df[train_df.columns[-1]]
test_X = test_df[test_df.columns[0:-2]]
test_Y = test_df[test_df.columns[-1]]

#Calculating Fit Time and Train Accuracy
clf=RandomForestClassifier(max_depth=4,n_jobs=-1,random_state=41)
start=time.time()
clf.fit(train_X,train_Y)
end=time.time()
train_accuracy=accuracy_score(clf.predict(train_X),train_Y)
train_recall=recall_score(clf.predict(train_X),train_Y,pos_label=1)
train_precision=precision_score(clf.predict(train_X),train_Y,pos_label=1)
train_f1=f1_score(clf.predict(train_X),train_Y,pos_label=1)

#Mean Accuracy
test_accuracy=accuracy_score(clf.predict(test_X),test_Y)
test_recall=recall_score(clf.predict(test_X),test_Y,pos_label=1)
test_precision=precision_score(clf.predict(test_X),test_Y,pos_label=1)
test_f1=f1_score(clf.predict(test_X),test_Y,pos_label=1)
print('-------------Train-------------')
print(f'Accuracy: {train_accuracy}, Recall: {train_recall}, Precision: {train_precision}, F1: {train_f1}, Fit Time: {end-start}')
print('-------------Test-------------')
print(f'Accuracy:{test_accuracy}, Recall:{test_recall}, Precision:{test_precision}, F1:{test_f1} ')



-------------Train-------------
Accuracy: 0.7466666666666667, Recall: 0.7338028169014085, Precision: 0.9980842911877394, F1: 0.8457792207792209, Fit Time: 0.19971585273742676
-------------Test-------------
Accuracy:0.724, Recall:0.7242798353909465, Precision:0.9887640449438202, F1:0.836104513064133 


## Hepatitis

In [283]:
df_hepatitis=pd.read_csv('./data/hepatitis/clean_hepatitis.csv',index_col=0)

#Ordinal Encoding
df_hepatitis['age'] = df_hepatitis['age'].replace({"20-30":0,"30-40":1,"40-50":2,'50-60':3,"60-70":4,"70-80":5})
df_hepatitis['bilirubin'] = df_hepatitis['bilirubin'].replace({'0-0.5':0,'0.5-1':1,'1-1.5':2,'1.5-2':3,'2 & Above':4})
df_hepatitis['phosphate'] = df_hepatitis['phosphate'].replace({'0-50':0,'50-100':1,'100-150':2,'150-200':3,'200-250':4,'250 & Above':5})
df_hepatitis['sgot'] = df_hepatitis['sgot'].replace({'0-100':0,'100-200':1,'200-300':2,'300 & Above':3})
df_hepatitis['albumin'] = df_hepatitis['albumin'].replace({'2-2.5':0,'2.5-3':1,'3-3.5':2,'3.5-4':3,'4-4.5':4,'4.5-5':5})
df_hepatitis['protime'] = df_hepatitis['protime'].replace({'0-20':0,'20-40':1,'40-60':2,'60-80':3,'80-100':4})

#Train Test Split
train_df,test_df = train_test_split(df_hepatitis,test_size=0.25,random_state=42)
train_X = train_df[train_df.columns[1:]]
train_Y = train_df[train_df.columns[0]]
test_X = test_df[test_df.columns[1:]]
test_Y = test_df[test_df.columns[0]]

#Calculating Fit Time and Train Accuracy
clf=RandomForestClassifier(max_depth=4,n_jobs=-1,random_state=41)
start=time.time()
clf.fit(train_X,train_Y)
end=time.time()
train_accuracy=accuracy_score(clf.predict(train_X),train_Y)
train_recall=recall_score(clf.predict(train_X),train_Y,pos_label=2)
train_precision=precision_score(clf.predict(train_X),train_Y,pos_label=2)
train_f1=f1_score(clf.predict(train_X),train_Y,pos_label=2)

#Mean Accuracy
test_accuracy=accuracy_score(clf.predict(test_X),test_Y)
test_recall=recall_score(clf.predict(test_X),test_Y,pos_label=2)
test_precision=precision_score(clf.predict(test_X),test_Y,pos_label=2)
test_f1=f1_score(clf.predict(test_X),test_Y,pos_label=2)
print('-------------Train-------------')
print(f'Accuracy: {train_accuracy}, Recall: {train_recall}, Precision: {train_precision}, F1: {train_f1}, Fit Time: {end-start}')
print('-------------Test-------------')
print(f'Accuracy:{test_accuracy}, Recall:{test_recall}, Precision:{test_precision}, F1:{test_f1} ')

-------------Train-------------
Accuracy: 0.9666666666666667, Recall: 0.9622641509433962, Precision: 1.0, F1: 0.9807692307692307, Fit Time: 0.19133901596069336
-------------Test-------------
Accuracy:0.9, Recall:0.8888888888888888, Precision:1.0, F1:0.9411764705882353 


In [206]:
df_hepatitis['class'].value_counts()

2    67
1    13
Name: class, dtype: int64

## Mobile

In [296]:
df_mobile = pd.read_csv('./data/mobile/clean_mobile.csv',index_col=0)

#Ordinal Encoding
df_mobile['battery_power']=df_mobile['battery_power'].replace({'400-800':0,'800-1200':1,'1200-1600':2,'1600-2000':3})
df_mobile['clock_speed']=df_mobile['clock_speed'].replace({'0.5-1.0':0,'1.0-1.5':1,'1.5-2.0':2,'2.0-2.5':3,'2.5-3.0':4})
df_mobile['fc']=df_mobile['fc'].replace({'0.0-5.0':0,"5.0-10.0":1,'10.0-15.0':2,'15.0-20.0':3})
df_mobile['int_memory']=df_mobile['int_memory'].replace({'0-20':0,'20-40':1,'40-60':2,'60-80':3})
df_mobile['m_dep']=df_mobile['m_dep'].replace({'0-0.2':0,'0.2-0.4':1,'0.4-0.6':2,'0.6-0.8':3,'0.8-1.0':4})
df_mobile['mobile_wt']=df_mobile['mobile_wt'].replace({'80-120':0,'120-160':1,'160-200':2})
df_mobile['pc']=df_mobile['pc'].replace({'0.0-5.0':0,'5.0-10.0':1,'10.0-15.0':2,'15.0-20.0':3})
df_mobile['px_height']=df_mobile['px_height'].replace({'0-500':0,'500-1000':1,'1000-1500':2,'1500-2000':3})
df_mobile['px_width']=df_mobile['px_width'].replace({'400-800':0,'800-1200':1,'1200-1600':2,'1600-2000':3})
df_mobile['ram']=df_mobile['ram'].replace({'0-1000':0,'1000-2000':1,'2000-3000':2,'3000-4000':3})
df_mobile['sc_h']=df_mobile['sc_h'].replace({'5-8':0,'8-11':1,'11-14':2,'14-17':3,'17-20':4})
df_mobile['sc_w']=df_mobile['sc_w'].replace({'0.0-5.0':0,'5.0-10.0':1,'10.0-15.0':2,'15.0-20.0':3})
df_mobile['talk_time']=df_mobile['talk_time'].replace({'0.0-5.0':0,'5.0-10.0':1,'10.0-15.0':2,'15.0-20.0':3})

#Train Test Split
train_df,test_df = train_test_split(df_mobile,test_size=0.25,random_state=42)
train_X = train_df[train_df.columns[0:-2]]
train_Y = train_df[train_df.columns[-1]]
test_X = test_df[test_df.columns[0:-2]]
test_Y = test_df[test_df.columns[-1]]

#Calculating Fit Time and Train Accuracy
clf=RandomForestClassifier(max_depth=4,n_jobs=-1,random_state=41)
start=time.time()
clf.fit(train_X,train_Y)
end=time.time()
train_accuracy=accuracy_score(clf.predict(train_X),train_Y)
train_recall=recall_score(clf.predict(train_X),train_Y,average='micro')
train_precision=precision_score(clf.predict(train_X),train_Y,average='micro')
train_f1=f1_score(clf.predict(train_X),train_Y,average='micro')

#Mean Accuracy
test_accuracy=accuracy_score(clf.predict(test_X),test_Y)
test_recall=recall_score(clf.predict(test_X),test_Y,average='micro')
test_precision=precision_score(clf.predict(test_X),test_Y,average='micro')
test_f1=f1_score(clf.predict(test_X),test_Y,average='micro')
print('-------------Train-------------')
print(f'Accuracy: {train_accuracy}, Recall: {train_recall}, Precision: {train_precision}, F1: {train_f1}, Fit Time: {end-start}')
print('-------------Test-------------')
print(f'Accuracy:{test_accuracy}, Recall:{test_recall}, Precision:{test_precision}, F1:{test_f1} ')




-------------Train-------------
Accuracy: 0.8073333333333333, Recall: 0.8073333333333333, Precision: 0.8073333333333333, F1: 0.8073333333333333, Fit Time: 0.19645071029663086
-------------Test-------------
Accuracy:0.766, Recall:0.766, Precision:0.766, F1:0.766 


## Stroke

In [281]:
df_stroke=pd.read_csv('./data/stroke/clean_stroke.csv',index_col=0)

#Ordinal Encoding/One Hot Encoding
encoded_df = pd.DataFrame()
df_stroke['stroke']=df_stroke['stroke'].astype('int')
encoded_df['age']=df_stroke['age'].replace({'0-20':0,'20-40':1,'40-60':2,'60 & Above':3})
encoded_df['ever_married']=df_stroke['ever_married'].replace({"No":0,'Yes':1})
encoded_df['Residence_type']=df_stroke['Residence_type'].replace({'Rural':0,'Urban':1})
encoded_df['avg_glucose_level']=df_stroke['avg_glucose_level'].replace({'50-100':0,'100-150':1,'150-200':2,'200-250':3,'250-300':4})
encoded_df['bmi']=df_stroke['bmi'].replace({"10-20":0,'20-30':1,'30-40':2,'40-50':3,'50-60':4,'60-70':5})
encoded_df['hypertension']=df_stroke['hypertension']
encoded_df['heart_disease']=df_stroke['heart_disease']
encoded_df = encoded_df.merge(pd.get_dummies(df_stroke['work_type']),left_index=True,right_index=True).merge(pd.get_dummies(df_stroke['smoking_status']),left_index=True,right_index=True).merge(df_stroke['gender'],left_index=True,right_index=True)
encoded_df['stroke']=df_stroke['stroke']

#Train test split
train_df,test_df=train_test_split(encoded_df,test_size=0.25,random_state=42)
train_X = train_df[train_df.columns[0:-2]]
train_Y = train_df[train_df.columns[-1]]
test_X = test_df[test_df.columns[0:-2]]
test_Y = test_df[test_df.columns[-1]]

#Calculating Fit Time and Train Accuracy
clf=RandomForestClassifier(max_depth=4,n_jobs=-1,random_state=41)
start=time.time()
clf.fit(train_X,train_Y)
end=time.time()
train_accuracy=accuracy_score(clf.predict(train_X),train_Y)
train_recall=recall_score(clf.predict(train_X),train_Y,pos_label=0)
train_precision=precision_score(clf.predict(train_X),train_Y,pos_label=0)
train_f1=f1_score(clf.predict(train_X),train_Y,pos_label=0)

#Mean Accuracy
test_accuracy=accuracy_score(clf.predict(test_X),test_Y)
test_recall=recall_score(clf.predict(test_X),test_Y,pos_label=0)
test_precision=precision_score(clf.predict(test_X),test_Y,pos_label=0)
test_f1=f1_score(clf.predict(test_X),test_Y,pos_label=0)
print('-------------Train-------------')
print(f'Accuracy: {train_accuracy}, Recall: {train_recall}, Precision: {train_precision}, F1: {train_f1}, Fit Time: {end-start}')
print('-------------Test-------------')
print(f'Accuracy:{test_accuracy}, Recall:{test_recall}, Precision:{test_precision}, F1:{test_f1} ')





-------------Train-------------
Accuracy: 0.8954489544895449, Recall: 0.8950617283950617, Precision: 1.0, F1: 0.9446254071661238, Fit Time: 0.21444106101989746
-------------Test-------------
Accuracy:0.8970588235294118, Recall:0.8970588235294118, Precision:1.0, F1:0.9457364341085273 


In [254]:
encoded_df['stroke'].value_counts()

0    969
1    116
Name: stroke, dtype: int64

## Tic-Tac-Toe

In [280]:
df_ttt=pd.read_csv('./data/tic_tac_toe/clean_tic-tac-toe.csv',index_col=0)

#One hot encoding
encoded_df=pd.DataFrame()
encoded_df['class'] = df_ttt['class']
encoded_df = encoded_df.merge(pd.get_dummies(df_ttt['top-left']).rename(columns={'b':'tl-b','o':'tl-o','x':'tl-x'}),left_index=True,right_index=True).merge(pd.get_dummies(df_ttt['top-middle']).rename(columns={'b':'tm-b','o':'tm-o','x':'tm-x'}),left_index=True,right_index=True).merge(pd.get_dummies(df_ttt['top-right']).rename(columns={'b':'tr-b','o':'tr-o','x':'tr-x'}),left_index=True,right_index=True).merge(pd.get_dummies(df_ttt['middle-left']).rename(columns={'b':'ml-b','o':'ml-o','x':'ml-x'}),left_index=True,right_index=True).merge(pd.get_dummies(df_ttt['middle-middle']).rename(columns={'b':'mm-b','o':'mm-o','x':'mm-x'}),left_index=True,right_index=True).merge(pd.get_dummies(df_ttt['middle-right']).rename(columns={'b':'mr-b','o':'mr-o','x':'mr-x'}),left_index=True,right_index=True).merge(pd.get_dummies(df_ttt['bottom-left']).rename(columns={'b':'bl-b','o':'bl-o','x':'bl-x'}),left_index=True,right_index=True).merge(pd.get_dummies(df_ttt['bottom-middle']).rename(columns={'b':'bm-b','o':'bm-o','x':'bm-x'}),left_index=True,right_index=True).merge(pd.get_dummies(df_ttt['bottom-right']).rename(columns={'b':'br-b','o':'br-o','x':'br-x'}),left_index=True,right_index=True)

#Train Test Split
train_df,test_df = train_test_split(encoded_df,test_size=0.25,random_state=42)
train_X = train_df[train_df.columns[1:]]
train_Y = train_df[train_df.columns[0]]
test_X = test_df[test_df.columns[1:]]
test_Y = test_df[test_df.columns[0]]

#Calculating Fit Time and Train Accuracy
clf=RandomForestClassifier(max_depth=4,n_jobs=-1,random_state=41)
start=time.time()
clf.fit(train_X,train_Y)
end=time.time()
train_accuracy=accuracy_score(clf.predict(train_X),train_Y)
train_recall=recall_score(clf.predict(train_X),train_Y,pos_label='positive')
train_precision=precision_score(clf.predict(train_X),train_Y,pos_label='positive')
train_f1=f1_score(clf.predict(train_X),train_Y,pos_label='positive')

#Mean Accuracy
test_accuracy=accuracy_score(clf.predict(test_X),test_Y)
test_recall=recall_score(clf.predict(test_X),test_Y,pos_label='positive')
test_precision=precision_score(clf.predict(test_X),test_Y,pos_label='positive')
test_f1=f1_score(clf.predict(test_X),test_Y,pos_label='positive')
print('-------------Train-------------')
print(f'Accuracy: {train_accuracy}, Recall: {train_recall}, Precision: {train_precision}, F1: {train_f1}, Fit Time: {end-start}')
print('-------------Test-------------')
print(f'Accuracy:{test_accuracy}, Recall:{test_recall}, Precision:{test_precision}, F1:{test_f1} ')

-------------Train-------------
Accuracy: 0.8161559888579387, Recall: 0.7802385008517888, Precision: 0.9934924078091106, F1: 0.8740458015267176, Fit Time: 0.1949009895324707
-------------Test-------------
Accuracy:0.8125, Recall:0.7941176470588235, Precision:0.9818181818181818, F1:0.8780487804878049 


## Zoo

In [297]:
df_zoo =pd.read_csv('./data/zoo/clean_zoo.csv',index_col=0)


#Train Test Split
train_df,test_df = train_test_split(df_zoo,test_size=0.25,random_state=42)
train_X = train_df[train_df.columns[0:-2]]
train_Y = train_df[train_df.columns[-1]]
test_X = test_df[test_df.columns[0:-2]]
test_Y = test_df[test_df.columns[-1]]

#Calculating Fit Time and Train Accuracy
clf=RandomForestClassifier(max_depth=4,n_jobs=-1,random_state=41)
start=time.time()
clf.fit(train_X,train_Y)
end=time.time()
train_accuracy=accuracy_score(clf.predict(train_X),train_Y)
train_recall=recall_score(clf.predict(train_X),train_Y,average='micro')
train_precision=precision_score(clf.predict(train_X),train_Y,average='micro')
train_f1=f1_score(clf.predict(train_X),train_Y,average='micro')

#Mean Accuracy
test_accuracy=accuracy_score(clf.predict(test_X),test_Y)
test_recall=recall_score(clf.predict(test_X),test_Y,average='micro')
test_precision=precision_score(clf.predict(test_X),test_Y,average='micro')
test_f1=f1_score(clf.predict(test_X),test_Y,average='micro')
print('-------------Train-------------')
print(f'Accuracy: {train_accuracy}, Recall: {train_recall}, Precision: {train_precision}, F1: {train_f1}, Fit Time: {end-start}')
print('-------------Test-------------')
print(f'Accuracy:{test_accuracy}, Recall:{test_recall}, Precision:{test_precision}, F1:{test_f1} ')




-------------Train-------------
Accuracy: 1.0, Recall: 1.0, Precision: 1.0, F1: 1.0, Fit Time: 0.22010493278503418
-------------Test-------------
Accuracy:0.9615384615384616, Recall:0.9615384615384616, Precision:0.9615384615384616, F1:0.9615384615384616 
