In [61]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import time
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
import warnings
warnings.filterwarnings("ignore")

## breast_w

In [295]:
df_breast_w=pd.read_csv('./data/breast_w/clean_breast_w.csv',index_col=0)

train_df,test_df=train_test_split(df_breast_w,test_size=0.25,random_state=42)
train_X = train_df[train_df.columns[0:-2]]
train_Y = train_df[train_df.columns[-1]]
test_X = test_df[test_df.columns[0:-2]]
test_Y=test_df[test_df.columns[-1]]


#Calculating Fit Time and Train Accuracy
clf=RandomForestClassifier(max_depth=4,n_jobs=-1,random_state=41)
start=time.time()
clf.fit(train_X,train_Y)
end=time.time()
train_accuracy=accuracy_score(clf.predict(train_X),train_Y)
train_recall=recall_score(clf.predict(train_X),train_Y,pos_label=4)
train_precision=precision_score(clf.predict(train_X),train_Y,pos_label=4)
train_f1=f1_score(clf.predict(train_X),train_Y,pos_label=4)

#Mean Accuracy
test_accuracy=accuracy_score(clf.predict(test_X),test_Y)
test_recall=recall_score(clf.predict(test_X),test_Y,pos_label=4)
test_precision=precision_score(clf.predict(test_X),test_Y,pos_label=4)
test_f1=f1_score(clf.predict(test_X),test_Y,pos_label=4)
print('-------------Train-------------')
print(f'Accuracy: {train_accuracy}, Recall: {train_recall}, Precision: {train_precision}, F1: {train_f1}, Fit Time: {end-start}')
print('-------------Test-------------')
print(f'Test Accuracy:{test_accuracy}, Test Recall:{test_recall}, Test Precision:{test_precision}, Test F1:{test_f1} ')


-------------Train-------------
Accuracy: 0.9765625, Recall: 0.9491525423728814, Precision: 0.9824561403508771, F1: 0.9655172413793103, Fit Time: 0.22424983978271484
-------------Test-------------
Test Accuracy:0.9590643274853801, Test Recall:0.9841269841269841, Test Precision:0.9117647058823529, Test F1:0.9465648854961831 


## Gender

In [287]:
df_gender = pd.read_csv('./data/gender/gender.csv',index_col=0)

#One Hot Encoding
encoded_df = pd.get_dummies(df_gender[df_gender.columns[0:-2]]).merge(df_gender[df_gender.columns[-1]],left_index=True,right_index=True)

#Train test split
train_df,test_df=train_test_split(encoded_df,test_size=0.25,random_state=42)
train_X = train_df[train_df.columns[0:-2]]
train_Y = train_df[train_df.columns[-1]]
test_X = test_df[test_df.columns[0:-2]]
test_Y = test_df[test_df.columns[-1]]

#Calculating Fit Time and Train Accuracy
clf=RandomForestClassifier(max_depth=4,n_jobs=-1,random_state=41)
start=time.time()
clf.fit(train_X,train_Y)
end=time.time()
train_accuracy=accuracy_score(clf.predict(train_X),train_Y)
train_recall=recall_score(clf.predict(train_X),train_Y,pos_label='M')
train_precision=precision_score(clf.predict(train_X),train_Y,pos_label="M")
train_f1=f1_score(clf.predict(train_X),train_Y,pos_label="M")

#Mean Accuracy
test_accuracy=accuracy_score(clf.predict(test_X),test_Y)
test_recall=recall_score(clf.predict(test_X),test_Y,pos_label="M")
test_precision=precision_score(clf.predict(test_X),test_Y,pos_label="M")
test_f1=f1_score(clf.predict(test_X),test_Y,pos_label="M")
print('-------------Train-------------')
print(f'Accuracy: {train_accuracy}, Recall: {train_recall}, Precision: {train_precision}, F1: {train_f1}, Fit Time: {end-start}')
print('-------------Test-------------')
print(f'Accuracy:{test_accuracy}, Recall:{test_recall}, Precision:{test_precision}, F1:{test_f1} ')



-------------Train-------------
Accuracy: 0.8979591836734694, Recall: 0.8571428571428571, Precision: 0.96, F1: 0.9056603773584904, Fit Time: 0.19206023216247559
-------------Test-------------
Accuracy:0.5882352941176471, Recall:0.5555555555555556, Precision:0.625, F1:0.5882352941176471 


## German

In [21]:
df_german=pd.read_csv('./data/german/german.data',header=None)
df_german.columns=['status_of_existing_checking_account duration_(months) credit_history purpose credit_amount savings_account/bonds present_employment_since installment_rate personal_status_sex other_debtors present_residence property age other_installment_plans housing number_of_existing_credits job no._of_people_being_liable_to_provide_maintenance telephone foreign_worker class']
df_german[df_german.columns[0].split(' ')] = df_german.iloc[:,0].str.split(' ', expand=True)
df_german.drop(df_german.columns[0], axis=1, inplace=True)

df_german['age']=df_german['age'].astype('int')
df_german['duration_(months)']=df_german['duration_(months)'].astype('int')
df_german['credit_amount']=df_german['credit_amount'].astype('int')

#One Hot Encoding & Ordinal Encoding
encoded_df = pd.DataFrame()
encoded_df['status_of_existing_checking_account']=df_german['status_of_existing_checking_account'].replace({"A14":0,"A11":1,"A12":2,"A13":3})
encoded_df['savings_account/bonds']=df_german['savings_account/bonds'].replace({"A65":0,"A61":1,"A62":2,"A63":3,"A64":4})
encoded_df['present_employment_since']=df_german['present_employment_since'].replace({"A71":0,"A72":1,'A73':2,'A74':3,"A75":4})
encoded_df['job']=df_german['job'].replace({'A171':0,"A172":1,'A173':2,'A174':3})
encoded_df['age']=df_german['age']
encoded_df['duration_(months)']=df_german['duration_(months)']
encoded_df['credit_amount']=df_german['duration_(months)']
encoded_df= encoded_df.merge(pd.get_dummies(df_german['credit_history']).merge(pd.get_dummies(df_german['purpose']),left_index=True,right_index=True).merge(pd.get_dummies(df_german['personal_status_sex']),left_index=True,right_index=True).merge(pd.get_dummies(df_german['other_debtors']),left_index=True,right_index=True).merge(pd.get_dummies(df_german['property']),left_index=True,right_index=True).merge(pd.get_dummies(df_german['other_installment_plans']),left_index=True,right_index=True).merge(pd.get_dummies(df_german['housing']),left_index=True,right_index=True).merge(pd.get_dummies(df_german['telephone']),left_index=True,right_index=True).merge(pd.get_dummies(df_german['foreign_worker']),left_index=True,right_index=True),left_index=True,right_index=True)
encoded_df['class']=df_german['class']


#Train test split
train_df,test_df=train_test_split(encoded_df,test_size=0.25,random_state=42)
train_X = train_df[train_df.columns[0:-2]]
train_Y = train_df[train_df.columns[-1]]
test_X = test_df[test_df.columns[0:-2]]
test_Y = test_df[test_df.columns[-1]]

#Calculating Fit Time and Train Accuracy
clf=RandomForestClassifier(max_depth=4,n_jobs=-1,random_state=41)
start=time.time()
clf.fit(train_X,train_Y)
end=time.time()
train_accuracy=accuracy_score(clf.predict(train_X),train_Y)
train_recall=recall_score(clf.predict(train_X),train_Y,pos_label='1')
train_precision=precision_score(clf.predict(train_X),train_Y,pos_label='1')
train_f1=f1_score(clf.predict(train_X),train_Y,pos_label='1')

#Mean Accuracy
test_accuracy=accuracy_score(clf.predict(test_X),test_Y)
test_recall=recall_score(clf.predict(test_X),test_Y,pos_label='1')
test_precision=precision_score(clf.predict(test_X),test_Y,pos_label='1')
test_f1=f1_score(clf.predict(test_X),test_Y,pos_label='1')
print('-------------Train-------------')
print(f'Accuracy: {train_accuracy}, Recall: {train_recall}, Precision: {train_precision}, F1: {train_f1}, Fit Time: {end-start}')
print('-------------Test-------------')
print(f'Accuracy:{test_accuracy}, Recall:{test_recall}, Precision:{test_precision}, F1:{test_f1} ')



-------------Train-------------
Accuracy: 0.7466666666666667, Recall: 0.7378223495702005, Precision: 0.9865900383141762, F1: 0.8442622950819673, Fit Time: 0.16454195976257324
-------------Test-------------
Accuracy:0.724, Recall:0.7339055793991416, Precision:0.9606741573033708, F1:0.8321167883211679 


## Hepatitis

In [16]:
df_hepatitis=pd.read_csv('./data/hepatitis/hepatitis.csv',header=None)

df_hepatitis.columns=['class,age,sex,steroid,antivirals,fatigue,malaise,anorexia,big liver,firm liver,palpable spleen,spiders,ascites,varices,bilirubin,phosphate,sgot,albumin,protime,histology']
df_hepatitis[df_hepatitis.columns[0].split(',')] = df_hepatitis.iloc[:,0].str.split(',', expand=True)
df_hepatitis.drop(df_hepatitis.columns[0],axis=1,inplace=True)
df_hepatitis=df_hepatitis.replace('?',np.nan).dropna().reset_index(drop=True)
df_hepatitis['age']=df_hepatitis['age'].astype('int')
df_hepatitis['phosphate']=df_hepatitis['phosphate'].astype('int')
df_hepatitis['sgot']=df_hepatitis['sgot'].astype('int')
df_hepatitis['albumin']=df_hepatitis['albumin'].astype('float')
df_hepatitis['protime']=df_hepatitis['protime'].astype('float')
df_hepatitis['bilirubin']=df_hepatitis['bilirubin'].astype('float')

#Train Test Split
train_df,test_df = train_test_split(df_hepatitis,test_size=0.25,random_state=42)
train_X = train_df[train_df.columns[1:]]
train_Y = train_df[train_df.columns[0]]
test_X = test_df[test_df.columns[1:]]
test_Y = test_df[test_df.columns[0]]

#Calculating Fit Time and Train Accuracy
clf=RandomForestClassifier(max_depth=4,n_jobs=-1,random_state=41)
start=time.time()
clf.fit(train_X,train_Y)
end=time.time()
train_accuracy=accuracy_score(clf.predict(train_X),train_Y)
train_recall=recall_score(clf.predict(train_X),train_Y,pos_label='2')
train_precision=precision_score(clf.predict(train_X),train_Y,pos_label='2')
train_f1=f1_score(clf.predict(train_X),train_Y,pos_label='2')

#Mean Accuracy
test_accuracy=accuracy_score(clf.predict(test_X),test_Y)
test_recall=recall_score(clf.predict(test_X),test_Y,pos_label='2')
test_precision=precision_score(clf.predict(test_X),test_Y,pos_label='2')
test_f1=f1_score(clf.predict(test_X),test_Y,pos_label='2')
print('-------------Train-------------')
print(f'Accuracy: {train_accuracy}, Recall: {train_recall}, Precision: {train_precision}, F1: {train_f1}, Fit Time: {end-start}')
print('-------------Test-------------')
print(f'Accuracy:{test_accuracy}, Recall:{test_recall}, Precision:{test_precision}, F1:{test_f1} ')

-------------Train-------------
Accuracy: 0.9833333333333333, Recall: 0.9807692307692307, Precision: 1.0, F1: 0.9902912621359222, Fit Time: 0.13033175468444824
-------------Test-------------
Accuracy:0.9, Recall:0.8888888888888888, Precision:1.0, F1:0.9411764705882353 


## Mobile

In [17]:
df_mobile=pd.read_csv('./data/mobile/train.csv')

#Train Test Split
train_df,test_df = train_test_split(df_mobile,test_size=0.25,random_state=42)
train_X = train_df[train_df.columns[0:-2]]
train_Y = train_df[train_df.columns[-1]]
test_X = test_df[test_df.columns[0:-2]]
test_Y = test_df[test_df.columns[-1]]

#Calculating Fit Time and Train Accuracy
clf=RandomForestClassifier(max_depth=4,n_jobs=-1,random_state=41)
start=time.time()
clf.fit(train_X,train_Y)
end=time.time()
train_accuracy=accuracy_score(clf.predict(train_X),train_Y)
train_recall=recall_score(clf.predict(train_X),train_Y,average='micro')
train_precision=precision_score(clf.predict(train_X),train_Y,average='micro')
train_f1=f1_score(clf.predict(train_X),train_Y,average='micro')

#Mean Accuracy
test_accuracy=accuracy_score(clf.predict(test_X),test_Y)
test_recall=recall_score(clf.predict(test_X),test_Y,average='micro')
test_precision=precision_score(clf.predict(test_X),test_Y,average='micro')
test_f1=f1_score(clf.predict(test_X),test_Y,average='micro')
print('-------------Train-------------')
print(f'Accuracy: {train_accuracy}, Recall: {train_recall}, Precision: {train_precision}, F1: {train_f1}, Fit Time: {end-start}')
print('-------------Test-------------')
print(f'Accuracy:{test_accuracy}, Recall:{test_recall}, Precision:{test_precision}, F1:{test_f1} ')




-------------Train-------------
Accuracy: 0.896, Recall: 0.896, Precision: 0.896, F1: 0.8960000000000001, Fit Time: 0.18169713020324707
-------------Test-------------
Accuracy:0.824, Recall:0.824, Precision:0.824, F1:0.824 


## Stroke

In [22]:
df_stroke=pd.read_csv('./data/stroke/clean_stroke.csv',index_col=0)

df_train=pd.read_csv('./data/stroke/train.csv')
df_test=pd.read_csv('./data/stroke/test.csv')
df_stroke=pd.concat([df_train,df_test])
# Drop all rows with Nan values and id column
df_stroke=df_stroke.drop(columns=['id']).dropna()

df_stroke=df_stroke.replace('*82','82')

#Converting data type from string to numerical
df_stroke['age']=df_stroke['age'].astype('int')

#Ordinal Encoding/One Hot Encoding
encoded_df = pd.DataFrame()
encoded_df['ever_married']=df_stroke['ever_married'].replace({"No":0,'Yes':1})
encoded_df['Residence_type']=df_stroke['Residence_type'].replace({'Rural':0,'Urban':1})
encoded_df['hypertension']=df_stroke['hypertension']
encoded_df['heart_disease']=df_stroke['heart_disease']
encoded_df['age']=df_stroke['age']
encoded_df['avg_glucose_level']=df_stroke['avg_glucose_level']
encoded_df['bmi']=df_stroke['bmi']

encoded_df = encoded_df.merge(pd.get_dummies(df_stroke['work_type']),left_index=True,right_index=True).merge(pd.get_dummies(df_stroke['smoking_status']),left_index=True,right_index=True).merge(df_stroke['gender'],left_index=True,right_index=True)
encoded_df['stroke']=df_stroke['stroke']

#Train test split
train_df,test_df=train_test_split(encoded_df,test_size=0.25,random_state=42)
train_X = train_df[train_df.columns[0:-2]]
train_Y = train_df[train_df.columns[-1]]
test_X = test_df[test_df.columns[0:-2]]
test_Y = test_df[test_df.columns[-1]]

#Calculating Fit Time and Train Accuracy
clf=RandomForestClassifier(max_depth=4,n_jobs=-1,random_state=41)
start=time.time()
clf.fit(train_X,train_Y)
end=time.time()
train_accuracy=accuracy_score(clf.predict(train_X),train_Y)
train_recall=recall_score(clf.predict(train_X),train_Y,pos_label=0)
train_precision=precision_score(clf.predict(train_X),train_Y,pos_label=0)
train_f1=f1_score(clf.predict(train_X),train_Y,pos_label=0)

#Mean Accuracy
test_accuracy=accuracy_score(clf.predict(test_X),test_Y)
test_recall=recall_score(clf.predict(test_X),test_Y,pos_label=0)
test_precision=precision_score(clf.predict(test_X),test_Y,pos_label=0)
test_f1=f1_score(clf.predict(test_X),test_Y,pos_label=0)
print('-------------Train-------------')
print(f'Accuracy: {train_accuracy}, Recall: {train_recall}, Precision: {train_precision}, F1: {train_f1}, Fit Time: {end-start}')
print('-------------Test-------------')
print(f'Accuracy:{test_accuracy}, Recall:{test_recall}, Precision:{test_precision}, F1:{test_f1} ')

-------------Train-------------
Accuracy: 0.9015990159901599, Recall: 0.9006211180124224, Precision: 1.0, F1: 0.9477124183006536, Fit Time: 0.15999889373779297
-------------Test-------------
Accuracy:0.8970588235294118, Recall:0.8970588235294118, Precision:1.0, F1:0.9457364341085273 


## Tic-Tac-Toe

In [280]:
df_ttt=pd.read_csv('./data/tic_tac_toe/clean_tic-tac-toe.csv',index_col=0)

#One hot encoding
encoded_df=pd.DataFrame()
encoded_df['class'] = df_ttt['class']
encoded_df = encoded_df.merge(pd.get_dummies(df_ttt['top-left']).rename(columns={'b':'tl-b','o':'tl-o','x':'tl-x'}),left_index=True,right_index=True).merge(pd.get_dummies(df_ttt['top-middle']).rename(columns={'b':'tm-b','o':'tm-o','x':'tm-x'}),left_index=True,right_index=True).merge(pd.get_dummies(df_ttt['top-right']).rename(columns={'b':'tr-b','o':'tr-o','x':'tr-x'}),left_index=True,right_index=True).merge(pd.get_dummies(df_ttt['middle-left']).rename(columns={'b':'ml-b','o':'ml-o','x':'ml-x'}),left_index=True,right_index=True).merge(pd.get_dummies(df_ttt['middle-middle']).rename(columns={'b':'mm-b','o':'mm-o','x':'mm-x'}),left_index=True,right_index=True).merge(pd.get_dummies(df_ttt['middle-right']).rename(columns={'b':'mr-b','o':'mr-o','x':'mr-x'}),left_index=True,right_index=True).merge(pd.get_dummies(df_ttt['bottom-left']).rename(columns={'b':'bl-b','o':'bl-o','x':'bl-x'}),left_index=True,right_index=True).merge(pd.get_dummies(df_ttt['bottom-middle']).rename(columns={'b':'bm-b','o':'bm-o','x':'bm-x'}),left_index=True,right_index=True).merge(pd.get_dummies(df_ttt['bottom-right']).rename(columns={'b':'br-b','o':'br-o','x':'br-x'}),left_index=True,right_index=True)

#Train Test Split
train_df,test_df = train_test_split(encoded_df,test_size=0.25,random_state=42)
train_X = train_df[train_df.columns[1:]]
train_Y = train_df[train_df.columns[0]]
test_X = test_df[test_df.columns[1:]]
test_Y = test_df[test_df.columns[0]]

#Calculating Fit Time and Train Accuracy
clf=RandomForestClassifier(max_depth=4,n_jobs=-1,random_state=41)
start=time.time()
clf.fit(train_X,train_Y)
end=time.time()
train_accuracy=accuracy_score(clf.predict(train_X),train_Y)
train_recall=recall_score(clf.predict(train_X),train_Y,pos_label='positive')
train_precision=precision_score(clf.predict(train_X),train_Y,pos_label='positive')
train_f1=f1_score(clf.predict(train_X),train_Y,pos_label='positive')

#Mean Accuracy
test_accuracy=accuracy_score(clf.predict(test_X),test_Y)
test_recall=recall_score(clf.predict(test_X),test_Y,pos_label='positive')
test_precision=precision_score(clf.predict(test_X),test_Y,pos_label='positive')
test_f1=f1_score(clf.predict(test_X),test_Y,pos_label='positive')
print('-------------Train-------------')
print(f'Accuracy: {train_accuracy}, Recall: {train_recall}, Precision: {train_precision}, F1: {train_f1}, Fit Time: {end-start}')
print('-------------Test-------------')
print(f'Accuracy:{test_accuracy}, Recall:{test_recall}, Precision:{test_precision}, F1:{test_f1} ')

-------------Train-------------
Accuracy: 0.8161559888579387, Recall: 0.7802385008517888, Precision: 0.9934924078091106, F1: 0.8740458015267176, Fit Time: 0.1949009895324707
-------------Test-------------
Accuracy:0.8125, Recall:0.7941176470588235, Precision:0.9818181818181818, F1:0.8780487804878049 


## Zoo

In [297]:
df_zoo =pd.read_csv('./data/zoo/clean_zoo.csv',index_col=0)


#Train Test Split
train_df,test_df = train_test_split(df_zoo,test_size=0.25,random_state=42)
train_X = train_df[train_df.columns[0:-2]]
train_Y = train_df[train_df.columns[-1]]
test_X = test_df[test_df.columns[0:-2]]
test_Y = test_df[test_df.columns[-1]]

#Calculating Fit Time and Train Accuracy
clf=RandomForestClassifier(max_depth=4,n_jobs=-1,random_state=41)
start=time.time()
clf.fit(train_X,train_Y)
end=time.time()
train_accuracy=accuracy_score(clf.predict(train_X),train_Y)
train_recall=recall_score(clf.predict(train_X),train_Y,average='micro')
train_precision=precision_score(clf.predict(train_X),train_Y,average='micro')
train_f1=f1_score(clf.predict(train_X),train_Y,average='micro')

#Mean Accuracy
test_accuracy=accuracy_score(clf.predict(test_X),test_Y)
test_recall=recall_score(clf.predict(test_X),test_Y,average='micro')
test_precision=precision_score(clf.predict(test_X),test_Y,average='micro')
test_f1=f1_score(clf.predict(test_X),test_Y,average='micro')
print('-------------Train-------------')
print(f'Accuracy: {train_accuracy}, Recall: {train_recall}, Precision: {train_precision}, F1: {train_f1}, Fit Time: {end-start}')
print('-------------Test-------------')
print(f'Accuracy:{test_accuracy}, Recall:{test_recall}, Precision:{test_precision}, F1:{test_f1} ')




-------------Train-------------
Accuracy: 1.0, Recall: 1.0, Precision: 1.0, F1: 1.0, Fit Time: 0.22010493278503418
-------------Test-------------
Accuracy:0.9615384615384616, Recall:0.9615384615384616, Precision:0.9615384615384616, F1:0.9615384615384616 
