In [161]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, RepeatedKFold, train_test_split
from sklearn.metrics import accuracy_score

In [162]:
names1 = ["Age","workclass","fnlwgt","education","education-num", "marital-status","occupation", "relationship", "race", "sex","capital-gain","capital-loss","hours-per-week","native-country",""]
dataset1 = pd.read_csv(r"adult.data", names=names1)
dataset1.reset_index(drop=True,inplace=True)
dataset1['sex'].replace({'M':1, 'F':0,'I':0 }, inplace=True)
X = dataset1.values[0:,0:13]
Y = dataset1.values[:,13]
dataset1

Unnamed: 0,Age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Unnamed: 15
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


### 

In [163]:
result = dataset1.select_dtypes(include='number')
X1 = result.values[0:,0:5]
Y1 = result.values[:,5]
result

Unnamed: 0,Age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13
2,38,215646,9,0,0,40
3,53,234721,7,0,0,40
4,28,338409,13,0,0,40
...,...,...,...,...,...,...
32556,27,257302,12,0,0,38
32557,40,154374,9,0,0,40
32558,58,151910,9,0,0,40
32559,22,201490,9,0,0,20


In [164]:
Gini = DecisionTreeClassifier(criterion='gini')
CV = RepeatedKFold(n_splits = 10, n_repeats = 10, random_state = 1)
Acc_n_Std = cross_val_score(Gini, X1, Y1, scoring='accuracy', cv=CV, n_jobs = -1)
print("Dataset 1 Gini without pruning & Repeated 10 X 10 folds:")
print('Accuracy: ',100*np.mean(Acc_n_Std))
print('Standard Deviation: ',np.std(Acc_n_Std))

Dataset 1 Gini without pruning & Repeated 10 X 10 folds:
Accuracy:  25.386803060352342
Standard Deviation:  0.008118381610183472


In [165]:
Gini = DecisionTreeClassifier(criterion='gini', ccp_alpha = 0.015)
CV = RepeatedKFold(n_splits = 10, n_repeats = 10, random_state = 1)
Acc_n_Std = cross_val_score(Gini, X1, Y1, scoring='accuracy', cv=CV, n_jobs = -1)
print('Dataset 1 Gini with pruning & Repeated 10 x 10 folds:')
print('Accuracy: ',100*np.mean(Acc_n_Std))
print('Standard Deviation: ',np.std(Acc_n_Std))

Dataset 1 Gini with pruning & Repeated 10 x 10 folds:
Accuracy:  46.73381580704271
Standard Deviation:  0.009252818665462475


In [166]:
Gini = DecisionTreeClassifier(criterion='gini')
X_train, X_test, Y_train, Y_test = train_test_split(X1, Y1, test_size=0.3, random_state=5)
Gini.fit(X_train, Y_train)
predictions = Gini.predict(X_test)
print('Dataset 1 Gini without pruning & Holdout Approach')
print ("Accuracy : ",accuracy_score(Y_test,predictions)*100)

Dataset 1 Gini without pruning & Holdout Approach
Accuracy :  24.966731497594434


In [167]:
Gini = DecisionTreeClassifier(criterion='gini', ccp_alpha = 0.015)
X_train, X_test, Y_train, Y_test = train_test_split(X1, Y1, test_size=0.3, random_state=5)
Gini.fit(X_train, Y_train)
predictions = Gini.predict(X_test)
print("Dataset 1 Gini with pruning & Repeated 10 X 10 folds:")
print ("Accuracy : ",accuracy_score(Y_test,predictions)*100)

Dataset 1 Gini with pruning & Repeated 10 X 10 folds:
Accuracy :  46.862524311597916


In [168]:
Entropy = DecisionTreeClassifier(criterion='entropy')
CV = RepeatedKFold(n_splits = 10, n_repeats = 10, random_state = 1)
Acc_n_Std = cross_val_score(Entropy, X1, Y1, scoring='accuracy', cv=CV, n_jobs = -1)
print('Dataset 1 Entropy without pruning & Repeated 10 X 10 folds')
print('Accuracy: ',100*np.mean(Acc_n_Std))
print('Standard Deviation: ',np.std(Acc_n_Std))

Dataset 1 Entropy without pruning & Repeated 10 X 10 folds
Accuracy:  25.702225559916688
Standard Deviation:  0.007943086455284845


In [169]:
Entropy = DecisionTreeClassifier(criterion='entropy', ccp_alpha = 0.015)
CV = RepeatedKFold(n_splits = 10, n_repeats = 10, random_state = 1)
Acc_n_Std = cross_val_score(Entropy, X1, Y1, scoring='accuracy', cv=CV, n_jobs = -1)
print('Dataset 1 Entropy with pruning & Repeated 10 X 10 folds')
print('Accuracy: ',100*np.mean(Acc_n_Std))
print('Standard Deviation: ',np.std(Acc_n_Std))

Dataset 1 Entropy with pruning & Repeated 10 X 10 folds
Accuracy:  46.73381580704271
Standard Deviation:  0.009252818665462475


In [170]:
Entropy = DecisionTreeClassifier(criterion='entropy')
X_train, X_test, Y_train, Y_test = train_test_split(X1, Y1, test_size=0.3, random_state=5)
Entropy.fit(X_train, Y_train)
predictions = Gini.predict(X_test)
print('Dataset 1 Entropy without pruning & Holdout Approach')
print ("Accuracy : ",accuracy_score(Y_test,predictions)*100)

Dataset 1 Entropy without pruning & Holdout Approach
Accuracy :  46.862524311597916


In [171]:
Entropy = DecisionTreeClassifier(criterion='entropy', ccp_alpha = 0.015)
X_train, X_test, Y_train, Y_test = train_test_split(X1, Y1, test_size=0.3, random_state=5)
Entropy.fit(X_train, Y_train)
print('Dataset 1 Entropy with pruning & Holdout Approach')
predictions = Gini.predict(X_test)
print ("Accuracy : ",accuracy_score(Y_test,predictions)*100)

Dataset 1 Entropy with pruning & Holdout Approach
Accuracy :  46.862524311597916


In [172]:
names = ["Sex", "Length", "Diameter", "Height", "Whole_weight", "Shucked_weight", "Viscera_weight", "Shell_weight", "Rings"]
dataset = pd.read_csv(r"abalone.data", names=names)
dataset['Sex'].replace({'M':1, 'F':0, 'I':0}, inplace=True)
X = dataset.values[:,0:8]
Y = dataset.values[:,8]
dataset

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
0,1,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,0,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,1,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,0,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,0,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,1,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,1,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,0,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [173]:
Gini = DecisionTreeClassifier(criterion='gini')
CV = RepeatedKFold(n_splits = 10, n_repeats = 10, random_state = 1)
Acc_n_Std = cross_val_score(Gini, X, Y, scoring='accuracy', cv=CV, n_jobs = -1)
print("Dataset 2 Gini without pruning & Repeated 10 X 10 folds:")
print('Accuracy: ',100*np.mean(Acc_n_Std))
print('Standard Deviation: ',np.std(Acc_n_Std))

Dataset 2 Gini without pruning & Repeated 10 X 10 folds:
Accuracy:  19.66755590742716
Standard Deviation:  0.01975596333293374


In [174]:
Gini = DecisionTreeClassifier(criterion='gini', ccp_alpha = 0.015)
CV = RepeatedKFold(n_splits = 10, n_repeats = 10, random_state = 1)
Acc_n_Std = cross_val_score(Gini, X, Y, scoring='accuracy', cv=CV, n_jobs = -1)
print("Dataset 2 Gini with pruning & Repeated 10 X 10 folds:")
print('Accuracy: ',100*np.mean(Acc_n_Std))
print('Standard Deviation: ',np.std(Acc_n_Std))

Dataset 2 Gini with pruning & Repeated 10 X 10 folds:
Accuracy:  20.46697187704382
Standard Deviation:  0.01648496720200174


In [175]:
Gini = DecisionTreeClassifier(criterion='gini')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=5)
Gini.fit(X_train, Y_train)
predictions = Gini.predict(X_test)
print('Dataset 2 Gini without pruning & Holdout Approach')
print ("Accuracy : ",accuracy_score(Y_test,predictions)*100)

Dataset 2 Gini without pruning & Holdout Approach
Accuracy :  18.740031897926634


In [176]:
Gini = DecisionTreeClassifier(criterion='gini', ccp_alpha = 0.015)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=5)
Gini.fit(X_train, Y_train)
predictions = Gini.predict(X_test)
print('Dataset 2 Gini with pruning & Holdout Approach')
print ("Accuracy : ",accuracy_score(Y_test,predictions)*100)

Dataset 2 Gini with pruning & Holdout Approach
Accuracy :  20.73365231259968


In [177]:
Entropy = DecisionTreeClassifier(criterion='entropy')
CV = RepeatedKFold(n_splits = 10, n_repeats = 10, random_state = 1)
Acc_n_Std = cross_val_score(Entropy, X, Y, scoring='accuracy', cv=CV, n_jobs = -1)
print('Dataset 2 Entropy without pruning & Repeated 10 X 10 folds')
print('Accuracy: ',100*np.mean(Acc_n_Std))
print('Standard Deviation: ',np.std(Acc_n_Std))

Dataset 2 Entropy without pruning & Repeated 10 X 10 folds
Accuracy:  19.825376062786134
Standard Deviation:  0.01909953643048287


In [178]:
Entropy = DecisionTreeClassifier(criterion='entropy', ccp_alpha = 0.015)
CV = RepeatedKFold(n_splits = 10, n_repeats = 10, random_state = 1)
Acc_n_Std = cross_val_score(Entropy, X, Y, scoring='accuracy', cv=CV, n_jobs = -1)
print('Dataset 2 Entropy with pruning & Repeated 10 X 10 folds')
print('Accuracy: ',100*np.mean(Acc_n_Std))
print('Standard Deviation: ',np.std(Acc_n_Std))

Dataset 2 Entropy with pruning & Repeated 10 X 10 folds
Accuracy:  26.078792468417618
Standard Deviation:  0.02121339185161093


In [179]:
Entropy = DecisionTreeClassifier(criterion='entropy')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=5)
Entropy.fit(X_train, Y_train)
predictions = Gini.predict(X_test)
print('Dataset 2 Entropy without pruning & Holdout Approach')
print ("Accuracy : ",accuracy_score(Y_test,predictions)*100)

Dataset 2 Entropy without pruning & Holdout Approach
Accuracy :  20.73365231259968


In [180]:
Entropy = DecisionTreeClassifier(criterion='entropy', ccp_alpha = 0.015)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=5)
Entropy.fit(X_train, Y_train)
predictions = Gini.predict(X_test)
print('Dataset 2 Entropy without pruning & Holdout Approach')
print ("Accuracy : ",accuracy_score(Y_test,predictions)*100)

Dataset 2 Entropy without pruning & Holdout Approach
Accuracy :  20.73365231259968


In [181]:
names = ["Class_Name", "Left_Weight", "Left_Distance", "Right_Weight"," Right_Distance"]
dataset1 = pd.read_csv("balance-scale.data", names=names)
dataset1["Class_Name"].replace({'B':1, 'R':0, 'L':2}, inplace=True)
dataset1 = dataset1[["Left_Weight", "Left_Distance", "Right_Weight"," Right_Distance", "Class_Name"]]
X = dataset1.values[:,0:4]
Y = dataset1.values[:,4]
dataset1

Unnamed: 0,Left_Weight,Left_Distance,Right_Weight,Right_Distance,Class_Name
0,1,1,1,1,1
1,1,1,1,2,0
2,1,1,1,3,0
3,1,1,1,4,0
4,1,1,1,5,0
...,...,...,...,...,...
620,5,5,5,1,2
621,5,5,5,2,2
622,5,5,5,3,2
623,5,5,5,4,2


In [182]:

Gini = DecisionTreeClassifier(criterion='gini')
CV = RepeatedKFold(n_splits = 10, n_repeats = 10, random_state = 1)
Acc_n_Std = cross_val_score(Gini, X, Y, scoring='accuracy', cv=CV, n_jobs = -1)
print("Dataset 3 Gini without pruning & Repeated 10 X 10 folds:")
print('Accuracy: ',100*np.mean(Acc_n_Std))
print('Standard Deviation: ',np.std(Acc_n_Std))

Dataset 3 Gini without pruning & Repeated 10 X 10 folds:
Accuracy:  77.72811059907835
Standard Deviation:  0.05331467080940778


In [183]:

Gini = DecisionTreeClassifier(criterion='gini', ccp_alpha = 0.015)
CV = RepeatedKFold(n_splits = 10, n_repeats = 10, random_state = 1)
Acc_n_Std = cross_val_score(Gini, X, Y, scoring='accuracy', cv=CV, n_jobs = -1)
print("Dataset 3 Gini with pruning & Repeated 10 X 10 folds:")
print('Accuracy: ',100*np.mean(Acc_n_Std))
print('Standard Deviation: ',np.std(Acc_n_Std))

Dataset 3 Gini with pruning & Repeated 10 X 10 folds:
Accuracy:  75.61776753712239
Standard Deviation:  0.05380786761942512


In [184]:
Gini = DecisionTreeClassifier(criterion='gini')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=5)
Gini.fit(X_train, Y_train)
predictions = Gini.predict(X_test)
print('Dataset 3 Gini without pruning & Holdout Approach')
print ("Accuracy : ",accuracy_score(Y_test,predictions)*100)

Dataset 3 Gini without pruning & Holdout Approach
Accuracy :  81.38297872340425


In [185]:
Gini = DecisionTreeClassifier(criterion='gini', ccp_alpha = 0.015)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=5)
Gini.fit(X_train, Y_train)
predictions = Gini.predict(X_test)
print('Dataset 3 Gini with pruning & Holdout Approach')
print ("Accuracy : ",accuracy_score(Y_test,predictions)*100)

Dataset 3 Gini with pruning & Holdout Approach
Accuracy :  76.06382978723404


In [186]:
Entropy = DecisionTreeClassifier(criterion='entropy')
CV = RepeatedKFold(n_splits = 10, n_repeats = 10, random_state = 1)
Acc_n_Std = cross_val_score(Entropy, X, Y, scoring='accuracy', cv=CV, n_jobs = -1)
print('Dataset 3 Entropy without pruning & Repeated 10 X 10 folds')
print('Accuracy: ',100*np.mean(Acc_n_Std))
print('Standard Deviation: ',np.std(Acc_n_Std))

Dataset 3 Entropy without pruning & Repeated 10 X 10 folds
Accuracy:  77.26164874551971
Standard Deviation:  0.05118417103857723


In [187]:
Entropy = DecisionTreeClassifier(criterion='entropy', ccp_alpha = 0.015)
CV = RepeatedKFold(n_splits = 10, n_repeats = 10, random_state = 1)
Acc_n_Std = cross_val_score(Entropy, X, Y, scoring='accuracy', cv=CV, n_jobs = -1)
print('Dataset 3 Entropy with pruning & Repeated 10 X 10 folds')
print('Accuracy: ',100*np.mean(Acc_n_Std))
print('Standard Deviation: ',np.std(Acc_n_Std))

Dataset 3 Entropy with pruning & Repeated 10 X 10 folds
Accuracy:  72.81643625192011
Standard Deviation:  0.04658670012710971


In [188]:
Entropy = DecisionTreeClassifier(criterion='entropy', ccp_alpha = 0.015)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=5)
Entropy.fit(X_train, Y_train)
predictions = Gini.predict(X_test)
print('Dataset 3 Entropy without pruning & Holdout Approach')
print ("Accuracy : ",accuracy_score(Y_test,predictions)*100)

Dataset 3 Entropy without pruning & Holdout Approach
Accuracy :  76.06382978723404


In [189]:
Entropy = DecisionTreeClassifier(criterion='entropy', ccp_alpha = 0.015)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=5)
Entropy.fit(X_train, Y_train)
predictions = Gini.predict(X_test)
print('Dataset 3 Entropy with pruning & Holdout Approach')
print ("Accuracy : ",accuracy_score(Y_test,predictions)*100)

Dataset 3 Entropy with pruning & Holdout Approach
Accuracy :  76.06382978723404
