# Meta Methods applied to the ionosphere data set

In [1]:
# Import libraries 

import numpy as np    # Numeric and matrix computation
import pandas as pd   # Optional: good package for manipulating data 
import sklearn as sk  # Package with learning algorithms implemented

# Fetch the ionosphere table from the UCI archive. The file has no header
# row, so pandas labels the columns with integers (0, 1, 2, ...).
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data"
df = pd.read_csv(filepath_or_buffer=url, header=None)
# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,...,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,g
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,...,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,b
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,...,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,g
3,1,0,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,...,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,b
4,1,0,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,g


In [2]:
# Separate the features from the labels.
# Column 34 holds the class label ('g' / 'b'); columns 0-33 are the features.
# No further preprocessing is applied: the data is numerical and already scaled.

y = df[34].values
# Slice the feature columns *before* converting to NumPy. Calling `df.values`
# on the whole frame would also convert the string label column and therefore
# yield an object-dtype array; the explicit cast gives a plain float matrix
# (and fails loudly if a feature column is unexpectedly non-numeric).
X = df.iloc[:, 0:34].values.astype(float)

## Voting scheme

In [3]:
from sklearn.model_selection import cross_val_score
#from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Number of cross-validation folds; reused by the later experiment cells.
cv=50

clf1 = GaussianNB()

# Tune k-NN with a grid search over odd k in 1..29 and both weighting schemes.
# When `cv` is an integer and the estimator is a classifier, the folds are
# stratified by default.
# NOTE(review): the grid search is fitted on the full data set, so the k-NN
# accuracy reported by the loop below is optimistically biased; nested
# cross-validation would remove that bias at a much higher compute cost.
params = {'n_neighbors':list(range(1,30,2)), 'weights':('distance','uniform')}
knc = KNeighborsClassifier()
clf = GridSearchCV(knc, param_grid=params,cv=cv,n_jobs=-1)
clf.fit(X, y)
print("Best Params fo Knn=",clf.best_params_, "Accuracy=", clf.best_score_)
parval=clf.best_params_
clf2 = KNeighborsClassifier(n_neighbors=parval['n_neighbors'],weights=parval['weights'])

# Fixed seed so the tree (and every ensemble that reuses it) gives the same
# result on each run.
clf3 = DecisionTreeClassifier(criterion='entropy', random_state=0)

# Cross-validated accuracy of each base classifier on its own.
# The loop variable is `model`, not `clf`, so the fitted grid search above is
# not overwritten; the k-NN label is built from the tuned k instead of a
# hard-coded 3.
labels = ['Naive Bayes', 'Knn (%d)' % parval['n_neighbors'], 'Dec. Tree']
for model, label in zip([clf1, clf2, clf3], labels):
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    print("Accuracy: %0.3f [%s]" % (scores.mean(), label))
    

Best Params fo Knn= {'weights': 'distance', 'n_neighbors': 1} Accuracy= 0.863247863248
Accuracy: 0.892 [Naive Bayes]
Accuracy: 0.868 [Knn (3)]
Accuracy: 0.893 [Dec. Tree]


In [4]:
# Hard voting: the three base classifiers each predict a label and the
# majority label is the ensemble's prediction.
eclf = VotingClassifier(
    voting='hard',
    estimators=[('nb', clf1), ('knn3', clf2), ('dt', clf3)],
)
scores = cross_val_score(estimator=eclf, X=X, y=y, scoring='accuracy', cv=cv)
print("Accuracy: %0.3f [%s]" % (scores.mean(), "Majority Voting"))
    

Accuracy: 0.927 [Majority Voting]


In [5]:
# Soft voting: combine the predicted class probabilities of the three base
# classifiers, weighting Naive Bayes and the tree twice as much as k-NN.
eclf = VotingClassifier(
    estimators=[('nb', clf1), ('knn3', clf2), ('dt', clf3)],
    weights=[2,1,2],
    voting='soft',
)
scores = cross_val_score(estimator=eclf, X=X, y=y, scoring='accuracy', cv=cv)
print("Accuracy: %0.3f [%s]" % (scores.mean(), "Weighted Voting"))

Accuracy: 0.924 [Weighted Voting]


## Bagging

In [6]:
from sklearn.ensemble import BaggingClassifier

# Bagged decision trees: cross-validated accuracy for a growing number of
# trees, first with every feature available to each tree (max_features=1.0,
# the BaggingClassifier default) and then with a random 35% of the features.
#
# The base tree is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases; the
# positional form works with both. `random_state` makes the bootstrap samples
# and feature subsets repeatable from run to run.
for run, max_features in enumerate([1.0, 0.35]):
    if run > 0:
        print()  # blank line between the two result tables
    for nest in [1,2,5,10,20,50,100,200]:
        bagging = BaggingClassifier(DecisionTreeClassifier(), n_estimators=nest,
                                    max_features=max_features, random_state=0)
        scores = cross_val_score(bagging, X, y, cv=cv, scoring='accuracy')
        print("Accuracy: %0.3f [%s]" % (scores.mean(), nest))

Accuracy: 0.845 [1]
Accuracy: 0.880 [2]
Accuracy: 0.918 [5]
Accuracy: 0.898 [10]
Accuracy: 0.921 [20]
Accuracy: 0.926 [50]
Accuracy: 0.925 [100]
Accuracy: 0.933 [200]

Accuracy: 0.851 [1]
Accuracy: 0.879 [2]
Accuracy: 0.939 [5]
Accuracy: 0.921 [10]
Accuracy: 0.932 [20]
Accuracy: 0.938 [50]
Accuracy: 0.939 [100]
Accuracy: 0.937 [200]


## Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: cross-validated accuracy for a growing number of trees.
# The forest is seeded so that differences between rows come from the number
# of trees rather than from run-to-run randomness.
for nest in [1,2,5,10,20,50,100,200]:
    forest = RandomForestClassifier(n_estimators=nest, random_state=0)
    scores = cross_val_score(forest, X, y, cv=cv, scoring='accuracy')
    print("Accuracy: %0.3f [%s]" % (scores.mean(), nest))

Accuracy: 0.873 [1]
Accuracy: 0.901 [2]
Accuracy: 0.940 [5]
Accuracy: 0.938 [10]
Accuracy: 0.939 [20]
Accuracy: 0.940 [50]
Accuracy: 0.948 [100]
Accuracy: 0.940 [200]


In [9]:
from sklearn.ensemble import ExtraTreesClassifier

# Extremely randomized trees: cross-validated accuracy for a growing number
# of trees. Seeded so that the reported numbers are repeatable.
for nest in [1,2,5,10,20,50,100,200]:
    extra_trees = ExtraTreesClassifier(n_estimators=nest, random_state=0)
    scores = cross_val_score(extra_trees, X, y, cv=cv, scoring='accuracy')
    print("Accuracy: %0.3f [%s]" % (scores.mean(), nest))

Accuracy: 0.895 [1]
Accuracy: 0.891 [2]
Accuracy: 0.909 [5]
Accuracy: 0.932 [10]
Accuracy: 0.937 [20]
Accuracy: 0.953 [50]
Accuracy: 0.948 [100]
Accuracy: 0.948 [200]


## Boosting

In [8]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with its default base learner: cross-validated accuracy for a
# growing number of boosting rounds. Seeded so that repeated runs report the
# same numbers.
for nest in [1,2,5,10,20,50,100,200]:
    booster = AdaBoostClassifier(n_estimators=nest, random_state=0)
    scores = cross_val_score(booster, X, y, cv=cv, scoring='accuracy')
    print("Accuracy: %0.3f [%s]" % (scores.mean(), nest))

Accuracy: 0.821 [1]
Accuracy: 0.896 [2]
Accuracy: 0.906 [5]
Accuracy: 0.911 [10]
Accuracy: 0.917 [20]
Accuracy: 0.947 [50]
Accuracy: 0.942 [100]
Accuracy: 0.937 [200]


In [9]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over depth-5 decision trees instead of the default base learner:
# cross-validated accuracy for a growing number of boosting rounds. Seeded so
# that repeated runs report the same numbers.
for nest in [1,2,5,10,20,50,100,200]:
    booster = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5),
                                 n_estimators=nest, random_state=0)
    scores = cross_val_score(booster, X, y, cv=cv, scoring='accuracy')
    print("Accuracy: %0.3f [%s]" % (scores.mean(), nest))

Accuracy: 0.868 [1]
Accuracy: 0.892 [2]
Accuracy: 0.919 [5]
Accuracy: 0.922 [10]
Accuracy: 0.945 [20]
Accuracy: 0.940 [50]
Accuracy: 0.942 [100]
Accuracy: 0.938 [200]
