In [1]:
from scipy.io import arff
import pandas as pd

In [3]:
# Load the ARFF dataset from disk; the result is indexed with [0] in the next
# cell to obtain the records that become the DataFrame.
data = arff.loadarff(r"../artificial/2d-3c-no123.arff")

In [4]:
# Wrap the loaded records in a DataFrame (columns a0, a1, class).
# Note: the `class` column holds bytes values such as b'0' at this point.
df = pd.DataFrame(data[0])
df

Unnamed: 0,a0,a1,class
0,-0.499261,-0.061236,b'0'
1,-1.513690,0.265446,b'0'
2,-1.603210,0.362039,b'0'
3,-1.699280,-0.384246,b'0'
4,-2.299290,-0.358038,b'0'
...,...,...,...
710,1.763780,0.291525,b'2'
711,0.534542,-0.221663,b'2'
712,1.302740,0.366262,b'2'
713,0.016068,0.213499,b'2'


In [5]:
# Persist the frame to data.csv without the index column. The bytes labels are
# written as their text form (e.g. b'0'), which is what gets read back later.
df.to_csv('data.csv', index = False)

In [6]:
# Reload the CSV written above; `class` now contains plain text like "b'0'"
# that still needs to be cleaned up before modelling.
df1 = pd.read_csv('data.csv')
df1

Unnamed: 0,a0,a1,class
0,-0.499261,-0.061236,b'0'
1,-1.513690,0.265446,b'0'
2,-1.603210,0.362039,b'0'
3,-1.699280,-0.384246,b'0'
4,-2.299290,-0.358038,b'0'
...,...,...,...
710,1.763780,0.291525,b'2'
711,0.534542,-0.221663,b'2'
712,1.302740,0.366262,b'2'
713,0.016068,0.213499,b'2'


In [7]:
# Strip the b'...' wrapper left by the CSV round-trip and keep only the text
# between the quotes (e.g. "b'0'" -> "0"). The vectorized .str accessor replaces
# the per-row lambda and leaves missing labels as NaN instead of raising.
df1['corrected_class'] = df1['class'].str.split("'").str[1]
df1

Unnamed: 0,a0,a1,class,corrected_class
0,-0.499261,-0.061236,b'0',0
1,-1.513690,0.265446,b'0',0
2,-1.603210,0.362039,b'0',0
3,-1.699280,-0.384246,b'0',0
4,-2.299290,-0.358038,b'0',0
...,...,...,...,...
710,1.763780,0.291525,b'2',2
711,0.534542,-0.221663,b'2',2
712,1.302740,0.366262,b'2',2
713,0.016068,0.213499,b'2',2


In [8]:
# Keep only the features and the cleaned label; the raw `class` text is no longer needed.
df2 = df1.drop(columns='class')
df2

Unnamed: 0,a0,a1,corrected_class
0,-0.499261,-0.061236,0
1,-1.513690,0.265446,0
2,-1.603210,0.362039,0
3,-1.699280,-0.384246,0
4,-2.299290,-0.358038,0
...,...,...,...
710,1.763780,0.291525,2
711,0.534542,-0.221663,2
712,1.302740,0.366262,2
713,0.016068,0.213499,2


In [9]:
# Feature matrix: everything except the label column.
input_set = df2.drop(columns='corrected_class')
input_set

Unnamed: 0,a0,a1
0,-0.499261,-0.061236
1,-1.513690,0.265446
2,-1.603210,0.362039
3,-1.699280,-0.384246
4,-2.299290,-0.358038
...,...,...
710,1.763780,0.291525
711,0.534542,-0.221663
712,1.302740,0.366262
713,0.016068,0.213499


In [10]:
# Target vector: the cleaned class label (stored as strings, dtype object).
target_set = df2['corrected_class']
target_set

0      0
1      0
2      0
3      0
4      0
      ..
710    2
711    2
712    2
713    2
714    2
Name: corrected_class, Length: 715, dtype: object

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Split the data in half: X / Y are the training features / labels and
# x / y are the held-out features / labels. A fixed random_state makes the
# split (and every score computed from it below) reproducible across re-runs.
X, x, Y, y = train_test_split(input_set, target_set, train_size=0.5, random_state=42)

In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
# Baseline classifier; max_iter is set explicitly to 500
# (presumably to give the solver enough iterations to converge).
model = LogisticRegression(max_iter=500)

In [15]:
# Fit on the training half (X, Y) produced by train_test_split.
model.fit(X,Y)

In [16]:
# Mean accuracy on the held-out half (x, y).
model.score(x,y)

0.9972067039106145

# Using Cross-validation 

In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [18]:
from sklearn.model_selection import cross_val_score

In [19]:
# Cross-validate logistic regression on the full dataset (one score per fold).
lr = cross_val_score(
    estimator=LogisticRegression(max_iter=500),
    X=input_set,
    y=target_set,
)
lr

array([0.98601399, 0.99300699, 1.        , 1.        , 0.99300699])

In [20]:
# Cross-validate a decision tree on the full dataset. The tree is seeded so the
# per-fold scores do not change between re-runs of the notebook.
dt = cross_val_score(DecisionTreeClassifier(random_state=42), input_set, target_set)
dt

array([0.98601399, 0.97902098, 0.97202797, 0.98601399, 0.99300699])

In [21]:
# Cross-validate a support-vector classifier with default settings on the full dataset.
sv = cross_val_score(
    estimator=SVC(),
    X=input_set,
    y=target_set,
)
sv

array([0.98601399, 0.98601399, 1.        , 1.        , 0.99300699])

In [22]:
# Cross-validate a 40-tree random forest on the full dataset. The forest is
# seeded so the per-fold scores do not change between re-runs of the notebook.
rf = cross_val_score(
    RandomForestClassifier(n_estimators=40, random_state=42),
    input_set,
    target_set,
)
rf

array([0.99300699, 0.97902098, 1.        , 1.        , 0.99300699])

In [23]:
import numpy as np

In [24]:
# Mean cross-validation accuracy for logistic regression.
np.mean(lr)

0.9944055944055945

In [25]:
# Mean cross-validation accuracy for the decision tree.
np.mean(dt)

0.9832167832167833

In [26]:
# Mean cross-validation accuracy for the support-vector classifier.
np.mean(sv)

0.993006993006993

In [27]:
# Mean cross-validation accuracy for the random forest.
np.mean(rf)

0.993006993006993

# Best score is from Logistic Regression Classifier which is 99.44%

Note, however, that increasing `train_size` (equivalently, decreasing `test_size`) and fitting the model on the resulting training set with `model.fit` can lead to an exceptional score of 1.

# USING GridSearchCV

GridSearchCV on individual estimators

In [28]:
from sklearn.model_selection import GridSearchCV

In [29]:
# Exhaustive 10-fold search over logistic-regression settings, fit on the training half.
# NOTE(review): random_state is a seed rather than a model hyperparameter; it is
# kept in the grid because the results table below reports it per candidate.
clf1 = GridSearchCV(
    estimator=LogisticRegression(max_iter=500),
    param_grid={
        'fit_intercept': [True, False],
        'random_state': [1, 10, 30, 50, 100],
        'solver': ['liblinear', 'saga', 'sag'],
    },
    cv=10,
)
clf1.fit(X, Y)

In [30]:
# Tabulate every candidate tried by the logistic-regression search and show
# only the searched parameters next to their mean cross-validated score.
df3 = pd.DataFrame(clf1.cv_results_)
df3.loc[:, ['param_fit_intercept', 'param_random_state', 'param_solver', 'mean_test_score']]

Unnamed: 0,param_fit_intercept,param_random_state,param_solver,mean_test_score
0,True,1,liblinear,0.994286
1,True,1,saga,0.991587
2,True,1,sag,0.991587
3,True,10,liblinear,0.994286
4,True,10,saga,0.991587
5,True,10,sag,0.991587
6,True,30,liblinear,0.994286
7,True,30,saga,0.991587
8,True,30,sag,0.991587
9,True,50,liblinear,0.994286


In [31]:
# Logistic-regression estimator configured with the best parameters found by the search.
clf1.best_estimator_

In [32]:
# Mean cross-validated score of the best logistic-regression candidate.
clf1.best_score_

0.9942857142857143

In [33]:
# Exhaustive 10-fold search over decision-tree settings, fit on the training half.
clf2 = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid={
        'random_state': [1, 5, 10, 30, 50, 100],
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'min_samples_split': [10, 50, 100],
    },
    cv=10,
)
clf2.fit(X, Y)

In [34]:
# Tabulate every candidate tried by the decision-tree search and show only the
# searched parameters next to their mean cross-validated score.
df4 = pd.DataFrame(clf2.cv_results_)
df4.loc[:, [
    'param_random_state',
    'param_criterion',
    'param_splitter',
    'param_min_samples_split',
    'mean_test_score',
]]

Unnamed: 0,param_random_state,param_criterion,param_splitter,param_min_samples_split,mean_test_score
0,1,gini,best,10,0.980317
1,1,gini,random,10,0.969127
2,5,gini,best,10,0.983175
3,5,gini,random,10,0.974683
4,10,gini,best,10,0.980317
...,...,...,...,...,...
67,30,entropy,random,100,0.876746
68,50,entropy,best,100,0.969048
69,50,entropy,random,100,0.910476
70,100,entropy,best,100,0.969048


In [35]:
# Decision-tree estimator configured with the best parameters found by the search.
clf2.best_estimator_

In [36]:
# Mean cross-validated score of the best decision-tree candidate.
clf2.best_score_

0.9831746031746033

In [37]:
# Exhaustive 10-fold search over SVC regularisation strength and kernel, fit on the training half.
clf3 = GridSearchCV(
    estimator=SVC(gamma='auto'),
    param_grid={
        'C': [10, 20, 50, 100, 150],
        'kernel': ['rbf', 'linear'],
    },
    cv=10,
)
clf3.fit(X, Y)

In [38]:
# Tabulate every candidate tried by the SVC search and show only the searched
# parameters next to their mean cross-validated score.
df5 = pd.DataFrame(clf3.cv_results_)
df5.loc[:, ['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,10,rbf,0.991587
1,10,linear,0.991587
2,20,rbf,0.98873
3,20,linear,0.98873
4,50,rbf,0.991587
5,50,linear,0.991587
6,100,rbf,0.991587
7,100,linear,0.991587
8,150,rbf,0.991587
9,150,linear,0.991587


In [39]:
# SVC estimator configured with the best parameters found by the search.
clf3.best_estimator_

In [40]:
# Mean cross-validated score of the best SVC candidate.
clf3.best_score_

0.9915873015873016

In [41]:
# Exhaustive 10-fold search over the number of trees, fit on the training half.
# The forest is seeded so that differences between candidates reflect
# n_estimators rather than run-to-run randomness.
clf4 = GridSearchCV(
    RandomForestClassifier(random_state=42),
    {
        'n_estimators': [1, 10, 30, 50, 100],
    },
    cv=10,
)
clf4.fit(X, Y)

In [42]:
# Tabulate every candidate tried by the random-forest search and show only the
# searched parameter next to its mean cross-validated score.
df6 = pd.DataFrame(clf4.cv_results_)
df6.loc[:, ['param_n_estimators', 'mean_test_score']]

Unnamed: 0,param_n_estimators,mean_test_score
0,1,0.963571
1,10,0.985952
2,30,0.98873
3,50,0.991587
4,100,0.98873


In [43]:
# Mean cross-validated score of the best random-forest candidate.
clf4.best_score_

0.9915873015873016

In [44]:
# Random-forest estimator configured with the best parameters found by the search.
clf4.best_estimator_

In [45]:
# Parameter combination that achieved the best mean cross-validated score.
clf4.best_params_

{'n_estimators': 50}

In [46]:
# Row position of the best candidate within cv_results_.
clf4.best_index_

3

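GridSearchCV on the individual estimators, fit with the training input and target sets, gives Logistic Regression as the best estimator in the run shown above, with a score of 99.43%; SVC and Random Forest both follow at 99.16%, and Decision Tree at 98.32%.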

GridSearchCV WITHOUT Pipeline

In [47]:
from sklearn.model_selection import ShuffleSplit

In [48]:
def best_model_using_gridsearchCV(X, Y, random_state=None):
    """Grid-search four classifiers on (X, Y) and tabulate each one's best result.

    Logistic regression, a decision tree, an SVC and a random forest are each
    tuned with ``GridSearchCV`` over their own parameter grid. Every search uses
    a ``ShuffleSplit`` with 10 random splits that hold out 25% of the rows.

    Parameters
    ----------
    X
        Feature matrix, passed unchanged to ``GridSearchCV.fit``.
    Y
        Target labels, passed unchanged to ``GridSearchCV.fit``.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to ``ShuffleSplit``. The default keeps the previous
        behaviour (new random splits on every call); pass an integer to make
        the splits, and therefore the reported scores, reproducible.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm with the columns ``model``, ``best_score`` and
        ``best_parameter``.
    """
    algos = {
        # The estimator is LogisticRegression, so the label says so
        # (it previously read 'linear regression').
        'logistic regression': {
            'model': LogisticRegression(max_iter=500),
            'parameters': {
                'fit_intercept': [True, False],
                'random_state': [1, 10, 30, 50, 100],
                'solver': ['liblinear', 'saga', 'sag'],
            },
        },
        'decision tree': {
            'model': DecisionTreeClassifier(),
            'parameters': {
                'random_state': [1, 5, 10, 30, 50, 100],
                'criterion': ['gini', 'entropy'],
                'splitter': ['best', 'random'],
                'min_samples_split': [10, 50, 100],
            },
        },
        'svc': {
            'model': SVC(),
            'parameters': {
                'C': [10, 20, 50, 100, 150],
                'kernel': ['rbf', 'linear'],
            },
        },
        'random forest': {
            'model': RandomForestClassifier(),
            'parameters': {
                'random_state': [5, 10],
                'criterion': ['gini', 'entropy'],
                'n_estimators': [1, 5, 10],
            },
        },
    }

    # One splitter configuration shared by every search.
    cv = ShuffleSplit(n_splits=10, test_size=0.25, random_state=random_state)

    scores = []
    for algos_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['parameters'], cv=cv, return_train_score=False)
        gs.fit(X, Y)
        scores.append({
            'model': algos_name,
            'best_score': gs.best_score_,
            'best_parameter': gs.best_params_,
        })

    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_parameter'])

In [49]:
# Compare the four tuned classifiers on the full dataset.
best_model_using_gridsearchCV(input_set, target_set)

Unnamed: 0,model,best_score,best_parameter
0,linear regression,0.995531,"{'fit_intercept': False, 'random_state': 1, 's..."
1,decision tree,0.989385,"{'criterion': 'gini', 'min_samples_split': 10,..."
2,svc,0.996648,"{'C': 10, 'kernel': 'rbf'}"
3,random forest,0.991061,"{'criterion': 'gini', 'n_estimators': 10, 'ran..."


GridSearchCV on total input and target set gives SVC as best estimator with score 99.66%

In [50]:
# Compare the four tuned classifiers on the training half only.
best_model_using_gridsearchCV(X, Y)

Unnamed: 0,model,best_score,best_parameter
0,linear regression,0.993333,"{'fit_intercept': True, 'random_state': 1, 'so..."
1,decision tree,0.98,"{'criterion': 'gini', 'min_samples_split': 10,..."
2,svc,0.988889,"{'C': 10, 'kernel': 'rbf'}"
3,random forest,0.985556,"{'criterion': 'gini', 'n_estimators': 5, 'rand..."


GridSearchCV on the training input and target sets gives Logistic Regression as the best estimator in the run shown above, with a score of 99.33%; SVC follows at 98.89%.

In [51]:
# Compare the four tuned classifiers on the held-out half only.
best_model_using_gridsearchCV(x,y)

Unnamed: 0,model,best_score,best_parameter
0,linear regression,1.0,"{'fit_intercept': False, 'random_state': 1, 's..."
1,decision tree,0.982222,"{'criterion': 'gini', 'min_samples_split': 10,..."
2,svc,0.998889,"{'C': 10, 'kernel': 'linear'}"
3,random forest,0.987778,"{'criterion': 'entropy', 'n_estimators': 10, '..."


GridSearchCV on the testing input and target sets gives Logistic Regression as the best estimator in the run shown above, with a score of 100%; SVC is a close second at 99.89%.

GridSearchCV WITH Pipeline

In [52]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import classification_report

In [53]:
# Candidate models and their search grids for the pipeline-based search.
# Each grid key is prefixed with the lower-cased estimator class name followed
# by a double underscore, so the parameter is routed to that step of a pipeline
# built with make_pipeline.
model_params = {
    'logistic_regression': {
        # multi_class is left at its default ('auto'), which is the value that
        # was previously passed explicitly; recent scikit-learn releases
        # deprecate the argument, so omitting it avoids the warning.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'logisticregression__fit_intercept': [True, False],
            'logisticregression__random_state': [1, 10, 30, 50, 100],
            'logisticregression__solver': ['liblinear', 'saga', 'sag'],
        },
    },
    'decision tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'decisiontreeclassifier__random_state': [1, 5, 10, 30, 50, 100],
            'decisiontreeclassifier__criterion': ['gini', 'entropy'],
            'decisiontreeclassifier__splitter': ['best', 'random'],
            'decisiontreeclassifier__min_samples_split': [10, 50, 100],
        },
    },
    'svm': {
        'model': SVC(gamma='auto', probability=True),
        'params': {
            'svc__C': [10, 20, 50, 100, 150],
            'svc__kernel': ['rbf', 'linear'],
        },
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'randomforestclassifier__n_estimators': [1, 5, 10],
            'randomforestclassifier__random_state': [5, 10],
            'randomforestclassifier__criterion': ['gini', 'entropy'],
        },
    },
}

In [54]:
import pandas as pd

# Tune every candidate inside a scaling pipeline: collect one summary row per
# algorithm and keep each tuned pipeline under its algorithm name.
scores, best_estimators = [], {}
for algo, mp in model_params.items():
    # Standardise the features, then hand them to the candidate model.
    pipe = make_pipeline(StandardScaler(), mp['model'])
    clf = GridSearchCV(
        estimator=pipe,
        param_grid=mp['params'],
        cv=2,
        return_train_score=False,
    )
    clf.fit(X, Y)
    best_estimators[algo] = clf.best_estimator_
    scores.append(dict(
        model=algo,
        best_score=clf.best_score_,
        best_params=clf.best_params_,
    ))

# NOTE(review): this rebinds `df`, the name that held the raw ARFF frame in an
# earlier cell; re-running earlier cells after this one will see the scores table.
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,logistic_regression,0.991604,"{'logisticregression__fit_intercept': True, 'l..."
1,decision tree,0.983209,"{'decisiontreeclassifier__criterion': 'gini', ..."
2,svm,0.988795,"{'svc__C': 10, 'svc__kernel': 'rbf'}"
3,random_forest,0.986018,"{'randomforestclassifier__criterion': 'gini', ..."


GridSearchCV with a pipeline on the training input and target sets gives Logistic Regression as the best estimator in the run shown above, with a score of 99.16%; SVC follows at 98.88%.