In [1]:
# Numerical / tabular / plotting stack used throughout the notebook.
import numpy as np 
# Print arrays of up to 10000 elements in full and avoid scientific notation for floats.
np.set_printoptions(threshold=10000,suppress=True) 
import pandas as pd
import warnings
import matplotlib.pyplot as plt 
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold,cross_val_score
import time
import xgboost as xgb
from sklearn.preprocessing import StandardScaler


# Candidate classifiers, keyed by the short name printed in the results.
# Every stochastic estimator is seeded (random_state=1) so runs are repeatable.
clfs = {
    'RF': RandomForestClassifier(n_estimators=100, random_state=1),
    # The base tree is passed positionally on purpose: the keyword was called
    # `base_estimator` in older scikit-learn releases and `estimator` in newer
    # ones, whereas the first positional slot is the base estimator in both.
    'BAG': BaggingClassifier(DecisionTreeClassifier(random_state=1), n_estimators=100, random_state=1),
    'ADA': AdaBoostClassifier(n_estimators=100, random_state=1),
    'ET': ExtraTreesClassifier(n_estimators=100, random_state=1),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    # Single decision trees: Gini split criterion, entropy split criterion,
    # and a depth-1 tree (decision stump).
    'CART': DecisionTreeClassifier(criterion='gini', random_state=1),
    'ID3': DecisionTreeClassifier(criterion='entropy', random_state=1),
    'Stumb': DecisionTreeClassifier(criterion='gini', max_depth=1, random_state=1),
    # Multi-layer perceptron with two hidden layers (20 then 10 units).
    'MLP': MLPClassifier(hidden_layer_sizes=(20, 10), random_state=1),
}


def run_classifieurs(X, Y, clfs, n_splits=10, random_state=1):
    """Cross-validate each classifier in ``clfs`` and print one summary line per model.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to ``cross_val_score``.
    Y : array-like
        Target labels handed to ``cross_val_score``.
    clfs : dict
        Maps a display name to an estimator; entries are evaluated in
        dictionary order.
    n_splits : int, default 10
        Number of folds of the shuffled ``KFold`` splitter.
    random_state : int, default 1
        Seed of the ``KFold`` shuffle; the same splitter is reused for every
        classifier.

    Returns
    -------
    None
        Results are only printed: mean and standard deviation of the per-fold
        accuracy, followed by the elapsed wall-clock time in seconds.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for name, clf in clfs.items():
        # perf_counter is a monotonic, high-resolution clock meant for measuring durations.
        debut = time.perf_counter()
        cv_acc = cross_val_score(clf, X, Y, cv=kf, scoring='accuracy')
        fin = time.perf_counter()
        print("Accuracy for {0} is: {1:.3f} +/- {2:.3f} en {3:.3f}s".format(
            name, np.mean(cv_acc), np.std(cv_acc), fin - debut))

## Application sur des données complexes (Numériques et catégorielles)

In [3]:
# Read the tab-separated credit file; it has no header row, so columns are
# labelled by position.
data_credit = pd.read_csv(
    './credit.data',
    sep='\t',
    header=None,
)
# Preview the first three rows.
data_credit.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+


In [4]:
# Positional split: the columns before `label_col` are the features,
# column `label_col` holds the class label.
label_col = 15
X = data_credit.iloc[:, :label_col].values
Y = data_credit.iloc[:, label_col].values

In [5]:
# Encode the class labels as integers: '+' -> 1, '-' -> 0.
# Work on a private object-dtype copy first: Y comes from `.values`, which may
# share memory with `data_credit` (writing into it would silently alter the
# DataFrame) or be read-only under pandas Copy-on-Write.
Y = Y.astype(object)
Y[Y == '+'] = 1
Y[Y == '-'] = 0
# Any value other than '+' / '-' is left as-is and makes this conversion fail loudly.
Y = Y.astype(int)

In [6]:
# Positions of the numeric feature columns among the 15 features.
num_cols = [1, 2, 7, 10, 13, 14]
# Every remaining feature position is treated as categorical.
cat_col = [idx for idx in range(15) if idx not in num_cols]

In [7]:
# Numeric-only view of the data, keeping just the rows with no missing value.
X_num = X[:, num_cols]  # indexing with a list yields a copy, so X itself is not modified
X_num[X_num == '?'] = np.nan  # '?' marks a missing entry
X_num = X_num.astype(float)
# Compute the "row is complete" mask once and apply it to labels and features alike.
rows_complete = ~np.isnan(X_num).any(axis=1)
Y_num = Y[rows_complete]
X_num = X_num[rows_complete]

In [8]:
# Sanity check: (rows kept after dropping incomplete samples, number of numeric columns).
X_num.shape

(666, 6)

In [9]:
# Baseline: cross-validate every classifier on the raw (unscaled) numeric features.
run_classifieurs(X_num,Y_num,clfs)

Accuracy for RF is: 0.778 +/- 0.036 en 1.104s
Accuracy for BAG is: 0.779 +/- 0.028 en 1.521s
Accuracy for ADA is: 0.787 +/- 0.040 en 0.933s
Accuracy for ET is: 0.782 +/- 0.040 en 0.817s
Accuracy for KNN is: 0.700 +/- 0.060 en 0.027s
Accuracy for CART is: 0.719 +/- 0.066 en 0.019s
Accuracy for ID3 is: 0.700 +/- 0.040 en 0.023s
Accuracy for Stumb is: 0.743 +/- 0.042 en 0.008s
Accuracy for MLP is: 0.683 +/- 0.036 en 2.801s


In [10]:
# Standardize the numeric features with StandardScaler.
# NOTE(review): the scaler is fitted on all rows before cross-validation, so the
# statistics used to scale each CV test fold also depend on that fold; fitting the
# scaler inside the CV loop (e.g. via a Pipeline) would avoid this — confirm whether it matters here.
SS=StandardScaler()
X_num_norm=SS.fit_transform(X_num)

In [11]:
# Same evaluation on the standardized numeric features, for comparison with the raw run.
run_classifieurs(X_num_norm,Y_num,clfs)

Accuracy for RF is: 0.779 +/- 0.038 en 1.070s
Accuracy for BAG is: 0.779 +/- 0.028 en 1.455s
Accuracy for ADA is: 0.787 +/- 0.040 en 0.943s
Accuracy for ET is: 0.782 +/- 0.040 en 0.826s
Accuracy for KNN is: 0.755 +/- 0.045 en 0.024s
Accuracy for CART is: 0.721 +/- 0.065 en 0.020s
Accuracy for ID3 is: 0.700 +/- 0.040 en 0.024s
Accuracy for Stumb is: 0.743 +/- 0.042 en 0.009s
Accuracy for MLP is: 0.791 +/- 0.045 en 11.318s


In [12]:
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.preprocessing import OneHotEncoder

In [13]:
# Categorical pipeline: integer-encode -> impute missing values -> one-hot encode.
X_new_cat = np.copy(X[:, cat_col])

# Integer-encode each column: np.unique returns the sorted distinct values and,
# for every row, the position of its value in that sorted list.
# A column is flagged as "has missing values" only when its first sorted value
# is '?', i.e. exactly when code 0 really stands for the missing marker.
cols_with_missing = []
for col_id in range(len(cat_col)):
    unique_val, val_idx = np.unique(X_new_cat[:, col_id], return_inverse=True)
    X_new_cat[:, col_id] = val_idx
    if unique_val.size > 0 and unique_val[0] == '?':
        cols_with_missing.append(col_id)

# Code 0 is the missing marker in the flagged columns; replace it with the most frequent code.
imp_cat = SimpleImputer(missing_values=0, strategy='most_frequent')
# Alternative imputer: imp_cat = KNNImputer(missing_values=0, n_neighbors=3)

# Impute only the flagged columns: in any other column code 0 is a genuine
# category and must not be overwritten.
if cols_with_missing:
    X_new_cat[:, cols_with_missing] = imp_cat.fit_transform(X_new_cat[:, cols_with_missing])
X_new_cat_bin = OneHotEncoder().fit_transform(X_new_cat).toarray()
X_new_cat_bin.shape

(688, 40)

In [14]:
# Numeric pipeline on all rows: mark missing -> cast to float -> mean-impute -> standardize.
X_new_num = np.copy(X[:, num_cols])
missing_mask = X_new_num == '?'
X_new_num[missing_mask] = np.nan
X_new_num = X_new_num.astype(float)
imp_num = SimpleImputer(missing_values=np.nan, strategy='mean')
# Alternative imputer: imp_num = KNNImputer(missing_values=np.nan, n_neighbors=3)
X_new_num = StandardScaler().fit_transform(imp_num.fit_transform(X_new_num))
X_new_num.shape


(688, 6)

In [15]:
# Final design matrix: one-hot categorical block followed by the numeric block, side by side.
feature_blocks = [X_new_cat_bin, X_new_num]
X_final = np.concatenate(feature_blocks, axis=1)

In [16]:
# Evaluate every classifier on the combined categorical + numeric features (all rows, full label vector).
run_classifieurs(X_final,Y,clfs)

Accuracy for RF is: 0.874 +/- 0.039 en 1.055s
Accuracy for BAG is: 0.868 +/- 0.045 en 2.060s
Accuracy for ADA is: 0.839 +/- 0.042 en 1.113s
Accuracy for ET is: 0.866 +/- 0.043 en 0.848s
Accuracy for KNN is: 0.845 +/- 0.050 en 0.045s
Accuracy for CART is: 0.805 +/- 0.041 en 0.027s
Accuracy for ID3 is: 0.807 +/- 0.026 en 0.031s
Accuracy for Stumb is: 0.856 +/- 0.032 en 0.010s
Accuracy for MLP is: 0.855 +/- 0.046 en 20.080s
