# Voting Classifier

## Hard/Soft Voting 

In [6]:
import pandas as pd

In [8]:
data = pd.read_csv('Data.csv')
data.head()


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [9]:
names = list(data.columns)[3:-1]
names

['CreditScore',
 'Geography',
 'Gender',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary']

In [10]:
X = data.iloc[ : , 3:-1].values
y = data.iloc[ : , -1].values

In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# X[ : , 1] = le.fit_transform(X[ : , 1])
X[ : , 2] = le.fit_transform(X[ : , 2])

In [12]:
# import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers = [('encoders', OneHotEncoder() , [1])], remainder = 'passthrough')

In [13]:
X = ct.fit_transform(X)

In [14]:
X

array([[1.0, 0.0, 0.0, ..., 1, 1, 101348.88],
       [0.0, 0.0, 1.0, ..., 0, 1, 112542.58],
       [1.0, 0.0, 0.0, ..., 1, 0, 113931.57],
       ...,
       [1.0, 0.0, 0.0, ..., 0, 1, 42085.58],
       [0.0, 1.0, 0.0, ..., 1, 0, 92888.52],
       [1.0, 0.0, 0.0, ..., 1, 0, 38190.78]], dtype=object)

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)


In [16]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [17]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [18]:
lg_clf = LogisticRegression()
rnd_clf = RandomForestClassifier(n_estimators = 500)
# svm_clf = SVC()                   # for Hard voting
svm_clf = SVC(probability = True)   # for soft voting

In [19]:
voting_clf = VotingClassifier(
    estimators  = [('lr', lg_clf), ('rf', rnd_clf), ('svm', svm_clf)],
#     voting = 'hard'
    voting = 'soft'
    
)

In [20]:
voting_clf.fit(X_train, y_train)




VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini',...
                                        

In [21]:
from sklearn.metrics import accuracy_score

clf_models = [lg_clf, rnd_clf, svm_clf, voting_clf]

for clf in clf_models:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))



LogisticRegression 0.812
RandomForestClassifier 0.871
SVC 0.862




VotingClassifier 0.8643333333333333


In [22]:
t = [[0.0, 0.0, 1.0, 619, 0, 32, 4, 175406.13, 2, 1, 1, 172792.43]]
print(voting_clf.predict_proba(t))

[[0.24558953 0.75441047]]


# Feature importance

In [23]:
rnd_clf.fit(X, y)
dct = {}
for name, score in zip(data[names], rnd_clf.feature_importances_):
    dct[name] = score
    


In [24]:
dct = sorted(dct.items(), key = lambda x:x[1] , reverse = True )
dct

[('Balance', 0.23886207856042824),
 ('Age', 0.14373192276360008),
 ('HasCrCard', 0.14256878387757402),
 ('IsActiveMember', 0.13001640894558514),
 ('NumOfProducts', 0.08262305645730857),
 ('Geography', 0.020521466716776033),
 ('Tenure', 0.018492897779050336),
 ('EstimatedSalary', 0.018245399303933065),
 ('CreditScore', 0.01021138940309596),
 ('Gender', 0.008917204770716352)]