# Ensemble/Voting Classification in Python with Scikit-Learn
ref：https://www.kaggle.com/c/titanic/submit

In [1]:
import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, log_loss
from sklearn.model_selection import train_test_split, KFold, cross_val_score

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier

In [4]:
training_data = pd.read_csv('/content/train.csv',index_col=0)
testing_data = pd.read_csv('/content/test.csv',index_col=0)
def get_nulls(training, testing):
    print("Training Data:")
    print(pd.isnull(training).sum())
    print("Testing Data:")
    print(pd.isnull(testing).sum())

get_nulls(training_data, testing_data)

Training Data:
Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64
Testing Data:
Pclass        0
Name          0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64


In [5]:
# Drop the cabin column, as there are too many missing values
# Drop the ticket numbers too, as there are too many categories
# Drop names as they won't really help predict survivors
training_data.drop('Cabin', axis=1, inplace=True)
training_data.drop('Ticket', axis=1, inplace=True)
training_data.drop('Name', axis=1, inplace=True)


# Taking the mean/average value would be impacted by the skew
# so we should use the median value to impute missing values
training_data["Age"].fillna(training_data["Age"].median(),inplace=True)


get_nulls(training_data, testing_data)

Training Data:
Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64
Testing Data:
Pclass        0
Name          0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64


In [6]:
# Fit the encoder on the data (Feature: Sex)
encoder_1 = LabelEncoder()
encoder_1.fit(training_data["Sex"])

# Transform and replace training data
training_sex_encoded = encoder_1.transform(training_data["Sex"])
training_data["Sex"] = training_sex_encoded
test_sex_encoded = encoder_1.transform(testing_data["Sex"])
testing_data["Sex"] = test_sex_encoded

# Fit the encoder on the data (Feature: Embarked)


# Any value we want to reshape needs be turned into array first
ages_train = np.array(training_data["Age"]).reshape(-1, 1)


# Scaler takes arrays
scaler = StandardScaler()

training_data["Age"] = scaler.fit_transform(ages_train)


In [9]:
# Now to select our training/testing data
X_features = training_data.drop(labels=['Survived'], axis=1)
y_labels = training_data['Survived']

print(X_features.head(5))
print(y_labels.head(5))

# Make the train/test data for validation
X_train, X_val, y_train, y_val = train_test_split(X_features, y_labels, test_size=0.1, random_state=12)

             Pclass  Sex       Age  SibSp  Parch     Fare Embarked
PassengerId                                                       
1                 3    1 -0.565736      1      0   7.2500        S
2                 1    0  0.663861      1      0  71.2833        C
3                 3    0 -0.258337      0      0   7.9250        S
4                 1    0  0.433312      1      0  53.1000        S
5                 3    1  0.433312      0      0   8.0500        S
PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64


## Simple Averaging Approach

In [12]:
# Encode categorical columns 'Sex' using Label Encoding
label_encoder = LabelEncoder()
training_data['Sex'] = label_encoder.fit_transform(training_data['Sex'])

# One-hot encode the 'Embarked' column
training_data = pd.get_dummies(training_data, columns=['Embarked'], prefix=['Embarked'])

# Now to select our training/testing data
X_features = training_data.drop(labels=['Survived'], axis=1)  # Exclude 'Survived' column for features
y_labels = training_data['Survived']

# Make the train/test data for validation
X_train, X_val, y_train, y_val = train_test_split(X_features, y_labels, test_size=0.1, random_state=12)
# Initialize classifiers
LogReg_clf = LogisticRegression()
DTree_clf = DecisionTreeClassifier()
SVC_clf = SVC()

# Fit classifiers
LogReg_clf.fit(X_train, y_train)
DTree_clf.fit(X_train, y_train)
SVC_clf.fit(X_train, y_train)

# Predictions
LogReg_pred = LogReg_clf.predict(X_val)
DTree_pred = DTree_clf.predict(X_val)
SVC_pred = SVC_clf.predict(X_val)

# Averaged predictions (you can modify the averaging method as needed)
averaged_preds = (LogReg_pred + DTree_pred + SVC_pred) // 3

# Calculate accuracy
acc = accuracy_score(y_val, averaged_preds)
print(acc)

0.7222222222222222


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Bagging Classification Example

In [16]:
from sklearn.model_selection import KFold, cross_val_score

def bagging_ensemble(model, X_train, y_train, n_splits=20, random_state=12):
    k_folds = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    results = cross_val_score(model, X_train, y_train, cv=k_folds)
    mean_accuracy = results.mean()
    print("Mean Accuracy:", mean_accuracy)
    return mean_accuracy
mean_accuracy_logreg = bagging_ensemble(LogReg_clf, X_train, y_train)
mean_accuracy_dtree = bagging_ensemble(DTree_clf, X_train, y_train)
mean_accuracy_svc = bagging_ensemble(SVC_clf, X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Mean Accuracy: 0.7964634146341464
Mean Accuracy: 0.7876524390243903
Mean Accuracy: 0.6815548780487807


## Boosting Classification Example

In [17]:
k_folds = KFold(n_splits=20, random_state=12,shuffle=True)
num_estimators = [20, 40, 60, 80, 100]

for i in num_estimators:
    ada_boost = AdaBoostClassifier(n_estimators=i, random_state=12)
    results = cross_val_score(ada_boost, X_train, y_train, cv=k_folds)
    print("Results for {} estimators:".format(i))
    print(results.mean())

Results for 20 estimators:
0.8052134146341464
Results for 40 estimators:
0.8126524390243903
Results for 60 estimators:
0.8176524390243903
Results for 80 estimators:
0.8114024390243901
Results for 100 estimators:
0.8126524390243903


## voting\Stacking Classification Example

In [18]:
voting_clf = VotingClassifier(estimators=[('SVC', SVC_clf), ('DTree', DTree_clf), ('LogReg', LogReg_clf)], voting='hard')
voting_clf.fit(X_train, y_train)
preds = voting_clf.predict(X_val)
acc = accuracy_score(y_val, preds)
l_loss = log_loss(y_val, preds)
f1 = f1_score(y_val, preds)

print("Accuracy is: " + str(acc))
print("Log Loss is: " + str(l_loss))
print("F1 Score is: " + str(f1))

Accuracy is: 0.7888888888888889
Log Loss is: 7.609215715480287
F1 Score is: 0.6885245901639345


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
