In [1]:
import pandas as pd
import numpy as np

# Read the bank dataset from the Excel workbook in the current working directory.
df = pd.read_excel(io="bank.xlsx")

In [2]:
# Binary label: 1 where the raw outcome column 'y' equals 'yes', 0 for anything else.
df['target'] = (df['y'] == 'yes').astype('int64')

# The raw string column is no longer needed once the numeric label exists.
df.drop(columns=['y'], inplace=True)

In [3]:
# Share of missing values per column, largest first, expressed as a percentage.
df.isna().mean().sort_values(ascending=False).mul(100)

target       0.0
loan         0.0
job          0.0
marital      0.0
education    0.0
default      0.0
balance      0.0
housing      0.0
contact      0.0
poutcome     0.0
day          0.0
month        0.0
duration     0.0
campaign     0.0
pdays        0.0
previous     0.0
age          0.0
dtype: float64

In [4]:
# Data transformation
# Replace every object-dtype (categorical) column with integer codes.
from sklearn import preprocessing
from collections import defaultdict

# One LabelEncoder per column, created on first lookup by column name.
d = defaultdict(preprocessing.LabelEncoder)

# Fit an encoder on each categorical column; missing values are treated as
# the literal category 'NA'. Looking up d[col.name] registers the encoder.
fit = (
    df.select_dtypes(include=['object'])
      .fillna('NA')
      .apply(lambda col: d[col.name].fit_transform(col))
)

# Overwrite each categorical column in df with the codes from its fitted encoder.
for column_name, encoder in d.items():
    df[column_name] = encoder.transform(df[column_name].fillna('NA'))

In [5]:
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module was removed in scikit-learn 0.20 and
# importing it fails on current installs.
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing. The shuffle is seeded so the split,
# and every score derived from it, is reproducible after a kernel restart.
train, test = train_test_split(df, test_size=0.4, random_state=42)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

# Every column except the label is used as a feature; both splits select
# their columns the same way so train and test feature frames line up.
features_train = train[train.columns.difference(['target'])]
label_train = train['target']
features_test = test[test.columns.difference(['target'])]
label_test = test['target']



In [7]:
from tpot import TPOTClassifier

In [8]:
# Configure the TPOT pipeline search.
tpot = TPOTClassifier(
    generations=5,        # number of evolutionary iterations
    population_size=20,   # pipelines kept in each generation
    cv=5,                 # folds for the internal cross-validation score
    random_state=42,      # fixed seed for the search
    verbosity=2,          # show the progress bar and per-generation best score
)

In [9]:
# Run the pipeline search on the training split only; the test split stays
# untouched until the final evaluation. This is the long-running cell.
tpot.fit(features_train, label_train)

Optimization Progress:  33%|███▎      | 40/120 [00:35<00:40,  1.99pipeline/s]

Generation 1 - Current best internal CV score: 0.8952843499023224


Optimization Progress:  50%|█████     | 60/120 [00:57<00:54,  1.11pipeline/s]

Generation 2 - Current best internal CV score: 0.8952843499023224


Optimization Progress:  67%|██████▋   | 80/120 [01:14<00:38,  1.04pipeline/s]

Generation 3 - Current best internal CV score: 0.8978660191013675


Optimization Progress:  83%|████████▎ | 100/120 [01:35<00:16,  1.19pipeline/s]

Generation 4 - Current best internal CV score: 0.8978660191013675


                                                                              

Generation 5 - Current best internal CV score: 0.8978660191013675

Best pipeline: ExtraTreesClassifier(LogisticRegression(input_matrix, C=0.5, dual=True, penalty=l2), bootstrap=False, criterion=gini, max_features=0.8, min_samples_leaf=12, min_samples_split=4, n_estimators=100)


TPOTClassifier(config_dict={'sklearn.feature_selection.SelectFwe': {'score_func': {'sklearn.feature_selection.f_classif': None}, 'alpha': array([0.   , 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008,
       0.009, 0.01 , 0.011, 0.012, 0.013, 0.014, 0.015, 0.016, 0.017,
       0.018, 0.019, 0.02 , 0.021, 0....ge': ['ward', 'complete', 'average'], 'affinity': ['euclidean', 'l1', 'l2', 'manhattan', 'cosine']}},
        crossover_rate=0.1, cv=5, disable_update_check=False,
        early_stop=None, generations=5, max_eval_time_mins=5,
        max_time_mins=None, memory=None, mutation_rate=0.9, n_jobs=1,
        offspring_size=20, periodic_checkpoint_folder=None,
        population_size=20, random_state=42, scoring=None, subsample=1.0,
        verbosity=2, warm_start=False)

In [11]:
# Score the fitted TPOT object on the held-out test split.
# NOTE(review): scoring was left unset, so this is presumably accuracy
# (TPOT's default for classifiers) — confirm against the TPOT docs.
tpot.score(features_test, label_test)

0.8933112216694307

In [12]:
# Write the best pipeline found by the search to a standalone Python script.
# NOTE(review): the file name mentions "mnist", but this notebook is trained on
# bank.xlsx — the name looks copied from another example; consider renaming.
tpot.export('tpot_mnist_pipeline.py')

True