In [1]:
import argparse
import logging
import matplotlib.pyplot as plt
from multiprocess import Pool
import numpy as np
import pandas as pd
import random
import sys

from deap import base
from deap import creator
from deap import tools
from scipy.sparse import csr_matrix
#from scoop import futures
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Split my data in 3 set with same size: Train, Test, and Validation.
# Train and Validation are used in the fitness function, after all, the Test dataset is used to evaluate the best individual
features, labels = load_svmlight_file("./data/data")


x, X_test, y, y_test = train_test_split(features, labels, test_size=0.3333, random_state=42, stratify=labels)
X_train, X_cv, y_train, y_cv = train_test_split(x, y, test_size=0.5, train_size=0.5, random_state=42, stratify=y)

# Classifier instance
clf = DecisionTreeClassifier()

# Store fitness evolution
graph = []
# Store features
num_features = []

### Functions

def run_classifier(_X_train, _X_test, _y_test):
    """
    Execute the classifier
    """
    model = clf.fit(_X_train, y_train)
    predictions = clf.predict(_X_test) 
    score = clf.score(_X_test, _y_test)
    probas = clf.predict_proba(_X_test)

    return predictions, probas, score

def select_idx(individual):
    """
    Get the indexes with 1 (ones) from a individual - this means what features were selected
    """
    return np.array(individual).nonzero()[0].tolist()

def select_features(columns, dataset):
    """
    Filter the dataset only the features selected
    """
    df = pd.DataFrame(dataset.toarray())
    return csr_matrix(pd.DataFrame(df, columns=columns))

def evaluate(individual, _x_test, _y_test):
    """
    Evaluate the dataset with the feature set selected
    """
    columns = select_idx(individual)
    return run_classifier(select_features(columns, X_train), select_features(columns, _x_test), _y_test)

def evalOneMax(individual):
    """
    Fitness function
    """
    predictions, probas, score = evaluate(individual, X_cv, y_cv)
    return score,

def store_fitness(pop):
    logging.debug("--- Fitness ---")
    length = len(pop)
    # Gather all the fitnesses in one list and logging.debug the stats
    fits = [ind.fitness.values[0] for ind in pop]
            
    mean = sum(fits) / length
    sum2 = sum(x*x for x in fits)
    std = abs(sum2 / length - mean**2)**0.5
            
    logging.debug("  Min %s" % min(fits))
    logging.debug("  Max %s" % max(fits))
    logging.debug("  Avg %s" % mean)
    logging.debug("  Std %s" % std)
    
    graph.append(max(fits))

def store_features(pop):
    logging.debug("--- Features ---")
    length = len(pop)
    nf = [np.count_nonzero(ind) for ind in pop]
    
    mean = sum(nf) / length
    sum2 = sum(x*x for x in nf)
    std = abs(sum2 / length - mean**2)**0.5
            
    logging.debug("  Min %s" % min(nf))
    logging.debug("  Max %s" % max(nf))
    logging.debug("  Avg %s" % mean)
    logging.debug("  Std %s" % std)
    num_features.append(max(nf))

### DEAP CONFIGUTATION ###
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()

# Attribute generator
toolbox.register("attr_bool", random.randint, 0, 1)

# Structure initializers
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, 132)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Operator registering
toolbox.register("evaluate", evalOneMax)
toolbox.register("mate", tools.cxOnePoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# SCOOP
#toolbox.register("map", futures.map)
def main(POP, CXPB, MUTPB, DATFILE):
    random.seed(64)

    # Process Pool of 4 workers
    #pool = multiprocessing.Pool(processes=2)
    pool = Pool(processes=4)
    toolbox.register("map", pool.map)

    ## population size
    pop = toolbox.population(n=args.pop)

    ## Probabilities for Crossover, Mutation and number of generations (iterations)
    NGEN = 300

    logging.debug("Start of evolution")

    # Evaluate the entire population
    fitnesses = list(map(toolbox.evaluate, pop))
    for ind, fit in zip(pop, fitnesses):
        ind.fitness.values = fit
    
    logging.debug("  Evaluated %i individuals" % len(pop))

    # Begin the evolution
    for g in range(NGEN):
        logging.debug("\n-- Generation %i --" % g)

        # Select the next generation individuals
        offspring = toolbox.select(pop, len(pop))

        # Clone the selected individuals
        offspring = list(map(toolbox.clone, offspring))

        # Apply crossover and mutation on the offspring
        for child1, child2 in zip(offspring[::2], offspring[1::2]):
            if random.random() < args.cros:
                toolbox.mate(child1, child2)
                del child1.fitness.values
                del child2.fitness.values

        for mutant in offspring:
            if random.random() < args.mut:
                toolbox.mutate(mutant)
                del mutant.fitness.values

        # Evaluate the individuals with an invalid fitness
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]

        #invalid_ind = [ind for ind in offspring]
        fitnesses = map(toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
            #logging.debug("Ind: {} - Fit: {}".format(ind, fit))

        logging.debug("  Evaluated %i individuals" % len(invalid_ind))

        # The population is entirely replaced by the offspring
        pop[:] = offspring

        #store_fitness(pop)
        #store_features(pop)

    logging.debug("-- End of (successful) evolution --")

    best_ind = tools.selBest(pop, 1)[0]
    logging.debug("Best individual is %s, %s" % (best_ind, best_ind.fitness.values))

    predictions, probas, score = evaluate(best_ind, X_test, y_test)

    logging.debug("Test score: {} - N. features selected: {}".format(score, np.count_nonzero(best_ind)))

    # line =plt.plot(graph)
    # plt.show()

    # line =plt.plot(num_features)
    # plt.show()

    with open(args.datfile, 'w') as f:
        f.write(str(score*100))

    pool.close()


In [2]:
from deap import gp
import argparse
import logging
import matplotlib.pyplot as plt
from multiprocess import Pool
import numpy as np
import pandas as pd
import random
import sys
import operator
import math


from deap import base
from deap import creator
from deap import tools
from scipy.sparse import csr_matrix
#from scoop import futures
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Classifier instance
clf = DecisionTreeClassifier()

# Score - Return the mean accuracy on the given test data and labels.
def run_classifier(_X_train, _X_test, _y_test):
    """
    Execute the classifier
    """
    model = clf.fit(_X_train, y_train)
    predictions = clf.predict(_X_test)
    score = clf.score(_X_test, _y_test)
    probas = clf.predict_proba(_X_test)

    return predictions, probas, score

def select_idx(individual):
    """
    Get the indexes with 1 (ones) from a individual - this means what features were selected
    """
    return np.array(individual).nonzero()[0].tolist()

def select_features(columns, dataset):
    """
    Filter the dataset only the features selected
    """
    df = pd.DataFrame(dataset.to_numpy())
    return csr_matrix(pd.DataFrame(df, columns=columns))

def evaluate(individual, _x_test, _y_test):
    """
    Evaluate the dataset with the feature set selected
    """
    columns = select_idx(individual)
    return run_classifier(select_features(columns, X_train), select_features(columns, _x_test), _y_test)
    
# Define new functions
def protectedDiv(left, right):
    try:
        return left / right
    except ZeroDivisionError:
        return 1

# Create the tree
pset = gp.PrimitiveSet("MAIN", 1)
pset.addPrimitive(operator.add, 2)
pset.addPrimitive(operator.sub, 2)
pset.addPrimitive(operator.mul, 2)
pset.addPrimitive(protectedDiv, 2)
pset.addPrimitive(operator.neg, 1)
pset.addPrimitive(math.cos, 1)
pset.addPrimitive(math.sin, 1)
pset.addEphemeralConstant("rand101", lambda: random.randint(-1,1))
pset.renameArguments(ARG0='x')
pset.renameArguments(ARG1='y')

creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMin)

In [13]:
toolbox = base.Toolbox()
toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=2)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("compile", gp.compile, pset=pset)

def evalSymbReg(individual, points):
    # Transform the tree expression in a callable function
    func = toolbox.compile(expr=individual)
    # Evaluate the mean squared error between the expression
    # and the real function : x**4 + x**3 + x**2 + x
    sqerrors = ((func(x) - x**4 - x**3 - x**2 - x)**2 for x in points)
    return math.fsum(sqerrors) / len(points),

toolbox.register("evaluate", evalSymbReg, points=[x/10. for x in range(-10,10)])
toolbox.register("select", tools.selAutomaticEpsilonLexicase)
toolbox.register("mate", gp.cxOnePoint)
toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)

In [17]:
x

NameError: name 'x' is not defined

In [11]:
y

NameError: name 'y' is not defined

In [7]:
X_test = x[10:]
y_test = y[10:]

In [8]:
X_test

array([[0.12095989, 0.37624443],
       [0.8982473 , 0.80097338],
       [0.33282662, 0.98573347],
       [0.69277377, 0.52971678],
       [0.55695667, 0.27353476],
       [0.6236606 , 0.98057015],
       [0.07736617, 0.59434975],
       [0.28821496, 0.94882164],
       [0.31286853, 0.13628459],
       [0.35644116, 0.59712233]])

In [9]:
y_test

array([0.20021821, 0.49031142, 0.5261494 , 0.33413577, 0.19246305,
       0.55265114, 0.30491149, 0.50323232, 0.09942915, 0.33420528])

In [11]:
X_train, X_cv, y_train, y_cv = train_test_split(x, y, test_size=0.5, train_size=0.5, random_state=42
    )

In [12]:
X_train

array([[0.69277377, 0.52971678],
       [0.93530195, 0.90723356],
       [0.0249123 , 0.76775584],
       [0.35644116, 0.59712233],
       [0.89912945, 0.16464876],
       [0.33282662, 0.98573347],
       [0.4593773 , 0.69866791],
       [0.12095989, 0.37624443],
       [0.55695667, 0.27353476],
       [0.86619883, 0.48250919]])

In [13]:
X_cv

array([[0.47276183, 0.81994288],
       [0.28821496, 0.94882164],
       [0.6236606 , 0.98057015],
       [0.25162136, 0.66753129],
       [0.33454912, 0.91309349],
       [0.72581005, 0.67395144],
       [0.8982473 , 0.80097338],
       [0.60600948, 0.0762569 ],
       [0.31286853, 0.13628459],
       [0.07736617, 0.59434975]])

In [14]:
 y_train

array([0.33413577, 0.54714697, 0.38636915, 0.33420528, 0.17223733,
       0.5261494 , 0.39527168, 0.20021821, 0.19246305, 0.32787448])

In [15]:
y_cv

array([0.45724762, 0.50323232, 0.55265114, 0.35892778, 0.49000165,
       0.40955672, 0.49031142, 0.0987294 , 0.09942915, 0.30491149])

In [16]:
# This creates a dataset from https://newbedev.com/how-to-split-data-into-3-sets-train-validation-and-test
def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test
    
np.random.seed([3,1415])
df = pd.DataFrame(np.random.rand(10, 5), columns=list('ABCDE'))
train, validate, test = train_validate_test_split(df)

x, X_test, y, y_test = train_test_split(xtrain,labels,test_size=0.2,train_size=0.8)
X_train, X_cv, y_train, y_cv = train_test_split(x,y,test_size = 0.25,train_size =0.75)

NameError: name 'xtrain' is not defined

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sb

from pmlb import fetch_data, classification_dataset_names, regression_dataset_names

logit_test_scores = []
gnb_test_scores = []

print(classification_dataset_names)
print('')
print(regression_dataset_names)

['GAMETES_Epistasis_2_Way_1000atts_0.4H_EDM_1_EDM_1_1', 'GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1', 'GAMETES_Epistasis_2_Way_20atts_0.4H_EDM_1_1', 'GAMETES_Epistasis_3_Way_20atts_0.2H_EDM_1_1', 'GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_50_EDM_2_001', 'GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_75_EDM_2_001', 'Hill_Valley_with_noise', 'Hill_Valley_without_noise', 'adult', 'agaricus_lepiota', 'allbp', 'allhyper', 'allhypo', 'allrep', 'analcatdata_aids', 'analcatdata_asbestos', 'analcatdata_authorship', 'analcatdata_bankruptcy', 'analcatdata_boxing1', 'analcatdata_boxing2', 'analcatdata_creditscore', 'analcatdata_cyyoung8092', 'analcatdata_cyyoung9302', 'analcatdata_dmft', 'analcatdata_fraud', 'analcatdata_germangss', 'analcatdata_happiness', 'analcatdata_japansolvent', 'analcatdata_lawsuit', 'ann_thyroid', 'appendicitis', 'australian', 'auto', 'backache', 'balance_scale', 'biomed', 'breast', 'breast_cancer', 'breast_cancer_wisconsin', 'breast_w', 'buggyCrx', 'bupa', 'calendarD

In [30]:
x, y = fetch_data('207_autoPrice', return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(x, y)
X_train, X_cv, y_train, y_cv = train_test_split(x, y, test_size=0.5, train_size=0.5,random_state=42)

In [40]:
var = fetch_data('644_fri_c4_250_25')
print(var.describe())

                oz1           oz2           oz3           oz4           oz5  \
count  2.500000e+02  2.500000e+02  2.500000e+02  2.500000e+02  2.500000e+02   
mean   3.645197e-09  2.831221e-10 -5.451497e-09 -1.735985e-09 -1.040287e-09   
std    1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00   
min   -2.357685e+00 -1.681898e+00 -1.970431e+00 -1.630881e+00 -1.641150e+00   
25%   -7.788459e-01 -8.424280e-01 -7.479277e-01 -8.418980e-01 -7.449293e-01   
50%   -3.805557e-02  4.388335e-02 -1.382825e-01 -9.828104e-03 -1.903547e-01   
75%    8.103617e-01  8.395882e-01  5.983409e-01  9.393843e-01  4.415072e-01   
max    2.136381e+00  1.752596e+00  3.165987e+00  1.717576e+00  4.297223e+00   

                oz6           oz7           oz8           oz9          oz10  \
count  2.500000e+02  2.500000e+02  2.500000e+02  2.500000e+02  2.500000e+02   
mean  -4.768372e-10 -2.256595e-09 -2.207235e-10 -6.197952e-10 -1.158565e-09   
std    1.000000e+00  1.000000e+00  1.000000e+00  1.

In [34]:
X_train

array([[0.00000000e+00, 1.15000000e+02, 9.88000031e+01, ...,
        4.80000000e+03, 2.60000000e+01, 3.20000000e+01],
       [2.00000000e+00, 9.40000000e+01, 9.73000031e+01, ...,
        4.80000000e+03, 3.70000000e+01, 4.60000000e+01],
       [0.00000000e+00, 1.02000000e+02, 9.70000000e+01, ...,
        4.80000000e+03, 2.40000000e+01, 2.90000000e+01],
       ...,
       [1.00000000e+00, 1.48000000e+02, 9.36999969e+01, ...,
        5.50000000e+03, 3.10000000e+01, 3.80000000e+01],
       [2.00000000e+00, 1.04000000e+02, 9.90999985e+01, ...,
        5.25000000e+03, 2.10000000e+01, 2.80000000e+01],
       [0.00000000e+00, 1.02000000e+02, 9.71999969e+01, ...,
        5.20000000e+03, 2.60000000e+01, 3.20000000e+01]])

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import matplotlib.pyplot as plt
import seaborn as sb

from pmlb import fetch_data, classification_dataset_names

logit_test_scores = []
gnb_test_scores = []

min_max_scaler = preprocessing.MinMaxScaler()

for classification_dataset in classification_dataset_names:
    X, y = fetch_data(classification_dataset, return_X_y=True)
    train_X, test_X, train_y, test_y = min_max_scaler.fit_transform(train_test_split(X, y))

    logit = LogisticRegression()
    gnb = GaussianNB()

    logit.fit(train_X, train_y)
    gnb.fit(train_X, train_y)

    logit_test_scores.append(logit.score(test_X, test_y))
    gnb_test_scores.append(gnb.score(test_X, test_y))

sb.boxplot(data=[logit_test_scores, gnb_test_scores], notch=True)
plt.xticks([0, 1], ['LogisticRegression', 'GaussianNB'])
plt.ylabel('Test Accuracy')

ValueError: setting an array element with a sequence.