# Power of Model Ensembling

Ensemble model combines multiple 'individual' (diverse) models together and delivers superior prediction power. ... Basically, an ensemble is a supervised learning technique for combining multiple weak learners/ models to produce a strong learner. Ensemble model works better, when we ensemble models with low correlation.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import (AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, GradientBoostingRegressor)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics, linear_model, naive_bayes
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import OneHotEncoder 
from scipy import sparse
from mlxtend.classifier import StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from random import *

## DATA préalablement nettoyée

In [2]:
#Chargement du fichier DataClean.csv dans le dataframe df
df = pd.read_csv("DataClean.csv")

In [3]:
#Selection des 20 premier variables : Random Forest Result
SelectedFeatures=['var38',
 'ind_var30',
 'var15',
 'saldo_medio_var5_ult1',
 'saldo_var30',
 'saldo_medio_var5_ult3',
 'num_meses_var5_ult3',
 'num_var30',
 'num_var35',
 'imp_op_var41_efect_ult1',
 'var36',
 'num_var22_ult3',
 'num_var22_ult1',
 'imp_op_var41_efect_ult3',
 'num_var45_hace3',
 'saldo_var5',
 'num_var22_hace2',
 'imp_op_var41_ult1',
 'num_med_var45_ult3',
 'num_var4',
 'TARGET']

In [4]:
df=df.copy()[SelectedFeatures]

df.to_csv("df.csv")

In [5]:
df.head()

Unnamed: 0,var38,ind_var30,var15,saldo_medio_var5_ult1,saldo_var30,saldo_medio_var5_ult3,num_meses_var5_ult3,num_var30,num_var35,imp_op_var41_efect_ult1,...,num_var22_ult3,num_var22_ult1,imp_op_var41_efect_ult3,num_var45_hace3,saldo_var5,num_var22_hace2,imp_op_var41_ult1,num_med_var45_ult3,num_var4,TARGET
0,39205.17,0,23,0.0,0.0,0.0,0,0,0,0.0,...,0,0,0.0,0,0.0,0,0.0,0,0,0
1,49278.03,1,34,0.0,300.0,0.0,1,3,3,0.0,...,0,0,0.0,0,0.0,0,0.0,0,1,0
2,67333.77,1,23,3.0,3.0,2.07,3,3,3,0.0,...,0,0,0.0,0,3.0,0,0.0,0,1,0
3,64007.97,1,37,91.56,70.62,138.84,2,3,9,0.0,...,3,0,0.0,3,70.62,3,195.0,15,3,0
4,117310.979016,1,39,40501.08,135003.0,13501.47,3,3,3,0.0,...,9,6,0.0,0,0.0,3,0.0,0,1,0


## Echantillonnage de données

In [6]:
#Y, X, X_train, X_test, Y_train, Y_test = ""

def splitData(df):
    Y = df.TARGET
    X = df.drop(['TARGET'], axis=1)

    # diviser X et Y en training and testing
    from sklearn.model_selection import train_test_split
    return  train_test_split(X, Y, test_size=0.25,random_state=1)

In [7]:
X_train, X_test, Y_train, Y_test = splitData(df)

X_train.to_csv("X_train.csv")
X_test.to_csv("X_test.csv")
Y_train.to_csv("Y_train.csv")
Y_test.to_csv("Y_test.csv")

poids = []
O = 0.5/sum(Y_train==1)
Z = 0.5/sum(Y_train==0)
for i in Y_train:
    if i==0:
        poids.append(Z)
    else :
        poids.append(O)

poids = np.array(poids)

<h5>Base Estimators :</h5>
<ul> 
    <li><h9>DecisionTreeClassifier x100</h9></li>
    <li><h9>RandomForestClassifier x100</h9></li>
    <li><h9>LogisticRegression x100</h9></li>
    <li><h9>BernoulliNB x100</h9></li>
    <li><h9>KNeighborsClassifier x100</h9></li>
    <li><h9>ExtraTreesClassifier x100</h9></li>
    <li><h9>SVC x1</h9></li>
    <li><h9>LinearSVC x100</h9></li>
    <li><h9>GradientBoostingClassifier x100</h9></li>
    <li><h9>SGDClassifier x100</h9></li>
    <li><h9>MLPClassifier x100</h9></li>
</ul>

<br/><br/><br/>
## Random Search

In [15]:
def RandomPredictSearch(header, clf, param_dic, n_iteration, metric='roc_auc', Xt=X_train, Yt=Y_train, Xv=X_test, n_iter_search=3):
    pred_df=pd.DataFrame()
    params=[]
    for t in xrange(n_iteration):
        # run randomized search
        random_search = RandomizedSearchCV(clf, param_distributions=param_dic, n_iter=n_iter_search, scoring=metric, n_jobs=-1)
        random_search.fit(Xt, Yt)
        #Prediction de X_test:
        pred_df[header+`t`]=random_search.predict_proba(Xv)[:,1]
        #Sauvegarder les parametres utiliser:
        params.append(random_search.best_params_)
    
    # enregistrement ...
    pred_df.to_csv("BaseEstimators/"+header+".csv")
    DumpObject('BaseEstimators/Object_Parametre.'+header, params)
    
    return pred_df, params

In [14]:
RandomizedSearchCV.best_params_

In [9]:
def getParamRandom(param_list, param_names):
    dictionnaire = {}
    for i in xrange(len(param_names)):
      dictionnaire[param_names[i]]  = sample(param_list[i],  1)[0] 
    return dictionnaire

In [10]:
import pickle

def DumpObject(name, Object):
    # enregistrement ...
    with open(name, 'wb') as fichier:
        mon_pickler = pickle.Pickler(fichier)
        mon_pickler.dump(Object)

def LoadObject(name):
    # Lecture des objets contenus dans le fichier...
    with open(name, 'rb') as fichier:
        mon_depickler = pickle.Unpickler(fichier)
        object_recupere = mon_depickler.load()
    return object_recupere

def perdictProb(header, clf, param_list, param_names, n_iter_search, X, Y, Xv, metric='roc_auc'):
    pred_df=pd.DataFrame()
    df_header=[]
    
    for t in xrange(n_iter_search):
        newdict = getParamRandom(param_list,param_names)
        for p in newdict.keys():
            clf.__setattr__(p, newdict[p])
        clf.fit(X,Y)
        #####
        df_header.append(clf.get_params())
        #####
        pred_df[header+`t`]=clf.predict_proba(Xv)[:,1]
    
    # enregistrement ...
    pred_df.to_csv(header+".csv")
    DumpObject('Object_Parametre.'+header, df_header)
    
    return pred_df, df_header

## DecisionTreeClassifier x100

In [18]:
params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,10,11,12,13,14, None],
              "max_features": sp_randint(1, 20),
              "min_samples_split": sp_randint(2, 20),
              "min_samples_leaf": sp_randint(1, 20),
              "criterion": ["gini", "entropy"],
              "n_estimators" : range(50,999,69),
              "random_state" : range(1,100,7)}

clf_DTC = DecisionTreeClassifier()

In [23]:
clf_DTC.get_params().keys()

['presort',
 'splitter',
 'max_leaf_nodes',
 'min_samples_leaf',
 'min_samples_split',
 'min_weight_fraction_leaf',
 'criterion',
 'random_state',
 'min_impurity_split',
 'max_features',
 'max_depth',
 'class_weight']

In [24]:
%time preds_DTC, params_DTC = RandomPredictSearch("DTC", clf_DTC, params_DTC, 1)

JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/runpy.py in _run_module_as_main(mod_name='ipykernel.__main__', alter_argv=1)
    169     pkg_name = mod_name.rpartition('.')[0]
    170     main_globals = sys.modules["__main__"].__dict__
    171     if alter_argv:
    172         sys.argv[0] = fname
    173     return _run_code(code, main_globals, None,
--> 174                      "__main__", fname, loader, pkg_name)
        fname = '/Users/salaheddine/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py'
        loader = <pkgutil.ImpLoader instance>
        pkg_name = 'ipykernel'
    175 
    176 def run_module(mod_name, init_globals=None,
    177                run_name=None, alter_sys=False):
    178     """Execute a module's code without importing it

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/runpy.py in _run_code(code=<code object <module> at 0x1007ce2b0, file "/Use...2.7/site-packages/ipykernel/__main__.py", line 1>, run_globals={'__builtins__': <module '__builtin__' (built-in)>, '__doc__': None, '__file__': '/Users/salaheddine/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': 'ipykernel', 'app': <module 'ipykernel.kernelapp' from '/Users/salah...python2.7/site-packages/ipykernel/kernelapp.pyc'>}, init_globals=None, mod_name='__main__', mod_fname='/Users/salaheddine/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py', mod_loader=<pkgutil.ImpLoader instance>, pkg_name='ipykernel')
     67         run_globals.update(init_globals)
     68     run_globals.update(__name__ = mod_name,
     69                        __file__ = mod_fname,
     70                        __loader__ = mod_loader,
     71                        __package__ = pkg_name)
---> 72     exec code in run_globals
        code = <code object <module> at 0x1007ce2b0, file "/Use...2.7/site-packages/ipykernel/__main__.py", line 1>
        run_globals = {'__builtins__': <module '__builtin__' (built-in)>, '__doc__': None, '__file__': '/Users/salaheddine/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': 'ipykernel', 'app': <module 'ipykernel.kernelapp' from '/Users/salah...python2.7/site-packages/ipykernel/kernelapp.pyc'>}
     73     return run_globals
     74 
     75 def _run_module_code(code, init_globals=None,
     76                     mod_name=None, mod_fname=None,

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py in <module>()
      1 
      2 
----> 3 
      4 if __name__ == '__main__':
      5     from ipykernel import kernelapp as app
      6     app.launch_new_instance()
      7 
      8 
      9 
     10 

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    584         
    585         If a global instance already exists, this reinitializes and starts it
    586         """
    587         app = cls.instance(**kwargs)
    588         app.initialize(argv)
--> 589         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    590 
    591 #-----------------------------------------------------------------------------
    592 # utility functions, for convenience
    593 #-----------------------------------------------------------------------------

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    437         
    438         if self.poller is not None:
    439             self.poller.start()
    440         self.kernel.start()
    441         try:
--> 442             ioloop.IOLoop.instance().start()
    443         except KeyboardInterrupt:
    444             pass
    445 
    446 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    157             PollIOLoop.configure(ZMQIOLoop)
    158         return PollIOLoop.current(*args, **kwargs)
    159     
    160     def start(self):
    161         try:
--> 162             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    163         except ZMQError as e:
    164             if e.errno == ETERM:
    165                 # quietly return on ETERM
    166                 pass

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    878                 self._events.update(event_pairs)
    879                 while self._events:
    880                     fd, events = self._events.popitem()
    881                     try:
    882                         fd_obj, handler_func = self._handlers[fd]
--> 883                         handler_func(fd_obj, events)
        handler_func = <function null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    884                     except (OSError, IOError) as e:
    885                         if errno_from_exception(e) == errno.EPIPE:
    886                             # Happens when the client closes the connection
    887                             pass

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    271         if self.control_stream:
    272             self.control_stream.on_recv(self.dispatch_control, copy=False)
    273 
    274         def make_dispatcher(stream):
    275             def dispatcher(msg):
--> 276                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    277             return dispatcher
    278 
    279         for s in self.shell_streams:
    280             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {u'allow_stdin': True, u'code': u'%time preds_DTC, params_DTC = RandomPredictSearch("DTC", clf_DTC, params_DTC, 1)', u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {'date': '2016-12-08T01:12:10.307067', u'msg_id': u'5A656E114FF1475AA51BDCF0C4C73161', u'msg_type': u'execute_request', u'session': u'4EE6CE8B40A94ED1B27B47A6F2D194EB', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'5A656E114FF1475AA51BDCF0C4C73161', 'msg_type': u'execute_request', 'parent_header': {}})
    223             self.log.error("UNKNOWN MESSAGE TYPE: %r", msg_type)
    224         else:
    225             self.log.debug("%s: %s", msg_type, msg)
    226             self.pre_handler_hook()
    227             try:
--> 228                 handler(stream, idents, msg)
        handler = <bound method IPythonKernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = ['4EE6CE8B40A94ED1B27B47A6F2D194EB']
        msg = {'buffers': [], 'content': {u'allow_stdin': True, u'code': u'%time preds_DTC, params_DTC = RandomPredictSearch("DTC", clf_DTC, params_DTC, 1)', u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {'date': '2016-12-08T01:12:10.307067', u'msg_id': u'5A656E114FF1475AA51BDCF0C4C73161', u'msg_type': u'execute_request', u'session': u'4EE6CE8B40A94ED1B27B47A6F2D194EB', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'5A656E114FF1475AA51BDCF0C4C73161', 'msg_type': u'execute_request', 'parent_header': {}}
    229             except Exception:
    230                 self.log.error("Exception in message handler:", exc_info=True)
    231             finally:
    232                 self.post_handler_hook()

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=['4EE6CE8B40A94ED1B27B47A6F2D194EB'], parent={'buffers': [], 'content': {u'allow_stdin': True, u'code': u'%time preds_DTC, params_DTC = RandomPredictSearch("DTC", clf_DTC, params_DTC, 1)', u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {'date': '2016-12-08T01:12:10.307067', u'msg_id': u'5A656E114FF1475AA51BDCF0C4C73161', u'msg_type': u'execute_request', u'session': u'4EE6CE8B40A94ED1B27B47A6F2D194EB', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'5A656E114FF1475AA51BDCF0C4C73161', 'msg_type': u'execute_request', 'parent_header': {}})
    386         if not silent:
    387             self.execution_count += 1
    388             self._publish_execute_input(code, parent, self.execution_count)
    389 
    390         reply_content = self.do_execute(code, silent, store_history,
--> 391                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    392 
    393         # Flush output before sending the reply.
    394         sys.stdout.flush()
    395         sys.stderr.flush()

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code=u'%time preds_DTC, params_DTC = RandomPredictSearch("DTC", clf_DTC, params_DTC, 1)', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    194 
    195         reply_content = {}
    196         # FIXME: the shell calls the exception handler itself.
    197         shell._reply_content = None
    198         try:
--> 199             shell.run_cell(code, store_history=store_history, silent=silent)
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = u'%time preds_DTC, params_DTC = RandomPredictSearch("DTC", clf_DTC, params_DTC, 1)'
        store_history = True
        silent = False
    200         except:
    201             status = u'error'
    202             # FIXME: this code right now isn't being used yet by default,
    203             # because the run_cell() call above directly fires off exception

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell=u'%time preds_DTC, params_DTC = RandomPredictSearch("DTC", clf_DTC, params_DTC, 1)', store_history=True, silent=False, shell_futures=True)
   2718                 self.displayhook.exec_result = result
   2719 
   2720                 # Execute the user code
   2721                 interactivity = "none" if silent else self.ast_node_interactivity
   2722                 self.run_ast_nodes(code_ast.body, cell_name,
-> 2723                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler instance>
   2724 
   2725                 # Reset this so later displayed values do not modify the
   2726                 # ExecutionResult
   2727                 self.displayhook.exec_result = None

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Expr object>], cell_name='<ipython-input-24-36cd85048802>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler instance>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   2826                     return True
   2827 
   2828             for i, node in enumerate(to_run_interactive):
   2829                 mod = ast.Interactive([node])
   2830                 code = compiler(mod, cell_name, "single")
-> 2831                 if self.run_code(code, result):
        self.run_code = <bound method ZMQInteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x113e206b0, file "<ipython-input-24-36cd85048802>", line 1>
        result = <IPython.core.interactiveshell.ExecutionResult object>
   2832                     return True
   2833 
   2834             # Flush softspace
   2835             if softspace(sys.stdout, 0):

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x113e206b0, file "<ipython-input-24-36cd85048802>", line 1>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   2880         outflag = 1  # happens in more places, so it's easier as default
   2881         try:
   2882             try:
   2883                 self.hooks.pre_run_code_hook()
   2884                 #rprint('Running code', repr(code_obj)) # dbg
-> 2885                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x113e206b0, file "<ipython-input-24-36cd85048802>", line 1>
        self.user_global_ns = {'AdaBoostClassifier': <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>, 'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'DumpObject': <function DumpObject>, 'ExtraTreesClassifier': <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>, 'GradientBoostingClassifier': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>, 'GradientBoostingRegressor': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', u"import numpy as np\nimport pandas as pd\nfrom ...port randint as sp_randint\nfrom random import *", u'#Chargement du fichier DataClean.csv dans le dataframe df\ndf = pd.read_csv("DataClean.csv")', u"#Selection des 20 premier variables : Random F... 'num_med_var45_ult3',\n 'num_var4',\n 'TARGET']", u'df=df.copy()[SelectedFeatures]', u'df.head()', u'#Y, X, X_train, X_test, Y_train, Y_test = ""\n..._test_split(X, Y, test_size=0.25,random_state=1)', u'X_train, X_test, Y_train, Y_test = splitData(df)', u'def RandomPredictSearch(header, clf, param_dic...eader, params)\n    \n    return pred_df, params', u'def getParamRandom(param_list, param_names):\n...(param_list[i],  1)[0] \n    return dictionnaire', u"import pickle\n\ndef DumpObject(name, Object):...mon_depickler.load()\n    return object_recupere", u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_DTC = perdictProb("DTC", clf_DTC, params_DTC, 1)\')', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', u"get_ipython().magic(u'pinfo RandomizedSearchCV')", u'def RandomPredictSearch(header, clf, param_dic...eader, params)\n    \n    return pred_df, params', u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', ...], 'Lasso': <class 'sklearn.linear_model.coordinate_descent.Lasso'>, 'LinearRegression': <class 'sklearn.linear_model.base.LinearRegression'>, ...}
        self.user_ns = {'AdaBoostClassifier': <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>, 'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'DumpObject': <function DumpObject>, 'ExtraTreesClassifier': <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>, 'GradientBoostingClassifier': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>, 'GradientBoostingRegressor': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', u"import numpy as np\nimport pandas as pd\nfrom ...port randint as sp_randint\nfrom random import *", u'#Chargement du fichier DataClean.csv dans le dataframe df\ndf = pd.read_csv("DataClean.csv")', u"#Selection des 20 premier variables : Random F... 'num_med_var45_ult3',\n 'num_var4',\n 'TARGET']", u'df=df.copy()[SelectedFeatures]', u'df.head()', u'#Y, X, X_train, X_test, Y_train, Y_test = ""\n..._test_split(X, Y, test_size=0.25,random_state=1)', u'X_train, X_test, Y_train, Y_test = splitData(df)', u'def RandomPredictSearch(header, clf, param_dic...eader, params)\n    \n    return pred_df, params', u'def getParamRandom(param_list, param_names):\n...(param_list[i],  1)[0] \n    return dictionnaire', u"import pickle\n\ndef DumpObject(name, Object):...mon_depickler.load()\n    return object_recupere", u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_DTC = perdictProb("DTC", clf_DTC, params_DTC, 1)\')', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', u"get_ipython().magic(u'pinfo RandomizedSearchCV')", u'def RandomPredictSearch(header, clf, param_dic...eader, params)\n    \n    return pred_df, params', u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', ...], 'Lasso': <class 'sklearn.linear_model.coordinate_descent.Lasso'>, 'LinearRegression': <class 'sklearn.linear_model.base.LinearRegression'>, ...}
   2886             finally:
   2887                 # Reset our crash handler in place
   2888                 sys.excepthook = old_excepthook
   2889         except SystemExit as e:

...........................................................................
/Users/salaheddine/Desktop/S3/Etude_de_cas/Projet2_Santander_Customer_Satisfaction/<ipython-input-24-36cd85048802> in <module>()
----> 1 
      2 
      3 
      4 
      5 
      6 get_ipython().magic(u'time preds_DTC, params_DTC = RandomPredictSearch("DTC", clf_DTC, params_DTC, 1)')
      7 
      8 
      9 
     10 

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py in magic(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, arg_s=u'time preds_DTC, params_DTC = RandomPredictSearch("DTC", clf_DTC, params_DTC, 1)')
   2158         compound statements.
   2159         """
   2160         # TODO: should we issue a loud deprecation warning here?
   2161         magic_name, _, magic_arg_s = arg_s.partition(' ')
   2162         magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2163         return self.run_line_magic(magic_name, magic_arg_s)
        self.run_line_magic = <bound method ZMQInteractiveShell.run_line_magic of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        magic_name = u'time'
        magic_arg_s = u'preds_DTC, params_DTC = RandomPredictSearch("DTC", clf_DTC, params_DTC, 1)'
   2164 
   2165     #-------------------------------------------------------------------------
   2166     # Things related to macros
   2167     #-------------------------------------------------------------------------

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_line_magic(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, magic_name=u'time', line=u'preds_DTC, params_DTC = RandomPredictSearch("DTC", clf_DTC, params_DTC, 1)')
   2079             kwargs = {}
   2080             # Grab local namespace if we need it:
   2081             if getattr(fn, "needs_local_scope", False):
   2082                 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
   2083             with self.builtin_trap:
-> 2084                 result = fn(*args,**kwargs)
        result = undefined
        fn = <bound method ExecutionMagics.time of <IPython.core.magics.execution.ExecutionMagics object>>
        args = [u'preds_DTC, params_DTC = RandomPredictSearch("DTC", clf_DTC, params_DTC, 1)']
        kwargs = {'local_ns': {'AdaBoostClassifier': <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>, 'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'DumpObject': <function DumpObject>, 'ExtraTreesClassifier': <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>, 'GradientBoostingClassifier': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>, 'GradientBoostingRegressor': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', u"import numpy as np\nimport pandas as pd\nfrom ...port randint as sp_randint\nfrom random import *", u'#Chargement du fichier DataClean.csv dans le dataframe df\ndf = pd.read_csv("DataClean.csv")', u"#Selection des 20 premier variables : Random F... 'num_med_var45_ult3',\n 'num_var4',\n 'TARGET']", u'df=df.copy()[SelectedFeatures]', u'df.head()', u'#Y, X, X_train, X_test, Y_train, Y_test = ""\n..._test_split(X, Y, test_size=0.25,random_state=1)', u'X_train, X_test, Y_train, Y_test = splitData(df)', u'def RandomPredictSearch(header, clf, param_dic...eader, params)\n    \n    return pred_df, params', u'def getParamRandom(param_list, param_names):\n...(param_list[i],  1)[0] \n    return dictionnaire', u"import pickle\n\ndef DumpObject(name, Object):...mon_depickler.load()\n    return object_recupere", u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_DTC = perdictProb("DTC", clf_DTC, params_DTC, 1)\')', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', u"get_ipython().magic(u'pinfo RandomizedSearchCV')", u'def RandomPredictSearch(header, clf, param_dic...eader, params)\n    \n    return pred_df, params', u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', ...], 'Lasso': <class 'sklearn.linear_model.coordinate_descent.Lasso'>, 'LinearRegression': <class 'sklearn.linear_model.base.LinearRegression'>, ...}}
   2085             return result
   2086 
   2087     def run_cell_magic(self, magic_name, line, cell):
   2088         """Execute the given cell magic.

...........................................................................
/Users/salaheddine/Desktop/S3/Etude_de_cas/Projet2_Santander_Customer_Satisfaction/<decorator-gen-60> in time(self=<IPython.core.magics.execution.ExecutionMagics object>, line=u'preds_DTC, params_DTC = RandomPredictSearch("DTC", clf_DTC, params_DTC, 1)', cell=None, local_ns={'AdaBoostClassifier': <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>, 'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'DumpObject': <function DumpObject>, 'ExtraTreesClassifier': <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>, 'GradientBoostingClassifier': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>, 'GradientBoostingRegressor': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', u"import numpy as np\nimport pandas as pd\nfrom ...port randint as sp_randint\nfrom random import *", u'#Chargement du fichier DataClean.csv dans le dataframe df\ndf = pd.read_csv("DataClean.csv")', u"#Selection des 20 premier variables : Random F... 'num_med_var45_ult3',\n 'num_var4',\n 'TARGET']", u'df=df.copy()[SelectedFeatures]', u'df.head()', u'#Y, X, X_train, X_test, Y_train, Y_test = ""\n..._test_split(X, Y, test_size=0.25,random_state=1)', u'X_train, X_test, Y_train, Y_test = splitData(df)', u'def RandomPredictSearch(header, clf, param_dic...eader, params)\n    \n    return pred_df, params', u'def getParamRandom(param_list, param_names):\n...(param_list[i],  1)[0] \n    return dictionnaire', u"import pickle\n\ndef DumpObject(name, Object):...mon_depickler.load()\n    return object_recupere", u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_DTC = perdictProb("DTC", clf_DTC, params_DTC, 1)\')', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', u"get_ipython().magic(u'pinfo RandomizedSearchCV')", u'def RandomPredictSearch(header, clf, param_dic...eader, params)\n    \n    return pred_df, params', u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', ...], 'Lasso': <class 'sklearn.linear_model.coordinate_descent.Lasso'>, 'LinearRegression': <class 'sklearn.linear_model.base.LinearRegression'>, ...})
      1 
----> 2 
      3 
      4 
      5 
      6 
      7 
      8 
      9 
     10 

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/IPython/core/magic.py in <lambda>(f=<function time>, *a=(<IPython.core.magics.execution.ExecutionMagics object>, u'preds_DTC, params_DTC = RandomPredictSearch("DTC", clf_DTC, params_DTC, 1)', None, {'AdaBoostClassifier': <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>, 'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'DumpObject': <function DumpObject>, 'ExtraTreesClassifier': <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>, 'GradientBoostingClassifier': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>, 'GradientBoostingRegressor': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', u"import numpy as np\nimport pandas as pd\nfrom ...port randint as sp_randint\nfrom random import *", u'#Chargement du fichier DataClean.csv dans le dataframe df\ndf = pd.read_csv("DataClean.csv")', u"#Selection des 20 premier variables : Random F... 'num_med_var45_ult3',\n 'num_var4',\n 'TARGET']", u'df=df.copy()[SelectedFeatures]', u'df.head()', u'#Y, X, X_train, X_test, Y_train, Y_test = ""\n..._test_split(X, Y, test_size=0.25,random_state=1)', u'X_train, X_test, Y_train, Y_test = splitData(df)', u'def RandomPredictSearch(header, clf, param_dic...eader, params)\n    \n    return pred_df, params', u'def getParamRandom(param_list, param_names):\n...(param_list[i],  1)[0] \n    return dictionnaire', u"import pickle\n\ndef DumpObject(name, Object):...mon_depickler.load()\n    return object_recupere", u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_DTC = perdictProb("DTC", clf_DTC, params_DTC, 1)\')', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', u"get_ipython().magic(u'pinfo RandomizedSearchCV')", u'def RandomPredictSearch(header, clf, param_dic...eader, params)\n    \n    return pred_df, params', u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', ...], 'Lasso': <class 'sklearn.linear_model.coordinate_descent.Lasso'>, 'LinearRegression': <class 'sklearn.linear_model.base.LinearRegression'>, ...}), **k={})
    188     validate_type(magic_kind)
    189 
    190     # This is a closure to capture the magic_kind.  We could also use a class,
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
        f = <function time>
        a = (<IPython.core.magics.execution.ExecutionMagics object>, u'preds_DTC, params_DTC = RandomPredictSearch("DTC", clf_DTC, params_DTC, 1)', None, {'AdaBoostClassifier': <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>, 'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'DumpObject': <function DumpObject>, 'ExtraTreesClassifier': <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>, 'GradientBoostingClassifier': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>, 'GradientBoostingRegressor': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', u"import numpy as np\nimport pandas as pd\nfrom ...port randint as sp_randint\nfrom random import *", u'#Chargement du fichier DataClean.csv dans le dataframe df\ndf = pd.read_csv("DataClean.csv")', u"#Selection des 20 premier variables : Random F... 'num_med_var45_ult3',\n 'num_var4',\n 'TARGET']", u'df=df.copy()[SelectedFeatures]', u'df.head()', u'#Y, X, X_train, X_test, Y_train, Y_test = ""\n..._test_split(X, Y, test_size=0.25,random_state=1)', u'X_train, X_test, Y_train, Y_test = splitData(df)', u'def RandomPredictSearch(header, clf, param_dic...eader, params)\n    \n    return pred_df, params', u'def getParamRandom(param_list, param_names):\n...(param_list[i],  1)[0] \n    return dictionnaire', u"import pickle\n\ndef DumpObject(name, Object):...mon_depickler.load()\n    return object_recupere", u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_DTC = perdictProb("DTC", clf_DTC, params_DTC, 1)\')', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', u"get_ipython().magic(u'pinfo RandomizedSearchCV')", u'def RandomPredictSearch(header, clf, param_dic...eader, params)\n    \n    return pred_df, params', u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', ...], 'Lasso': <class 'sklearn.linear_model.coordinate_descent.Lasso'>, 'LinearRegression': <class 'sklearn.linear_model.base.LinearRegression'>, ...})
        k = {}
    194 
    195         if callable(arg):
    196             # "Naked" decorator call (just @foo, no args)
    197             func = arg

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/IPython/core/magics/execution.py in time(self=<IPython.core.magics.execution.ExecutionMagics object>, line=u'preds_DTC, params_DTC = RandomPredictSearch("DTC", clf_DTC, params_DTC, 1)', cell=None, local_ns={'AdaBoostClassifier': <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>, 'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'DumpObject': <function DumpObject>, 'ExtraTreesClassifier': <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>, 'GradientBoostingClassifier': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>, 'GradientBoostingRegressor': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', u"import numpy as np\nimport pandas as pd\nfrom ...port randint as sp_randint\nfrom random import *", u'#Chargement du fichier DataClean.csv dans le dataframe df\ndf = pd.read_csv("DataClean.csv")', u"#Selection des 20 premier variables : Random F... 'num_med_var45_ult3',\n 'num_var4',\n 'TARGET']", u'df=df.copy()[SelectedFeatures]', u'df.head()', u'#Y, X, X_train, X_test, Y_train, Y_test = ""\n..._test_split(X, Y, test_size=0.25,random_state=1)', u'X_train, X_test, Y_train, Y_test = splitData(df)', u'def RandomPredictSearch(header, clf, param_dic...eader, params)\n    \n    return pred_df, params', u'def getParamRandom(param_list, param_names):\n...(param_list[i],  1)[0] \n    return dictionnaire', u"import pickle\n\ndef DumpObject(name, Object):...mon_depickler.load()\n    return object_recupere", u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_DTC = perdictProb("DTC", clf_DTC, params_DTC, 1)\')', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', u"get_ipython().magic(u'pinfo RandomizedSearchCV')", u'def RandomPredictSearch(header, clf, param_dic...eader, params)\n    \n    return pred_df, params', u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', ...], 'Lasso': <class 'sklearn.linear_model.coordinate_descent.Lasso'>, 'LinearRegression': <class 'sklearn.linear_model.base.LinearRegression'>, ...})
   1172             st = clock2()
   1173             out = eval(code, glob, local_ns)
   1174             end = clock2()
   1175         else:
   1176             st = clock2()
-> 1177             exec(code, glob, local_ns)
        code = <code object <module> at 0x113e20230, file "<timed exec>", line 1>
        glob = {'AdaBoostClassifier': <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>, 'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'DumpObject': <function DumpObject>, 'ExtraTreesClassifier': <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>, 'GradientBoostingClassifier': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>, 'GradientBoostingRegressor': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', u"import numpy as np\nimport pandas as pd\nfrom ...port randint as sp_randint\nfrom random import *", u'#Chargement du fichier DataClean.csv dans le dataframe df\ndf = pd.read_csv("DataClean.csv")', u"#Selection des 20 premier variables : Random F... 'num_med_var45_ult3',\n 'num_var4',\n 'TARGET']", u'df=df.copy()[SelectedFeatures]', u'df.head()', u'#Y, X, X_train, X_test, Y_train, Y_test = ""\n..._test_split(X, Y, test_size=0.25,random_state=1)', u'X_train, X_test, Y_train, Y_test = splitData(df)', u'def RandomPredictSearch(header, clf, param_dic...eader, params)\n    \n    return pred_df, params', u'def getParamRandom(param_list, param_names):\n...(param_list[i],  1)[0] \n    return dictionnaire', u"import pickle\n\ndef DumpObject(name, Object):...mon_depickler.load()\n    return object_recupere", u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_DTC = perdictProb("DTC", clf_DTC, params_DTC, 1)\')', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', u"get_ipython().magic(u'pinfo RandomizedSearchCV')", u'def RandomPredictSearch(header, clf, param_dic...eader, params)\n    \n    return pred_df, params', u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', ...], 'Lasso': <class 'sklearn.linear_model.coordinate_descent.Lasso'>, 'LinearRegression': <class 'sklearn.linear_model.base.LinearRegression'>, ...}
        local_ns = {'AdaBoostClassifier': <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>, 'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'DumpObject': <function DumpObject>, 'ExtraTreesClassifier': <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>, 'GradientBoostingClassifier': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>, 'GradientBoostingRegressor': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', u"import numpy as np\nimport pandas as pd\nfrom ...port randint as sp_randint\nfrom random import *", u'#Chargement du fichier DataClean.csv dans le dataframe df\ndf = pd.read_csv("DataClean.csv")', u"#Selection des 20 premier variables : Random F... 'num_med_var45_ult3',\n 'num_var4',\n 'TARGET']", u'df=df.copy()[SelectedFeatures]', u'df.head()', u'#Y, X, X_train, X_test, Y_train, Y_test = ""\n..._test_split(X, Y, test_size=0.25,random_state=1)', u'X_train, X_test, Y_train, Y_test = splitData(df)', u'def RandomPredictSearch(header, clf, param_dic...eader, params)\n    \n    return pred_df, params', u'def getParamRandom(param_list, param_names):\n...(param_list[i],  1)[0] \n    return dictionnaire', u"import pickle\n\ndef DumpObject(name, Object):...mon_depickler.load()\n    return object_recupere", u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_DTC = perdictProb("DTC", clf_DTC, params_DTC, 1)\')', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', u"get_ipython().magic(u'pinfo RandomizedSearchCV')", u'def RandomPredictSearch(header, clf, param_dic...eader, params)\n    \n    return pred_df, params', u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', u'params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,...(1,100,7)}\n\nclf_DTC = DecisionTreeClassifier()', u'get_ipython().magic(u\'time preds_DTC, params_...mPredictSearch("DTC", clf_DTC, params_DTC, 1)\')', ...], 'Lasso': <class 'sklearn.linear_model.coordinate_descent.Lasso'>, 'LinearRegression': <class 'sklearn.linear_model.base.LinearRegression'>, ...}
   1178             end = clock2()
   1179             out = None
   1180         wall_end = wtime()
   1181         # Compute actual times and report

...........................................................................
/Users/salaheddine/Desktop/S3/Etude_de_cas/Projet2_Santander_Customer_Satisfaction/<timed exec> in <module>()
----> 1 
      2 
      3 
      4 
      5 
      6 
      7 
      8 
      9 
     10 

...........................................................................
/Users/salaheddine/Desktop/S3/Etude_de_cas/Projet2_Santander_Customer_Satisfaction/<ipython-input-15-b025348c3c6d> in RandomPredictSearch(header='DTC', clf=DecisionTreeClassifier(class_weight=None, criter...resort=False, random_state=None, splitter='best'), param_dic={'criterion': ['gini', 'entropy'], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, None], 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object>, 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object>, 'n_estimators': [50, 119, 188, 257, 326, 395, 464, 533, 602, 671, 740, 809, 878, 947], 'random_state': [1, 8, 15, 22, 29, 36, 43, 50, 57, 64, 71, 78, 85, 92, 99]}, n_iteration=1, metric='roc_auc', Xt=               var38  ind_var30  var15  saldo_me...         6         2  

[57015 rows x 20 columns], Yt=62211    0
18312    0
71044    0
38794    0
2029...
50057    0
5192     0
Name: TARGET, dtype: int64, Xv=               var38  ind_var30  var15  saldo_me...         9         1  

[19005 rows x 20 columns], n_iter_search=3)
      2     pred_df=pd.DataFrame()
      3     params=[]
      4     for t in xrange(n_iteration):
      5         # run randomized search
      6         random_search = RandomizedSearchCV(clf, param_distributions=param_dic, n_iter=n_iter_search, scoring=metric, n_jobs=-1)
----> 7         random_search.fit(Xt, Yt)
      8         #Prediction de X_test:
      9         pred_df[header+`t`]=random_search.predict_proba(Xv)[:,1]
     10         #Sauvegarder les parametres utiliser:
     11         params.append(random_search.best_params_)

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/sklearn/model_selection/_search.py in fit(self=RandomizedSearchCV(cv=None, error_score='raise',...n_train_score=True, scoring='roc_auc', verbose=0), X=               var38  ind_var30  var15  saldo_me...         6         2  

[57015 rows x 20 columns], y=62211    0
18312    0
71044    0
38794    0
2029...
50057    0
5192     0
Name: TARGET, dtype: int64, groups=None)
   1185             train/test set.
   1186         """
   1187         sampled_params = ParameterSampler(self.param_distributions,
   1188                                           self.n_iter,
   1189                                           random_state=self.random_state)
-> 1190         return self._fit(X, y, groups, sampled_params)
        self._fit = <bound method RandomizedSearchCV._fit of Randomi..._train_score=True, scoring='roc_auc', verbose=0)>
        X =                var38  ind_var30  var15  saldo_me...         6         2  

[57015 rows x 20 columns]
        y = 62211    0
18312    0
71044    0
38794    0
2029...
50057    0
5192     0
Name: TARGET, dtype: int64
        groups = None
        sampled_params = <sklearn.model_selection._search.ParameterSampler object>
   1191 
   1192 
   1193 
   1194 

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/sklearn/model_selection/_search.py in _fit(self=RandomizedSearchCV(cv=None, error_score='raise',...n_train_score=True, scoring='roc_auc', verbose=0), X=               var38  ind_var30  var15  saldo_me...         6         2  

[57015 rows x 20 columns], y=62211    0
18312    0
71044    0
38794    0
2029...
50057    0
5192     0
Name: TARGET, dtype: int64, groups=None, parameter_iterable=<sklearn.model_selection._search.ParameterSampler object>)
    559                                   fit_params=self.fit_params,
    560                                   return_train_score=self.return_train_score,
    561                                   return_n_test_samples=True,
    562                                   return_times=True, return_parameters=True,
    563                                   error_score=self.error_score)
--> 564           for parameters in parameter_iterable
        parameters = undefined
        parameter_iterable = <sklearn.model_selection._search.ParameterSampler object>
    565           for train, test in cv_iter)
    566 
    567         # if one choose to see train score, "out" will contain train score info
    568         if self.return_train_score:

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object <genexpr>>)
    763             if pre_dispatch == "all" or n_jobs == 1:
    764                 # The iterable was consumed all at once by the above for loop.
    765                 # No need to wait for async callbacks to trigger to
    766                 # consumption.
    767                 self._iterating = False
--> 768             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    769             # Make sure that we get a last message telling us we are done
    770             elapsed_time = time.time() - self._start_time
    771             self._print('Done %3i out of %3i | elapsed: %s finished',
    772                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Thu Dec  8 01:12:10 2016
PID: 27640            Python 2.7.12: /Users/salaheddine/anaconda/bin/python
...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (DecisionTreeClassifier(class_weight=None, criter...resort=False, random_state=None, splitter='best'),                var38  ind_var30  var15  saldo_me...         6         2  

[57015 rows x 20 columns], 62211    0
18312    0
71044    0
38794    0
2029...
50057    0
5192     0
Name: TARGET, dtype: int64, make_scorer(roc_auc_score, needs_threshold=True), array([18983, 18984, 18985, ..., 57012, 57013, 57014]), array([    0,     1,     2, ..., 19415, 19490, 19495]), 0, {'criterion': 'entropy', 'max_depth': 13, 'max_features': 5, 'min_samples_leaf': 4, 'min_samples_split': 13, 'n_estimators': 395, 'random_state': 36})
        kwargs = {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': True, 'return_times': True, 'return_train_score': True}
        self.items = [(<function _fit_and_score>, (DecisionTreeClassifier(class_weight=None, criter...resort=False, random_state=None, splitter='best'),                var38  ind_var30  var15  saldo_me...         6         2  

[57015 rows x 20 columns], 62211    0
18312    0
71044    0
38794    0
2029...
50057    0
5192     0
Name: TARGET, dtype: int64, make_scorer(roc_auc_score, needs_threshold=True), array([18983, 18984, 18985, ..., 57012, 57013, 57014]), array([    0,     1,     2, ..., 19415, 19490, 19495]), 0, {'criterion': 'entropy', 'max_depth': 13, 'max_features': 5, 'min_samples_leaf': 4, 'min_samples_split': 13, 'n_estimators': 395, 'random_state': 36}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': True, 'return_times': True, 'return_train_score': True})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator=DecisionTreeClassifier(class_weight=None, criter...resort=False, random_state=None, splitter='best'), X=               var38  ind_var30  var15  saldo_me...         6         2  

[57015 rows x 20 columns], y=62211    0
18312    0
71044    0
38794    0
2029...
50057    0
5192     0
Name: TARGET, dtype: int64, scorer=make_scorer(roc_auc_score, needs_threshold=True), train=array([18983, 18984, 18985, ..., 57012, 57013, 57014]), test=array([    0,     1,     2, ..., 19415, 19490, 19495]), verbose=0, parameters={'criterion': 'entropy', 'max_depth': 13, 'max_features': 5, 'min_samples_leaf': 4, 'min_samples_split': 13, 'n_estimators': 395, 'random_state': 36}, fit_params={}, return_train_score=True, return_parameters=True, return_n_test_samples=True, return_times=True, error_score='raise')
    222     fit_params = fit_params if fit_params is not None else {}
    223     fit_params = dict([(k, _index_param_value(X, v, train))
    224                       for k, v in fit_params.items()])
    225 
    226     if parameters is not None:
--> 227         estimator.set_params(**parameters)
        estimator.set_params = <bound method DecisionTreeClassifier.set_params ...esort=False, random_state=None, splitter='best')>
        parameters = {'criterion': 'entropy', 'max_depth': 13, 'max_features': 5, 'min_samples_leaf': 4, 'min_samples_split': 13, 'n_estimators': 395, 'random_state': 36}
    228 
    229     start_time = time.time()
    230 
    231     X_train, y_train = _safe_split(estimator, X, y, train)

...........................................................................
/Users/salaheddine/anaconda/lib/python2.7/site-packages/sklearn/base.py in set_params(self=DecisionTreeClassifier(class_weight=None, criter...resort=False, random_state=None, splitter='best'), **params={'criterion': 'entropy', 'max_depth': 13, 'max_features': 5, 'min_samples_leaf': 4, 'min_samples_split': 13, 'n_estimators': 395, 'random_state': 36})
    286                 # simple objects case
    287                 if key not in valid_params:
    288                     raise ValueError('Invalid parameter %s for estimator %s. '
    289                                      'Check the list of available parameters '
    290                                      'with `estimator.get_params().keys()`.' %
--> 291                                      (key, self.__class__.__name__))
        key = 'n_estimators'
        self.__class__.__name__ = 'DecisionTreeClassifier'
    292                 setattr(self, key, value)
    293         return self
    294 
    295     def __repr__(self):

ValueError: Invalid parameter n_estimators for estimator DecisionTreeClassifier. Check the list of available parameters with `estimator.get_params().keys()`.
___________________________________________________________________________

In [None]:
params_DTC = {"max_depth": [1,2,3,4,5,6,7,8,9,10,11,12,13,14, None],
              "max_features": sp_randint(1, 20),
              "min_samples_split": sp_randint(2, 20),
              "min_samples_leaf": sp_randint(1, 20),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"],
              "n_estimators" : range(50,999,69),
              "random_state" : range(1,100,7)}


## RandomForestClassifier  x100

In [16]:
max_depth_list = [1,2,3,4,5,6,7,8,9,10,11,12,13,14, None]
max_features_list = range(1,21)
min_samples_split_list = range(2,21)
min_samples_leaf_list = range(1,21)
bootstrap_list = [True, False]
criterion_list = ["gini", "entropy"]
######
param_list_RFC = [max_depth_list, max_features_list, min_samples_split_list, min_samples_leaf_list, bootstrap_list, criterion_list]
param_names_RFC = ['max_depth', 'max_features', 'min_samples_split', 'min_samples_leaf', 'bootstrap', 'criterion' ]

In [17]:
clf_RFC = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

In [355]:
%time preds_RFC, params_RFC = perdictProb("RFC", clf_RFC, param_list_RFC, param_names_RFC, 100, X_train, Y_train, X_test)

CPU times: user 40min 13s, sys: 26.4 s, total: 40min 40s
Wall time: 12min 40s


## LogisticRegression x100

In [18]:
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=1)

In [19]:
#dual_list =[False, True]
tol_list =[0.00004,0.00002,0.00001,0.0001,0.0002,0.0004,0.0009,0.001,0.0014,0.002,0.01]
C_list =[0.9,1.0,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8]
fit_intercept_list =[False, True]
class_weight_list =[None, 'balanced']
#solver_list =['liblinear','newton-cg', 'lbfgs', 'sag']
max_iter_list =[50,90,100,120,160,200]
#multi_class_list =['ovr', 'multinomial']
warm_start_list =[False, True]
intercept_scaling_list=[0.9, 1.0, 1.11, 1.6, 2.0, 2.11]

######
param_list_LR = [tol_list, C_list, fit_intercept_list, class_weight_list, 
               max_iter_list, warm_start_list, intercept_scaling_list]
param_names_LR = ["tol", "C", "fit_intercept", "class_weight", 
                "max_iter", "warm_start", "intercept_scaling"]

In [409]:
%time preds_LR, params_LR = perdictProb("LR", clf_LR, param_list_LR, param_names_LR, 100, X_train, Y_train, X_test)

CPU times: user 3min 31s, sys: 1.91 s, total: 3min 33s
Wall time: 3min 6s


In [281]:
np.arange(0.00001,0.0001,0.00001,dtype=float)

array([  1.00000000e-05,   2.00000000e-05,   3.00000000e-05,
         4.00000000e-05,   5.00000000e-05,   6.00000000e-05,
         7.00000000e-05,   8.00000000e-05,   9.00000000e-05])

## Naive Bayes x100

In [20]:
alpha_list=[0.061, 0.05, 0.041, 0.04, 0.032, 0.02, 0.021, 1.0, 1.3, 1.6, 1.4, 1.8, 1.9, 2.0]
binarize_list=[2.0, 1.9, 0.6, 1.4, 1.2, 1.0, 0.5, 0.7, 0.0]
fit_prior_list=[False, True]

######
param_list_NB = [alpha_list, binarize_list, fit_prior_list]
param_names_NB = ["alpha", "binarize", "fit_prior"]

In [21]:
clf_NB = naive_bayes.BernoulliNB()

In [473]:
%time preds_NB, params_NB = perdictProb("NB", clf_NB, param_list_NB, param_names_NB, 100, X_train, Y_train, X_test)

CPU times: user 14.2 s, sys: 1.17 s, total: 15.3 s
Wall time: 10.8 s


In [474]:
#roc_auc_score(Y_test, preds_NB)

## KNN x100

In [22]:
from sklearn.neighbors import KNeighborsClassifier

In [23]:
n_neighbors_list=range(3,9)
weights_list=['uniform', 'distance']
#algorithm_list=['auto', 'ball_tree', 'kd_tree', 'brute']
leaf_size_list=[20,30,40]

######
param_list_KNC = [n_neighbors_list, weights_list, leaf_size_list]
param_names_KNC = ["n_neighbors", "weights", "leaf_size"]

In [24]:
df_header=[]
def perdictProbPerIter(header, clf, param_list, param_names, n_iter_search, X, Y, Xv, metric='roc_auc'):
    for t in xrange(n_iter_search):
        newdict = getParamRandom(param_list,param_names)
        for p in newdict.keys():
            clf.__setattr__(p, newdict[p])
        clf.fit(X,Y)
        #####
        df_header.append(clf.get_params())
        #####
        # enregistrement de l'iteration t...
        DumpObject("PerdictProbPerIter/df."+header+`t`, clf.predict_proba(Xv)[:,1])
    
    
     # enregistrement
    #DumpObject('Object_Parametre.'+header, df_header)
    
    #return df_header

In [25]:
clf_KNC = KNeighborsClassifier(n_jobs=-1)

In [121]:
%time preds_KNC, params_KNC = perdictProb("KNC", clf_KNC, param_list_KNC, param_names_KNC, 100, X_train, Y_train, X_test)

CPU times: user 4min 11s, sys: 1.95 s, total: 4min 13s
Wall time: 1min 50s


See also
--------
RadiusNeighborsClassifier
KNeighborsRegressor
RadiusNeighborsRegressor
NearestNeighbors


## ExtraTreesClassifier x100

In [26]:
max_depth_list = [1,2,3,4,5,6,7,8,9,10,11,12,13,14, None]
max_features_list = range(1,21)
min_samples_split_list = range(2,21)
min_samples_leaf_list = range(1,21)
bootstrap_list = [True, False]
criterion_list = ["gini", "entropy"]
n_estimators_list =[50, 60, 70, 80, 90, 100]
######
param_list_XT = [max_depth_list, max_features_list, min_samples_split_list, min_samples_leaf_list, bootstrap_list, criterion_list, n_estimators_list]
param_names_XT = ['max_depth', 'max_features', 'min_samples_split', 'min_samples_leaf', 'bootstrap', 'criterion', 'n_estimators' ]

In [27]:
clf_XT =ExtraTreesClassifier(n_jobs=-1, random_state=5)

In [71]:
%time preds_XT, params_XT = perdictProb("XT", clf_XT, param_list_XT, param_names_XT, 100, X_train, Y_train, X_test)

CPU times: user 8min 21s, sys: 17.1 s, total: 8min 38s
Wall time: 3min 27s


## SVM

In [28]:
from sklearn.svm import LinearSVC, SVC

In [29]:
C_list = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0]
gamma_list = ['auto', 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5]
kernel_list=['rbf', 'linear', 'poly', 'sigmoid', 'precomputed']
degree_list=range(3,7)
coef0_list=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
shrinking_list=[False, True]
tol_list=[0.001, 0.0001,0.0002,0.0004,0.0009,0.001,0.0014,0.002,0.01, 0.1]
class_weight_list=[None, 'balanced']
verbose_list=[False, True]
random_state_list =range(0,101,7)
######
param_list_SVC = [random_state_list, C_list, gamma_list, kernel_list, degree_list, coef0_list, shrinking_list, tol_list, class_weight_list, verbose_list]
param_names_SVC = ["random_state","C", "gamma", "kernel", "degree", "coef0", "shrinking", "tol", "class_weight", "verbose"]

In [30]:
clf_SVC = SVC(probability=True, random_state=1)

In [163]:
%time preds_SVC, params_SVC = perdictProb("SVC", clf_SVC, param_list_SVC, param_names_SVC, 3, X_train, Y_train, X_test)

CPU times: user 1h 32min 53s, sys: 41 s, total: 1h 33min 34s
Wall time: 1h 39min 56s


In [167]:
roc_auc_score(Y_test, preds_SVC)

0.50600400653257493

## LinearSVC

In [31]:
from sklearn.calibration import CalibratedClassifierCV

In [32]:
def perdictCalibratedProb(header, clf, param_list, param_names, n_iter_search, X, Y, Xv, metric='roc_auc'):
    pred_df=pd.DataFrame()
    df_header=[]
    
    for t in xrange(n_iter_search):
        newdict = getParamRandom(param_list,param_names)
        for p in newdict.keys():
            clf.__setattr__(p, newdict[p])
        clf.fit(X,Y)
        CCC=CalibratedClassifierCV(base_estimator=clf).fit(X,Y)
        #####
        df_header.append(clf.get_params())
        #####
        pred_df[header+`t`]=CCC.predict_proba(Xv)[:,1]
    
    # enregistrement ...
    pred_df.to_csv(header+".csv")
    DumpObject('Object_Parametre.'+header, df_header)
    
    return pred_df, df_header

In [33]:
C_list = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0]
tol_list=[0.001, 0.0001,0.0002,0.0004,0.0009,0.001,0.0014,0.002,0.01, 0.1]
class_weight_list=[None, 'balanced']
max_iter_list=range(1000,2000, 90)
random_state_list = range(1,100,7)
######
param_list_LSVC = [C_list, tol_list, class_weight_list, max_iter_list, random_state_list]
param_names_LSVC = ["C", "tol", "class_weight", "max_iter", "random_state"]

In [34]:
clf_LSVC = LinearSVC()

In [186]:
%time preds_LSVC, params_LSVC = perdictCalibratedProb("LSVC", clf_LSVC, param_list_LSVC, param_names_LSVC, 100, X_train, Y_train, X_test)

CPU times: user 1h 44min 36s, sys: 43.3 s, total: 1h 45min 19s
Wall time: 1h 44min 42s


In [182]:
roc_auc_score(Y_test, preds_LSVC)

0.5559422852251743

## GradientBoostingClassifier

In [35]:
max_depth_list = [1,2,3,4,5,6,7,8,9,10,11,12,13,14, None]
n_estimators_list =[50, 70, 80, 100, 140, 170]
subsample_list = [ 0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ]
max_features_list = range(1,19)
#warm_start_list = [False, True]
learning_rate_liste = [ 0.1 ,  0.26,  0.42,  0.58,  0.74,  0.9 ]
min_samples_split_list = range(2,21)
min_samples_leaf_list = range(1,21)
######
param_list_GBC = [max_depth_list, max_features_list, n_estimators_list, subsample_list, learning_rate_liste, min_samples_split_list, min_samples_leaf_list]
param_names_GBC = ['max_depth', 'max_features', 'n_estimators', 'subsample', 'learning_rate', 'min_samples_split', 'min_samples_leaf' ]

In [36]:
clf_GBC = GradientBoostingClassifier(random_state=1)

In [42]:
%time preds_GBC, params_GBC = perdictProb("GBC", clf_GBC, param_list_GBC, param_names_GBC, 100, X_train, Y_train, X_test)

CPU times: user 31min 27s, sys: 12.7 s, total: 31min 40s
Wall time: 49min 47s


In [163]:
listReplace=[]
for i in preds_GBC.columns.values:
    if pd.isnull(preds_GBC[i]).sum()!=0:
        print i,pd.isnull(preds_GBC[i]).sum()
        listReplace.append(i)
    
print listReplace

[]


In [37]:
def replacePredictNaN(header, clf, param_list, param_names, pred_df, df_header, listReplace, X, Y, Xv, metric='roc_auc'):
    
    for t in listReplace:
        newdict = getParamRandom(param_list,param_names)
        for p in newdict.keys():
            clf.__setattr__(p, newdict[p])
        clf.fit(X,Y)
        #####
        df_header[int(t.rsplit(header)[1])] = clf.get_params()
        #####
        pred_df[t]=clf.predict_proba(Xv)[:,1]
    
    return pred_df, df_header

In [155]:
%time pred_df_gbc, df_header_gbc = replacePredictNaN("GBC", clf_GBC, param_list_GBC, param_names_GBC, preds_GBC, params_GBC, listReplace, X_train, Y_train, X_test)

CPU times: user 1min 16s, sys: 1.24 s, total: 1min 17s
Wall time: 1min 18s


In [166]:
# enregistrement ...
pred_df_gbc.to_csv("GBC.csv")
DumpObject('Object_Parametre.'+"GBC", df_header_gbc)

## GradientBoostingRegressor

max_depth_list = [1,2,3,4,5,6,7,8,9,10,11,12,13,14, None]
n_estimators_list =[50, 70, 80, 100, 140, 170]
subsample_list = [ 0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ]
max_features_list = range(1,19)
alpha_list=np.arange(0.0,1.1,0.01)
learning_rate_liste = [ 0.1 ,  0.26,  0.42,  0.58,  0.74,  0.9 ]
min_samples_split_list = range(2,21)
min_samples_leaf_list = range(1,21)
random_state_list = range(1,100,7)
######
param_list_GBR = [random_state_list, alpha_list, max_depth_list, max_features_list, n_estimators_list, subsample_list, learning_rate_liste, min_samples_split_list, min_samples_leaf_list]
param_names_GBR = ['random_state','alpha', 'max_depth', 'max_features', 'n_estimators', 'subsample', 'learning_rate', 'min_samples_split', 'min_samples_leaf' ]

clf_GBR = GradientBoostingRegressor()
%time preds_GBR, params_GBR = perdictCalibratedProb("GBR", clf_GBR, param_list_GBR, param_names_GBR, 1, X_train, Y_train, X_test)

In [38]:
from sklearn.neural_network import MLPClassifier

In [39]:
hidden_layer_sizes_list =[(100, ),(200, ),(300, ),(400, ),(500, ),(600, ),(700, )]
activation_list =['identity', 'logistic', 'tanh', 'relu']
solver_list =['adam', 'lbfgs', 'sgd']
alpha_list =np.arange(0.0001,0.02,0.0003)
learning_rate_list =['constant', 'invscaling', 'adaptive']
learning_rate_init_list =np.arange(0.001,0.01,0.0003)
power_t_list =np.arange(0.4,0.7,0.1)
max_iter_list =range(200,500,60)
random_state_list =range(0,101,7)
tol_list =np.arange(0.00001,0.001,0.00003)
momentum_list =np.arange(0.7,1.0,0.1)
nesterovs_momentum_list =[True, False]
beta_1_list =np.arange(0.8,1.0,0.01)
beta_2_list =np.arange(0.9,1.0,0.001)
epsilon_list =np.arange(1e-09,1e-07,3e-09)
######
param_list_MLPC = [hidden_layer_sizes_list, activation_list, solver_list, alpha_list, learning_rate_list,
                  learning_rate_init_list, power_t_list, max_iter_list, random_state_list, tol_list, momentum_list,
                 nesterovs_momentum_list, beta_1_list, beta_2_list, epsilon_list]
param_names_MLPC = ['hidden_layer_sizes', 'activation', 'solver', 'alpha', 'learning_rate',
 'learning_rate_init', 'power_t', 'max_iter', 'random_state', 'tol', 
 'momentum', 'nesterovs_momentum', 'beta_1', 'beta_2', 'epsilon' ]

In [40]:
clf_MLPC = MLPClassifier()

In [33]:
%time preds_MLPC, params_MLPC = perdictProb("MLPC", clf_MLPC, param_list_MLPC, param_names_MLPC, 100, X_train, Y_train, X_test)



CPU times: user 2h 4min 13s, sys: 22min 38s, total: 2h 26min 52s
Wall time: 4h 33min 26s


In [28]:
roc_auc_score(Y_test, preds_MLPC)

0.50071729573568668

In [14]:
preds_MLPC = pd.read_csv("MLPC.csv")
params_MLPC = LoadObject("Object_Parametre.MLPC")

In [23]:
listReplace=[]
for i in preds_MLPC.columns.values:
    if pd.isnull(preds_MLPC[i]).sum()!=0:
        print i,pd.isnull(preds_MLPC[i]).sum()
        listReplace.append(i)
    
print listReplace

[]


In [22]:
%time pred_df_MLPC, df_header_MLPC = replacePredictNaN("MLPC", clf_MLPC, param_list_MLPC, param_names_MLPC, preds_MLPC, params_MLPC, listReplace, X_train, Y_train, X_test)

CPU times: user 13min 16s, sys: 2min 26s, total: 15min 42s
Wall time: 9min 11s


In [24]:
# enregistrement ...
pred_df_MLPC.to_csv("MLPC.csv")
DumpObject('Object_Parametre.'+"MLPC", df_header_MLPC)

## SGDClassifier

In [41]:
from sklearn.linear_model import SGDClassifier

In [42]:
loss_list = ['log', 'modified_huber']
penalty_list = ['elasticnet']
alpha_list = [0.0001, 0.001, 0.01, 0.1]
learning_rate_list = ['constant', 'optimal']
n_iter_list = [2, 5, 7, 10]
eta0_list = [0.001, 0.01, 0.1]
class_weight_list=[None, "balanced"]

######
param_list_SGDC = [ penalty_list, alpha_list, learning_rate_list, n_iter_list, eta0_list, loss_list, class_weight_list]
param_names_SGDC = [ 'penalty', 'alpha', 'learning_rate', 'n_iter', 'eta0', 'loss', 'class_weight']

In [43]:
clf_SGDC = SGDClassifier(n_jobs=-1, random_state=1)

In [139]:
%time preds_SGDC, params_SGDC = perdictProb("SGDC", clf_SGDC, param_list_SGDC, param_names_SGDC, 100, X_train, Y_train, X_test)

CPU times: user 30.1 s, sys: 900 ms, total: 31.1 s
Wall time: 18.8 s


In [142]:
roc_auc_score(Y_test, preds_SGDC.SGDC14)

0.51412188352489807

# Combiner toutes les prédictons

In [44]:
AllModeles = ["DTC","KNC", "LR", "LSVC", "NB", "RFC", "SVC", "XT", "GBC", "MLPC", "SGDC"]
AllData = pd.read_csv("prediction/DTC.csv", index_col=0)
for m in AllModeles[1:]:
    print m
    temp = pd.read_csv("prediction/"+m+".csv", index_col=0)
    temp.columns.values
    AllData = pd.DataFrame( data=np.hstack((AllData, temp)) , columns=AllData.columns.values.tolist()+temp.columns.values.tolist() )
# enregistrement ...
AllData.to_csv("prediction/AllPredictionData.csv")

KNC
LR
LSVC
NB
RFC
SVC
XT
GBC
MLPC
SGDC


OmarPreds = pd.read_csv("prediction/FinalPredectionKNeighborsClassifier.csv")
AllData_Sr_Os = pd.DataFrame( data=np.hstack((AllData, OmarPreds)) , columns=AllData.columns.values.tolist()+OmarPreds.columns.values.tolist() )
AnouarPreds = pd.read_csv("prediction/all_prob_Classifier.csv")
AllData_Sr_Os_Aj = pd.DataFrame( data=np.hstack((AllData_Sr_Os, AnouarPreds)) , columns=AllData_Sr_Os.columns.values.tolist()+AnouarPreds.columns.values.tolist() )
#enregistrement ...
AllData_Sr_Os_Aj.to_csv("prediction/AllData_Sr_Os_Aj.csv")

In [45]:
AllData_Sr_Os_Aj=pd.read_csv("prediction/AllData_Sr_Os_Aj.csv", index_col=0)

## Split All Prediction Data to 50% x 50%

In [206]:
taille = int(len(Y_test)*0.5)
HalfPD1=pd.DataFrame()
HalfPD2=pd.DataFrame()

D1 = AllData_Sr_Os_Aj.head(1+taille) 
TY1 = pd.DataFrame(Y_test).head(1+taille)

HalfPD1 = pd.DataFrame( data=np.hstack((D1, TY1))
                       , columns=AllData_Sr_Os_Aj.columns.values.tolist()+TY1.columns.values.tolist() )


D2 = AllData_Sr_Os_Aj.tail(taille)
TY2 = pd.DataFrame(Y_test).tail(taille)

HalfPD2 = pd.DataFrame( data=np.hstack((D2, TY2))
                       , columns=AllData_Sr_Os_Aj.columns.values.tolist()+TY2.columns.values.tolist() )

HalfPD1.to_csv("prediction/HalfPD1.csv")
HalfPD2.to_csv("prediction/HalfPD2.csv")

In [211]:
print len(HalfPD2)+ len(HalfPD1)

19005


## Create a New Prediction data from last prediction by using this Stackers ["LR", "RFC", "XT", "GBC", "NB"]

In [46]:
def getBags(DFpred, Size_Bag):
    return sample(DFpred.columns.values, Size_Bag)

In [47]:
from sklearn.base import clone

def predBags(df_predict, Size_Bag, Yv=Y_test):
    stackers = ["LR", "RFC", "XT", "GBC", "NB"]
    NewPred = pd.DataFrame()
    taille = int(len(Yv)*0.5)
    for m in stackers:
        print m
        for t in range(40):
            tempDF = df_predict[getBags(df_predict, Size_Bag)]
            Half1 = tempDF.head(1+taille)
            Half2 = tempDF.tail(taille)
            stacker1 = getStacker(m)
            stacker2 = clone(stacker1)
            stacker1.fit( Half1, Yv.head(1+taille))
            stacker2.fit( Half2, Yv.head(taille))
            #stacker1.predict_proba(tempDF.tail(taille))[:,1]
            #stacker2.predict_proba(tempDF.head(1+taille))[:,1]
            NewPred["stacker_"+m+`t`] = np.array(stacker2.predict_proba(Half1)[:,1].tolist() + stacker1.predict_proba(Half2)[:,1].tolist())
            
    return NewPred

def ExtraPredBags(df_predict, Size_Bag, Yv=Y_test, xtra=3):
    stackers = ["LR", "RFC", "XT", "GBC", "NB"]
    NewPred = pd.DataFrame()
    taille = int(len(Yv)*0.5)
    for m in stackers:
        print m
        for t in range(40):
            dataList = np.zeros_like(Yv)
            tempDF = df_predict[getBags(df_predict, Size_Bag)]
            Half1 = tempDF.head(1+taille)
            Half2 = tempDF.tail(taille)
            for x in range(xtra):
                stacker1 = getStacker(m)
                stacker2 = clone(stacker1)
                stacker1.fit( Half1, Yv.head(1+taille))
                stacker2.fit( Half2, Yv.head(taille))
                #stacker1.predict_proba(tempDF.tail(taille))[:,1]
                #stacker2.predict_proba(tempDF.head(1+taille))[:,1]
                dataList = dataList + np.array(stacker2.predict_proba(Half1)[:,1].tolist() + stacker1.predict_proba(Half2)[:,1].tolist())
            NewPred["stacker_"+m+`t`] = dataList/xtra
            
    return NewPred

def BestPredBags(df_predict, Size_Bag, Yv=Y_test, best=3):
    stackers = ["LR", "RFC", "XT", "GBC", "NB"]
    NewPred = pd.DataFrame()
    taille = int(len(Yv)*0.5)
    for m in stackers:
        print m
        for t in range(40):
            bestList = np.array([])
            score = 0
            tempDF = df_predict[getBags(df_predict, Size_Bag)]
            Half1 = tempDF.head(1+taille)
            Half2 = tempDF.tail(taille)
            for x in range(best):
                stacker1 = getStacker(m)
                stacker2 = clone(stacker1)
                stacker1.fit( Half1, Yv.head(1+taille))
                stacker2.fit( Half2, Yv.head(taille))
                dataList = np.array(stacker2.predict_proba(Half1)[:,1].tolist() + stacker1.predict_proba(Half2)[:,1].tolist())
                if roc_auc_score(Yv, dataList) > score:
                    bestList=dataList
            
            NewPred["stacker_"+m+`t`] = bestList
            
    return NewPred

In [48]:
def getStacker(nom):
    if nom=="LR":
        stacker = linear_model.LogisticRegression(n_jobs=-1, random_state=1)
        newdict = getParamRandom(param_list_LR,param_names_LR)
        for p in newdict.keys():
            stacker.__setattr__(p, newdict[p])
        return stacker
    
    elif nom=="RFC":
        stacker = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
        newdict = getParamRandom(param_list_RFC,param_names_RFC)
        for p in newdict.keys():
            stacker.__setattr__(p, newdict[p])
        return stacker
    
    elif nom=="XT":
        stacker = ExtraTreesClassifier(n_jobs=-1, random_state=5)
        newdict = getParamRandom(param_list_XT,param_names_XT)
        for p in newdict.keys():
            stacker.__setattr__(p, newdict[p])
        return stacker
    
    elif nom=="GBC":
        stacker = GradientBoostingClassifier(random_state=1)
        newdict = getParamRandom(param_list_GBC,param_names_GBC)
        for p in newdict.keys():
            stacker.__setattr__(p, newdict[p])
        return stacker
    
    elif nom=="NB":
        stacker = naive_bayes.BernoulliNB()
        newdict = getParamRandom(param_list_NB,param_names_NB)
        for p in newdict.keys():
            stacker.__setattr__(p, newdict[p])
        return stacker
        


In [75]:
%time NewPred200 = predBags(AllData_Sr_Os_Aj, 200)

LR
RFC
XT
GBC
NB
CPU times: user 16min 44s, sys: 24.3 s, total: 17min 9s
Wall time: 9min 50s


In [83]:
NewPred200.to_csv("prediction/NewPred200.csv")

In [47]:
%time ExtraPred200 = ExtraPredBags(AllData_Sr_Os_Aj, Size_Bag=200, xtra=10)

LR
RFC
XT
GBC
NB
CPU times: user 3h 42s, sys: 3min 11s, total: 3h 3min 53s
Wall time: 1h 42min 3s


In [48]:
ExtraPred200.to_csv("prediction/ExtraPred200.csv")

In [93]:
%time BestPred200 = BestPredBags(AllData_Sr_Os_Aj, Size_Bag=200, best=10)

LR
RFC
XT
GBC
NB
CPU times: user 3h 1min 36s, sys: 3min 41s, total: 3h 5min 18s
Wall time: 1h 46min 53s


In [94]:
BestPred200.to_csv("prediction/BestPred200.csv")

#### NewPred200 : 0.773521991717 : stacker_XT14
#### ExtraPred200 : 0.753228160753 : stacker_XT25
#### BestPred200 : 0.766892376898 : stacker_NB15

In [50]:
def bestScore(DataF):
    score = 0
    emp = ''
    for i in DataF.columns.values:
        score_temp = roc_auc_score(Y_test,DataF[i])
        if score<score_temp:
            score = score_temp
            emp = i
    print score, emp

In [101]:
bestScore(ExtraPred200)

0.753228160753 stacker_XT25


In [95]:
od1=pd.read_csv("FinalEnsembleCSVExtraThreeLinear.csv")
od1.drop(["StackExtraTreesClassifier:0","StackExtraTreesClassifier:1"], axis=1, inplace=True)
od2=pd.read_csv("FinalEnsembleLogisticRegression.csv")
od3=pd.read_csv("GMBLASTCSV.csv")
omar = pd.concat([od1, od2, od3], axis=1 )


In [77]:
od4=pd.read_csv("all_prob_Classifier_Stack.csv")

0.837192198157 ET_probstack32


In [97]:
omar_anouar =pd.concat([omar, od4], axis=1 )

In [99]:
%time Scores_list, selected_col = ForWardSelection(omar_anouar, 500,iter=100)

Completed  .....  0.0 %
_______________________
Start :  0.778346855857
End   :  0.782967155437 
Completed  .....  1.0 %
_______________________
Start :  0.829917435104
End   :  0.833893319655 
Completed  .....  11.0 %
_______________________
Start :  0.830899784921
End   :  0.839875036715 
CPU times: user 4min 14s, sys: 21.5 s, total: 4min 36s
Wall time: 4min 22s


In [90]:
print selected_col[-1]

[9, 165, 235, 197, 178, 0, 143, 127, 127, 119, 119, 119, 116, 116, 125]


In [None]:
omar
##############
Start :  0.837383748306
End   :  0.841068989875 
[13, 19, 19, 13, 30, 28, 22, 22, 25, 7, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7,
 7, 7, 7, 7, 2, 2, 2, 2, 7, 7, 7, 7, 7, 7, 7, 7, 7, 2, 7, 7, 7, 7, 2, 2, 2,
 7, 7, 7, 7, 2, 2, 2, 2, 7, 7, 2, 2, 7, 2, 2, 2, 2, 7, 7, 7, 7, 7, 7, 7, 7,
 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 2, 2, 2]

omar_anouar
##############
Start :  0.502424859917
End   :  0.841547333673 
[9, 165, 235, 197, 178, 0, 143, 127, 127, 119, 119, 119, 116, 116, 125]

## selection with remplacement 

In [257]:
#NewPred200=pd.read_csv("prediction/NewPred200.csv", index_col=0)
#ExtraPred200=pd.read_csv("prediction/ExtraPred200.csv", index_col=0)
#BestPred200=pd.read_csv("prediction/BestPred200.csv", index_col=0)
AllPredictionData=pd.read_csv("prediction/AllPredictionData.csv", index_col=0)

In [49]:
def ForWardSelection(DataF, nb_mod, iter=20):
    Scores_top = [[0,0]]
    selected_col_list = []
    for it in range(iter):
        Scores = []
        selected = []
        comb = 0
        for i in range(3):
            col = sample(DataF.columns.values, 1)[0]
            #col = "stacker_XT14"
            comb = comb + DataF[col]
            selected.append(col)
        Scores.append(roc_auc_score(Y_test, comb/len(selected)))

        #Scores = [0]
        #selected = []
        #comb = 0

        #print "Start : ", Scores[len(Scores)-1]  
        for nb in range(nb_mod):
            col = sample(DataF.columns.values, 1)[0]
            if len(selected)==0:
                temp = (DataF[col]+comb)
            else:
                temp = (DataF[col]+comb)/len(selected)

            score_temp = roc_auc_score(Y_test, temp)
            if Scores[len(Scores)-1]<=score_temp:
                Scores.append(score_temp)
                selected.append(col)
                comb = comb + temp
        #print "End   : ", Scores[len(Scores)-1],"\n============================="
        if Scores[len(Scores)-1]>Scores_top[len(Scores_top)-1][1]:
            Scores_top.append([Scores[0], Scores[len(Scores)-1]])
            selected_col_list.append(selected)
            print "Completed  ..... ", ((it+0.0)/iter)*100.0, "%\n_______________________"
            print "Start : ", Scores[0]
            print "End   : ", Scores[len(Scores)-1],"\n============================="
        
    return Scores_top, selected_col_list
    

In [363]:
tempData = pd.DataFrame( data=np.hstack((NewPred200, ExtraPred200, BestPred200)) )

In [368]:
%time Scores_list, selected_col = ForWardSelection(tempData, 2000,iter=100)

Completed  .....  0.0 %
_____________________
Start :  0.727320095733
End   :  0.779578129218 
Completed  .....  3.0 %
_____________________
Start :  0.726372059971
End   :  0.782363066756 
Completed  .....  11.0 %
_____________________
Start :  0.731823632207
End   :  0.784136839467 
Completed  .....  21.0 %
_____________________
Start :  0.728412499819
End   :  0.786090174405 
Completed  .....  26.0 %
_____________________
Start :  0.7607042256
End   :  0.789740075429 
CPU times: user 15min 57s, sys: 1min 21s, total: 17min 18s
Wall time: 16min 28s


In [337]:
selected_col[-1]

["('RandoFor_prob', 12)",
 "('DT_pr', 230)",
 'RFC3',
 'RFClassifier:336',
 'SVR:16',
 "('RandoFor_prob', 231)",
 'RFClassifier:38',
 'GBMClassifier:67',
 "('RandoFor_prob', 5)",
 'RFClassifier:6',
 'KNC98',
 "('RandoFor_prob', 45)",
 'RFC55',
 'KNC17',
 'LR17',
 'GBMClassifier:13',
 'RFClassifier:55',
 'GBMClassifier:143',
 'RFClassifier:8',
 'RFClassifier:8',
 'GBMClassifier:71',
 'GBMClassifier:112']

#### AllPredictionData
    
    ###############
    Scores
    Start :  0.83540137839853967
    End   :  0.83957530197468599
    
    Selected_col
    ['RFC72', 'RFC70', 'LSVC55', 'SGDC68', 'GBC29', 'GBC58', 'DTC51', 'XT53', 'DTC59', 'DTC92', 'DTC92', 'DTC21', 'GBC34', 'GBC34', 'RFC69', 'XT22', 'XT13', 'DTC15', 'RFC55', 'XT53', 'XT53', 'DTC56', 'GBC2', 'DTC37', 'KNC86', 'KNC57', 'KNC29', 'GBC51']
    
#### AllData_Sr_Os_Aj
    
    ###############
    Scores
    Start :  0.830148431752
    End   :  0.840173231723
    
    Selected_col
    ["('RandoFor_prob', 12)", "('DT_pr', 230)", 'RFC3', 'RFClassifier:336', 'SVR:16', "('RandoFor_prob', 231)", 'RFClassifier:38', 'GBMClassifier:67', "('RandoFor_prob', 5)", 'RFClassifier:6', 'KNC98', "('RandoFor_prob', 45)", 'RFC55', 'KNC17', 'LR17', 'GBMClassifier:13', 'RFClassifier:55', 'GBMClassifier:143', 'RFClassifier:8', 'RFClassifier:8', 'GBMClassifier:71', 'GBMClassifier:112']
    

In [26]:
import xgboost as xgb