In [1]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import (AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics, linear_model, naive_bayes
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import OneHotEncoder 
from scipy import sparse
from mlxtend.classifier import StackingClassifier
from sklearn.cross_validation import train_test_split

In [2]:
#Chargement du fichier train.csv dans le dataframe df
df = pd.read_csv("train.csv")

In [3]:
#fonction de prédiction sur le dataset de validation
def validation(clf1, Xv, Yv):
    preds = clf1.predict_proba(Xv)[:,1]
    return roc_auc_score(Yv, preds)

In [4]:
#regroupe les modalités des variables rares, pour un seuil donné 
def regroupe(df, column, seuil):
    df_temp=df.copy()
    if seuil==0:
        return df_temp
    else:
        #if (df_temp[column].count()*2.5/100) <= (np.array(df_temp[column].value_counts()==seuil ,dtype=int).sum()):
        #Valeurs les plus communes
        ss = pd.DataFrame(data=df_temp[column].value_counts())
        selected = ss[column][ss[column]<=seuil]
        column_num = df_temp.columns.get_loc(column)
        for x in np.array(selected.index) :
            for line_num in df_temp[column][df_temp[column]==x].index.values:
                df_temp.set_value(line_num, column_num, np.array(selected.index)[0], takeable=True)
        #print column,"seuil=",seuil
        return df_temp

## Echantillonnage de données

In [5]:
def splitData(df):
    Y = df.ACTION
    X = df.drop(['ACTION'], axis=1)

    # diviser X et Y en training and testing
    from sklearn.cross_validation import train_test_split
    return  train_test_split(X, Y, test_size=0.25,random_state=1)

## <span style="color:#b36c8f">On supprime la colonne 'ROLE_CODE' et on garde 'ROLE_TITLE' <br/>(celle qui nous donne une meilleure score)</span>

In [6]:
var=['RESOURCE', 'MGR_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2','ROLE_DEPTNAME', 'ROLE_TITLE', 'ROLE_FAMILY_DESC', 'ROLE_FAMILY']

<br/><br/><br/>

## RandomForestClassifier

##### Reduction du modalités des colonnes de DataFrame en fonction du seuil  

In [116]:
temp = regroupe(df, "RESOURCE", 1)
temp = regroupe(temp, "ROLE_TITLE", 0)
temp = regroupe(temp, "MGR_ID", 1)
temp = regroupe(temp, "ROLE_ROLLUP_2", 2)
temp = regroupe(temp, "ROLE_FAMILY_DESC", 24)
temp = regroupe(temp, "ROLE_FAMILY", 16)
temp = regroupe(temp, "ROLE_DEPTNAME", 52)
temp = regroupe(temp, "ROLE_ROLLUP_1", 5)

Xr_train, Xr_test, Yr_train, Yr_test = splitData(temp)

In [117]:
modele_RF = RandomForestClassifier(n_estimators=1040, min_samples_split=9, n_jobs=-1, random_state=42)
%time modele_RF.fit(Xr_train[var], Yr_train)

CPU times: user 44.1 s, sys: 735 ms, total: 44.8 s
Wall time: 16.3 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=9,
            min_weight_fraction_leaf=0.0, n_estimators=1040, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [118]:
print "Resultat d'apprentissage : ",validation(modele_RF, Xr_train[var], Yr_train)
print "Resultat de validation : ",validation(modele_RF, Xr_test[var], Yr_test)

Resultat d'apprentissage :  0.993743210163
Resultat de validation :  0.873741614254


<br/><br/><br/>

## AdaBoostClassifier      ---

In [119]:
modele_RFC = RandomForestClassifier(n_estimators=1000, min_samples_split=9, n_jobs=-1, random_state=42)
modele_ABC = AdaBoostClassifier(base_estimator = modele_RFC, random_state=1, learning_rate=1.0)
%time modele_ABC.fit(Xr_train[var], Yr_train)

CPU times: user 40min 42s, sys: 1min 27s, total: 42min 10s
Wall time: 15min


AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=9,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
          learning_rate=1.0, n_estimators=50, random_state=1)

In [120]:
%time print "Resultat de validation : ",validation(modele_ABC, Xr_test[var], Yr_test)

Resultat de validation :  0.86848314748
CPU times: user 2min 15s, sys: 59.3 s, total: 3min 14s
Wall time: 2min 23s


<br/><br/><br/>

## AdaBoostClassifier  ===

In [121]:
X_train, X_test, Y_train, Y_test = splitData(df)

In [122]:
modele_RFC = RandomForestClassifier(n_estimators=1000, min_samples_split=9, n_jobs=-1, random_state=42)
modele_ABCn = AdaBoostClassifier(base_estimator = modele_RFC, random_state=1, learning_rate=1.0)
%time modele_ABCn.fit(X_train[var], Y_train)

CPU times: user 3min 58s, sys: 7.14 s, total: 4min 5s
Wall time: 1min 25s


AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=9,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
          learning_rate=1.0, n_estimators=50, random_state=1)

In [123]:
%time print "Resultat de validation : ",validation(modele_ABCn, X_test[var], Y_test)

Resultat de validation :  0.870251104362
CPU times: user 10 s, sys: 2.45 s, total: 12.5 s
Wall time: 5.91 s


<br/><br/><br/>

## ExtraTreesClassifier

In [124]:
modele_XT =ExtraTreesClassifier(n_estimators=1020, min_samples_split=8, n_jobs=-1, random_state=5)
%time modele_XT.fit(Xr_train[var], Yr_train)

CPU times: user 28.9 s, sys: 1.34 s, total: 30.2 s
Wall time: 12.7 s


ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=8,
           min_weight_fraction_leaf=0.0, n_estimators=1020, n_jobs=-1,
           oob_score=False, random_state=5, verbose=0, warm_start=False)

In [125]:
print "Resultat d'apprentissage' : ",validation(modele_XT, Xr_train[var], Yr_train)
print "Resultat de validation : ",validation(modele_XT, Xr_test[var], Yr_test)

Resultat d'apprentissage' :  0.994458806083
Resultat de validation :  0.863034263857


<br/><br/><br/>

## GradientBoostingClassifier

In [126]:
modele_GB =GradientBoostingClassifier(n_estimators=90, learning_rate=0.20, max_depth=20, min_samples_split=9, random_state=1)
%time modele_GB.fit(Xr_train[var], Yr_train)

CPU times: user 42.8 s, sys: 347 ms, total: 43.2 s
Wall time: 43.9 s


GradientBoostingClassifier(init=None, learning_rate=0.2, loss='deviance',
              max_depth=20, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=9,
              min_weight_fraction_leaf=0.0, n_estimators=90,
              presort='auto', random_state=1, subsample=1.0, verbose=0,
              warm_start=False)

In [127]:
print "Resultat de validation : ",validation(modele_GB, Xr_test[var], Yr_test)

Resultat de validation :  0.858124917924


<br/><br/><br/>
# StackingClassifier

In [None]:
modele_RF = RandomForestClassifier(n_estimators=1040, min_samples_split=9, n_jobs=-1, random_state=42)
modele_RFC = RandomForestClassifier(n_estimators=1000, min_samples_split=9, n_jobs=-1, random_state=42)
modele_ABC = AdaBoostClassifier(base_estimator = modele_RFC, random_state=1, learning_rate=1.0)
modele_XT =ExtraTreesClassifier(n_estimators=1020, min_samples_split=8, n_jobs=-1, random_state=5)
modele_GB =GradientBoostingClassifier(n_estimators=90, learning_rate=0.20, max_depth=20, min_samples_split=9, random_state=1)


In [9]:
lr = linear_model.LogisticRegression(C=2, intercept_scaling=1, random_state=1, n_jobs=-1)
stkclf = StackingClassifier(classifiers=[modele_RF, modele_ABC, modele_XT, modele_GB], 
                          meta_classifier=lr, use_probas=True)
%time stkclf = stkclf.fit(Xr_train[var], Yr_train)

CPU times: user 47min 41s, sys: 3min 3s, total: 50min 44s
Wall time: 20min 24s


In [10]:
print "Resultat de validation = ",validation(stkclf, Xr_test[var], Yr_test)
print "Resultat de validation =  0.872794559359    C=2, intercept_scaling=1, random_state=1, n_jobs=-1"

Resultat de validation =  0.872794559359
Resultat de validation =  0.866552854297    C=2, intercept_scaling=1, random_state=1, n_jobs=-1


<br/><br/><br/>
# Soft VotingClassifier

In [15]:
from itertools import product
from sklearn.ensemble import VotingClassifier


eclf = VotingClassifier(estimators=[('RF', modele_RF), ('ABC', modele_ABC), ('XT', modele_XT), ('GB', modele_GB)], voting='soft', 
                        weights=[29,30,28,2])
%time eclf = eclf.fit(Xr_train[var], Yr_train)

CPU times: user 42min 5s, sys: 1min 19s, total: 43min 25s
Wall time: 16min 5s


In [16]:
print "Resultat de validation = ",validation(eclf, Xr_test[var], Yr_test)

Resultat de validation =  0.872825992808


In [17]:
test = pd.read_csv('test.csv', index_col=0)
preds = stkclf.predict_proba(test[var])[:,1]
submissions = pd.DataFrame(data=preds, columns=["ACTION"], index = test.index)
submissions.to_csv("sampleSubmission.csv")

<br/><br/><br/>
# DataFrame probabilité

In [128]:
pred_df=pd.DataFrame(columns=['ABCn','RF','ABC','XT','GB'])
pred_df['ABCn']=modele_ABCn.predict_proba(df[var])[:,1]
pred_df['RF']=modele_RF.predict_proba(df[var])[:,1]
pred_df['ABC']=modele_ABC.predict_proba(df[var])[:,1]
pred_df['XT']=modele_XT.predict_proba(df[var])[:,1]
pred_df['GB']=modele_GB.predict_proba(df[var])[:,1]

In [129]:
pred_TEST=pd.DataFrame(columns=['ABCn','RF','ABC','XT','GB'])
pred_TEST['ABCn']=modele_ABCn.predict_proba(test[var])[:,1]
pred_TEST['RF']=modele_RF.predict_proba(test[var])[:,1]
pred_TEST['ABC']=modele_ABC.predict_proba(test[var])[:,1]
pred_TEST['XT']=modele_XT.predict_proba(test[var])[:,1]
pred_TEST['GB']=modele_GB.predict_proba(test[var])[:,1]

In [130]:
pred_TEST.to_csv("CV.csv")

In [131]:
pred_df.to_csv("TR.csv")

<br/><br/><br/><br/><br/>

## Sparce

In [384]:
test = pd.read_csv('test.csv', index_col=0)
train = pd.read_csv('train.csv') 
input_cols = train.columns[1:-1] # don't need first (label) or last (duplicate)
Y = train.ACTION
print "These are the inputs used: ",input_cols.values
print "\n"

all_data = np.vstack((train[input_cols.values], test[input_cols.values]))
train_rows = len(train)
print "Combined test, train data rows and columns: ",all_data.shape
print "\n"
all_data=pd.DataFrame(data=all_data, columns=input_cols)

goodVar= ['MGR_ID', 'ROLE_DEPTNAME', 'ROLE_FAMILY_DESC', 'ROLE_ROLLUP_2', 'RESOURCE', 'ROLE_TITLE']


These are the inputs used:  ['RESOURCE' 'MGR_ID' 'ROLE_ROLLUP_1' 'ROLE_ROLLUP_2' 'ROLE_DEPTNAME'
 'ROLE_TITLE' 'ROLE_FAMILY_DESC' 'ROLE_FAMILY']


Combined test, train data rows and columns:  (91690, 8)




In [387]:
"""temp = regroupe(all_data, "RESOURCE", 1)
temp = regroupe(temp, "ROLE_TITLE", 0)
temp = regroupe(temp, "MGR_ID", 1)
temp = regroupe(temp, "ROLE_ROLLUP_2", 2)
temp = regroupe(temp, "ROLE_FAMILY_DESC", 24)
temp = regroupe(temp, "ROLE_FAMILY", 16)
temp = regroupe(temp, "ROLE_DEPTNAME", 52)
tempxt = regroupe(temp, "ROLE_ROLLUP_1", 5)
['MGR_ID',  'ROLE_FAMILY_DESC', 'RESOURCE', 'ROLE_TITLE']
"""
temp = regroupe(all_data, "RESOURCE", 1)
temp = regroupe(temp, "MGR_ID", 1)

In [386]:
#temp.to_csv("temp.csv")

In [363]:
X_all = OneHotEncoder().fit_transform(temp[goodVar].astype(str))
X_train_all = X_all[:train_rows,:]
X_test_all = X_all[train_rows:,:]
X_train_sparce, X_test_sparce, Y_train_sparce, Y_test_sparce = train_test_split(X_train_all, Y, test_size=0.25,random_state=1)

<h3><span style="color: #c36b0f;">LogisticRegression</span></h3>

    Best Score :  [0.86013526162408949, 1, 1, 2, 0, 0, 0] RL ~ 3h22min
    ['ROLE_ROLLUP_2'*1, 'RESOURCE'*1, 'ROLE_TITLE'*2,'MGR_ID'*0, 'ROLE_DEPTNAME'*0, 'ROLE_FAMILY_DESC'*0]

In [364]:
modele_LR = linear_model.LogisticRegression(C=2.602, n_jobs=-1, intercept_scaling=2.21, random_state=1, solver='newton-cg')
modele_LR.fit(X_train_sparce, Y_train_sparce)

LogisticRegression(C=2.602, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=2.21, max_iter=100, multi_class='ovr',
          n_jobs=-1, penalty='l2', random_state=1, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

In [365]:
print "Resultat de validation : ",validation(modele_LR, X_test_sparce, Y_test_sparce)
print "Resultat de validation :  0.8637510862"

Resultat de validation :  0.859498350093
Resultat de validation :  0.8637510862


In [188]:
naive_bayes.BernoulliNB?

<h3><span style="color: #c36b0f;">Naive Bayes</span></h3>

In [366]:
modele_NB = naive_bayes.BernoulliNB(alpha=0.045)
%time modele_NB.fit(X_train_sparce, Y_train_sparce)

CPU times: user 11.3 ms, sys: 5.73 ms, total: 17.1 ms
Wall time: 22.6 ms


BernoulliNB(alpha=0.045, binarize=0.0, class_prior=None, fit_prior=True)

In [367]:
print "Resultat de validation : ",validation(modele_NB, X_test_sparce, Y_test_sparce)
print "Resultat de validation :  0.839242216379"

Resultat de validation :  0.836367802089
Resultat de validation :  0.839242216379


### ExtraTreesClassifier

In [368]:
XT = ExtraTreesClassifier(n_estimators=190, min_samples_split=5, min_samples_leaf=0.01, criterion="entropy", n_jobs=-1, random_state=5)
%time XT.fit(X_train_sparce, Y_train_sparce)

CPU times: user 5min 18s, sys: 1.43 s, total: 5min 19s
Wall time: 1min 33s


ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=0.01, min_samples_split=5,
           min_weight_fraction_leaf=0.0, n_estimators=190, n_jobs=-1,
           oob_score=False, random_state=5, verbose=0, warm_start=False)

In [369]:
print "Resultat de validation : ",validation(XT, X_test_sparce, Y_test_sparce)
print "Resultat de validation :  0.873602468853"

Resultat de validation :  0.877478702092
Resultat de validation :  0.873602468853


<h3><span style="color: #3366ff;">Modèle : SVM</span></h3>

In [370]:
#modele_SVC = grid_search.best_estimator_
from sklearn.svm import LinearSVC, SVC
modele_SVC = SVC(probability=True, random_state=1, C= 9, gamma= 0.5)
%time modele_SVC.fit(X_train_sparce, Y_train_sparce)

CPU times: user 12min 18s, sys: 5.44 s, total: 12min 23s
Wall time: 12min 39s


SVC(C=9, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.5, kernel='rbf',
  max_iter=-1, probability=True, random_state=1, shrinking=True, tol=0.001,
  verbose=False)

In [371]:
print "Resultat de validation : ", validation(modele_SVC, X_test_sparce, Y_test_sparce)
print "best_params =  {'C': 9, 'gamma': 0.5}   0.863145608118  [1, 1, 2, 0, 0, 0]orders  Wall time: 3h 27min 27s"

Resultat de validation :  0.863020153731
best_params =  {'C': 9, 'gamma': 0.5}   0.863145608118  [1, 1, 2, 0, 0, 0]orders  Wall time: 3h 27min 27s


<h3><span style="color: #3366ff;">KNeighborsClassifier</span></h3>

In [372]:
from sklearn.neighbors import KNeighborsClassifier
modele_KN = KNeighborsClassifier(n_neighbors=15, n_jobs=-1, weights='distance')
modele_KN.fit(X_train_sparce, Y_train_sparce)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=15, p=2,
           weights='distance')

In [373]:
%time print "Resultat de validation : ",validation(modele_KN, X_test_sparce, Y_test_sparce)
print "\nResultat de validation :  0.83616313541   n_neighbors=15, n_jobs=-1, weights='distance'"

Resultat de validation :  0.834816386746
CPU times: user 6.62 s, sys: 18.6 s, total: 25.2 s
Wall time: 1min 8s

Resultat de validation :  0.83616313541   n_neighbors=15, n_jobs=-1, weights='distance'


# Sparse Soft Voting

In [399]:
from sklearn.ensemble import VotingClassifier

#modele_LR = linear_model.LogisticRegression(C=2.602, n_jobs=-1, intercept_scaling=2.21, random_state=1, solver='newton-cg')
#modele_NB = naive_bayes.BernoulliNB(alpha=0.045)
#XT = ExtraTreesClassifier(n_estimators=190, min_samples_split=5, min_samples_leaf=0.01, criterion="entropy", n_jobs=-1, random_state=5)
#modele_SVC = SVC(probability=True, random_state=1, C= 9, gamma= 0.5)
#modele_KN = KNeighborsClassifier(n_neighbors=15, n_jobs=-1, weights='distance')

#, ('XT', XT)


sclf = VotingClassifier(estimators=[('NB', modele_NB), ('LR', modele_LR), ('SVC', modele_SVC), ('XT', XT)], voting='soft', 
                        weights=[1,10.1111,24,16])

%time sclf = sclf.fit(X_train_all, Y)

CPU times: user 25min 24s, sys: 8.43 s, total: 25min 32s
Wall time: 20min 48s



    LR   0.8637510862      15.1111
    NB   0.838180464321    1
    XT   0.873602468853    19
    SVM  0.858805137762    10

    KN   0.83616313541

In [397]:
print "Resultat de validation = ",validation(sclf, X_test_sparce, Y_test_sparce)
print "\n\nResultat de validation = ",0.881645659809," weights=[1,10.1111,24,16] ","\nRESOURCE seuil= 1 | MGR_ID seuil= 1 | goodVar | \nWall time: 12min 7s"

Resultat de validation =  0.881671784498


Resultat de validation =  0.881645659809  weights=[1,10.1111,23.5,16]  
RESOURCE seuil= 1 | MGR_ID seuil= 1 | goodVar | 
Wall time: 12min 7s


# StackingClassifier

In [None]:
#modele_LR = linear_model.LogisticRegression(C=2.602, n_jobs=-1, intercept_scaling=2.21, random_state=1, solver='newton-cg')
#modele_NB = naive_bayes.BernoulliNB(alpha=0.045)
#XT = ExtraTreesClassifier(n_estimators=190, min_samples_split=5, min_samples_leaf=0.01, criterion="entropy", n_jobs=-1, random_state=5)
#modele_SVC = SVC(probability=True, random_state=1, C= 9, gamma= 0.5)
#modele_KN = KNeighborsClassifier(n_neighbors=15, n_jobs=-1, weights='distance')


In [381]:
lr = linear_model.LogisticRegression(C=2, intercept_scaling=1, random_state=1, n_jobs=-1)
stksclf = StackingClassifier(classifiers=[modele_LR, modele_NB, modele_SVC, XT], 
                          meta_classifier=lr, use_probas=True)
%time stksclf = stksclf.fit(X_train_sparce, Y_train_sparce)

CPU times: user 19min 1s, sys: 10.6 s, total: 19min 12s
Wall time: 16min 15s


In [382]:
print "Resultat de validation = ",validation(stksclf, X_test_sparce, Y_test_sparce)
print "0.870534005404"

Resultat de validation =  0.877274175116
0.870534005404


# Submit Num

In [400]:
preds = sclf.predict_proba(X_test_all)[:,1]
submissions = pd.DataFrame(data=preds, columns=["ACTION"], index = test.index)
submissions.to_csv("STsampleSubmission.csv")

"""0.89803"""

In [391]:
preds = stksclf.predict_proba(X_test_all)[:,1]
submissions = pd.DataFrame(data=preds, columns=["ACTION"], index = test.index)
submissions.to_csv("F2sampleSubmission.csv")

<br/><br/><br/>
# DataFrame probabilité

In [347]:
pred_train=pd.DataFrame(columns=['LR','NB','SVC','XT','KN'])
pred_train['LR']=modele_LR.predict_proba(X_train_all)[:,1]
pred_train['NB']=modele_NB.predict_proba(X_train_all)[:,1]
pred_train['SVC']=modele_SVC.predict_proba(X_train_all)[:,1]
pred_train['XT']=XT.predict_proba(X_train_all)[:,1]
#pred_train['KN']=modele_KN.predict_proba(X_train_all)[:,1]

In [348]:
pred_train.to_csv("TR_num.csv")

In [349]:
pred_test=pd.DataFrame(columns=['LR','NB','SVC','XT','KN'])
pred_test['LR']=modele_LR.predict_proba(X_test_all)[:,1]
pred_test['NB']=modele_NB.predict_proba(X_test_all)[:,1]
pred_test['SVC']=modele_SVC.predict_proba(X_test_all)[:,1]
pred_test['XT']=XT.predict_proba(X_test_all)[:,1]
#pred_test['KN']=modele_KN.predict_proba(X_test_all)[:,1]

In [350]:
pred_test.to_csv("TS_num.csv")

In [401]:


pred_vot_train=pd.DataFrame(columns=['Voting'])
pred_vot_test=pd.DataFrame(columns=['Voting'])

pred_vot_train['Voting'] = sclf.predict_proba(X_train_all)[:,1]
pred_vot_test['Voting'] = sclf.predict_proba(X_test_all)[:,1]
pred_vot_test.to_csv("TS_vot_num.csv")
pred_vot_train.to_csv("TR_vot_num.csv")


In [450]:
TR_vot_num = pd.read_csv('TR_vot_num.csv',  index_col=0)
TS_vot_num = pd.read_csv('TS_vot_num.csv',  index_col=0)

In [484]:
TS = pd.read_csv("CV.csv",  index_col=0)

TR = pd.read_csv("TR.csv",  index_col=0)

In [485]:
pre_tr = pd.concat([TR_vot_num, TR], axis=1)
pre_ts = pd.concat([TS_vot_num, TS], axis=1)

In [489]:
pre_tr.head()

Unnamed: 0,Voting,ABCn,RF,ABC,XT,GB
0,0.98989,0.984622,0.997622,0.928572,0.998567,0.999992
1,0.988036,0.963648,0.955832,0.88702,0.966326,0.999962
2,0.977956,0.89731,0.908988,0.799879,0.988525,0.99997
3,0.989914,0.975305,0.980117,0.890264,0.987551,0.999871
4,0.98007,0.952147,0.975461,0.869679,0.947103,0.999998


In [490]:
from sklearn.cross_validation import train_test_split

X_tr, X_ts, Y_tr, Y_ts = train_test_split(pre_tr, Y, test_size=0.25,random_state=1)

In [652]:
LogReg = linear_model.LogisticRegression(n_jobs=-1, random_state=1, C=2, intercept_scaling=1)

In [653]:
#LogReg = LogReg.fit(TR_vot_num,Y)
LogReg = LogReg.fit(X_tr,Y_tr)

In [654]:
print "Apprentissage : ",validation(LogReg, X_tr, Y_tr)
print "Validation : ",validation(LogReg, X_ts, Y_ts)

Apprentissage :  0.999988699928
Validation :  0.994113003948


In [655]:
LogReg = LogReg.fit(pre_tr,Y)
pred_LogReg = LogReg.predict_proba(pre_ts)[:,1]

In [656]:
submissions = pd.DataFrame(data=pred_LogReg, columns=["ACTION"], index = test.index)
submissions.to_csv("logSubmission.csv")

In [346]:
kn = modele_KN.predict_proba(X_test_all)[:,1]

MaybeEncodingError: Error sending result: '[array([[ 8.,  8.,  8., ...,  8.,  8.,  8.],
       [ 8.,  8.,  8., ...,  8.,  8.,  8.],
       [ 8.,  8.,  8., ...,  8.,  8.,  8.],
       ..., 
       [ 8.,  8.,  8., ...,  8.,  8.,  8.],
       [ 6.,  8.,  8., ...,  8.,  6.,  4.],
       [ 8.,  8.,  4., ...,  8.,  8.,  8.]])]'. Reason: 'SystemError('error return without exception set',)'