In [2]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import (AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics, linear_model, naive_bayes

In [338]:
#Chargement du fichier dataProjet.csv dans le dataframe df, n'oubliez pas de modifier le chemin 
#pour tenir compte de l'endroit ou se trouve votre fichier
df = pd.read_csv("train.csv")

In [339]:
#regroupe les modalités des variables rares, pour un seuil donné 
def regroupe(df, column, seuil):
    
    #Valeurs les plus communes
    ss = pd.DataFrame(data=df[column].value_counts())
    selected = ss[column][ss[column]==1]
    column_num = df.columns.get_loc(column)
    for x in np.array(selected.index) :
        for line_num in df[column][df[column]==x].index.values:
            df.set_value(line_num, column_num, np.array(selected.index)[0], takeable=True)
        

In [340]:
regroupe(df, "RESOURCE", 1)
regroupe(df, "RESOURCE", 2)
regroupe(df, "MGR_ID", 1)

## Echantillonnage de données

In [341]:
Y = df.ACTION
X = df.drop(['ACTION'], axis=1)

# diviser X et Y en training and testing
from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25,random_state=1)

df1 = pd.DataFrame(Y_train.value_counts())
df1['Percentage'] = 100*df1['ACTION']/len(Y_train)
pr1=100*len(Y_train)/(len(df)+0.0)
print 'train ',pr1,'%\n\n', df1

df2 = pd.DataFrame(Y_test.value_counts())
df2['Percentage'] = 100*df2['ACTION']/len(Y_test)
pr2=100*len(Y_test)/(len(df)+0.0)
print '\n\ntest',pr2,'% \n\n',df2

train  74.9977112515 %

   ACTION  Percentage
1   23142   94.165039
0    1434    5.834961


test 25.0022887485 % 

   ACTION  Percentage
1    7730   94.348834
0     463    5.651166


In [343]:
#fonction d'apprentissage avec cross validation et gridsearch sur le premier dataset
def performance(X,Y):
    kf = StratifiedKFold(Y, n_folds=3, random_state=1)    
    gs = GridSearchCV(clf, params, scoring=metric, cv=kf)
    gs.fit(X,Y)
    return gs

#fonction de prédiction sur le dataset de validation
def validation(clf1, Xv, Yv):
    preds = clf1.predict_proba(Xv)[:,1]
    return roc_auc_score(Yv, preds)

### Les arbres de décision

In [344]:
#CHOIX DU MODELE A UTILISER
#Les arbres de décision


clf = DecisionTreeClassifier(random_state=1)
params = {'max_depth':[13],
          'min_samples_leaf' : [13]} #pour le gridSearch    
metric = 'roc_auc'

In [345]:
Modele_1 = performance(X_train,Y_train)
print "Resultat sur l'ensembles des données"

print "Resultat d'apprentissage : ",Modele_1.best_score_, Modele_1.best_params_

print "Resultat de validation : ",validation(Modele_1.best_estimator_, X_test, Y_test)

Resultat sur l'ensembles des données
Resultat d'apprentissage :  0.727143388698 {'max_depth': 13, 'min_samples_leaf': 13}
Resultat de validation :  0.758267416226


### Resultat sans "ROLE_TITLE"

In [7]:
var=['RESOURCE', 'MGR_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2','ROLE_DEPTNAME', 'ROLE_FAMILY_DESC', 'ROLE_FAMILY', 'ROLE_CODE']
Modele_2 = performance(X_train[var],Y_train)
print "Resultat sur l'ensembles des données"

print "Resultat d'apprentissage : ",Modele_2.best_score_, Modele_2.best_params_

print "Resultat de validation : ",validation(Modele_2.best_estimator_, X_test[var], Y_test)

Resultat sur l'ensembles des données
Resultat d'apprentissage :  0.717632569738 {'max_depth': 13, 'min_samples_leaf': 13}
Resultat de validation :  0.779400473318


### Resultat sans "ROLE_CODE"

In [346]:
var=['RESOURCE', 'MGR_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2','ROLE_DEPTNAME', 'ROLE_TITLE', 'ROLE_FAMILY_DESC', 'ROLE_FAMILY']
Modele_3 = performance(X_train[var],Y_train)
print "Resultat sur l'ensembles des données"

print "Resultat d'apprentissage : ",Modele_3.best_score_, Modele_3.best_params_

print "Resultat de validation : ",validation(Modele_3.best_estimator_, X_test[var], Y_test)

Resultat sur l'ensembles des données
Resultat d'apprentissage :  0.726791956446 {'max_depth': 13, 'min_samples_leaf': 13}
Resultat de validation :  0.760928502175


##### Etape1:
    Resultat sur l'ensembles des données
    Resultat d'apprentissage :  0.71868130686 {'max_depth': 13, 'min_samples_leaf': 13}
    Resultat de validation :  0.779711035795
##### Etape2:    
    Resultat sur l'ensembles des données
    Resultat d'apprentissage :  0.732219923637 {'max_depth': 13, 'min_samples_leaf': 13}
    Resultat de validation :  0.770324728485

## <span style="color:#b36c8f">On supprime la colonne 'ROLE_CODE' et on garde 'ROLE_TITLE' <br/>(celle qui nous donne une meilleure score)</span>

In [347]:
var=['RESOURCE', 'MGR_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2','ROLE_DEPTNAME', 'ROLE_TITLE', 'ROLE_FAMILY_DESC', 'ROLE_FAMILY']

<br/><br/><br/>

## RandomForestClassifier

In [348]:
modele_RF =RandomForestClassifier(n_estimators=1040, min_samples_split=9, n_jobs=4, random_state=42)
%time modele_RF.fit(X_train[var], Y_train)

Wall time: 17.2 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=9,
            min_weight_fraction_leaf=0.0, n_estimators=1040, n_jobs=4,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [349]:
print "Resultat d'apprentissage : ",validation(modele_RF, X_train[var], Y_train)
print "Resultat de validation : ",validation(modele_RF, X_test[var], Y_test)

Resultat d'apprentissage :  0.993991028285
Resultat de validation :  0.868803908365


###### Etape1: (n_estimators=1040, min_samples_split=9, n_jobs=4, random_state=42)
    Resultat d'apprentissage :  0.994720093891
    Resultat de validation :  0.868639336796
###### Etape2: (n_estimators=1040, min_samples_split=9, n_jobs=4, random_state=42)   
    Resultat d'apprentissage :  0.994073036677
    Resultat de validation :  0.867650230931

<br/><br/><br/>

## AdaBoostClassifier

In [24]:
modele_RFC = RandomForestClassifier(n_estimators=1000, min_samples_split=9, n_jobs=4, random_state=42)
modele_ABC = AdaBoostClassifier(base_estimator = modele_RF, random_state=1, learning_rate=1.0)
%time modele_ABC.fit(X_train[var], Y_train)


Wall time: 4h 6min 10s


AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=9,
            min_weight_fraction_leaf=0.0, n_estimators=1040, n_jobs=4,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
          learning_rate=1.0, n_estimators=50, random_state=1)

In [25]:
%time print "Resultat d'apprentissage' : ",validation(modele_ABC, X_train[var], Y_train)

Resultat d'apprentissage' :  0.999929367014
Wall time: 1h 31min 55s


In [26]:
%time print "Resultat de validation : ",validation(modele_ABC, X_test[var], Y_test)

Resultat de validation :  0.864917616423
Wall time: 1h 33min 13s


##### Etape1 :
##### Resultat de validation  (modele_RFC ):
     0.870251104362      (0.999861415912)     
##### Resultat de validation  (modele_RF ):
     0.869831153482      (0.999853520928)
     
##### Etape2 :
##### Resultat de validation  (modele_RFC ):
     0.      (0.)
     
##### Resultat de validation  (modele_RF ):
     0.864917616423      (0.999929367014) 

<br/><br/><br/>

## ExtraTreesClassifier

In [350]:
modele_XT =ExtraTreesClassifier(n_estimators=1020, min_samples_split=8, n_jobs=4, random_state=5)
%time modele_XT.fit(X_train[var], Y_train)

Wall time: 11.3 s


ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=8,
           min_weight_fraction_leaf=0.0, n_estimators=1020, n_jobs=4,
           oob_score=False, random_state=5, verbose=0, warm_start=False)

In [351]:
print "Resultat d'apprentissage' : ",validation(modele_XT, X_train[var], Y_train)
print "Resultat de validation : ",validation(modele_XT, X_test[var], Y_test)

Resultat d'apprentissage' :  0.995024653443
Resultat de validation :  0.859670605394


#### Etape1 :   (n_estimators=1020, min_samples_split=8, n_jobs=4, random_state=5)
    Resultat d'apprentissage :  0.995549263073
    Resultat de validation :  0.860228164929
#### Etape2 :   (n_estimators=1020, min_samples_split=8, n_jobs=4, random_state=5)
    Resultat d'apprentissage :  0.995002128632
    Resultat de validation :  0.860286980405

<br/><br/><br/>

## GradientBoostingClassifier

In [352]:
modele_GB =GradientBoostingClassifier(n_estimators=90, learning_rate=0.20, max_depth=20, min_samples_split=9, random_state=1)
%time modele_GB.fit(X_train[var], Y_train)

Wall time: 1min 19s


GradientBoostingClassifier(init=None, learning_rate=0.2, loss='deviance',
              max_depth=20, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=9,
              min_weight_fraction_leaf=0.0, n_estimators=90,
              presort='auto', random_state=1, subsample=1.0, verbose=0,
              warm_start=False)

In [353]:
print "Resultat d'apprentissage' : ",validation(modele_GB, X_train[var], Y_train)
print "Resultat de validation : ",validation(modele_GB, X_test[var], Y_test)

Resultat d'apprentissage' :  0.999927302867
Resultat de validation :  0.849010195614


##### Etape1 :   (n_estimators=90, learning_rate=0.20, max_depth=20, min_samples_split=9, random_state=1)
    Resultat d'apprentissage' :  1.0
    Resultat de validation :  0.84932173602
##### Etape2 :   (n_estimators=90, learning_rate=0.20, max_depth=20, min_samples_split=9, random_state=1)
    Resultat d'apprentissage' :  0.999930331287
    Resultat de validation :  0.850811681508

<br/><br/><br/><br/><h4><span style="color: #c36b0f;">Selection des variables : Random Forest</span></h4>

In [102]:
#Approche integree
def random_forest_selection(X,Y,n_features):
    
    params = {'max_depth':[13],
              'min_samples_leaf' : [13]}
    metric = 'roc_auc'
    
    clf = RandomForestClassifier(random_state=1, n_jobs=4)
    gs = GridSearchCV(clf, params, scoring=metric)
    gs.fit(X,Y)
    bestClf = gs.best_estimator_
    rf = bestClf.feature_importances_
    
    rf=zip(rf,X.columns)
    rf=sorted(rf,reverse=True)[:n_features]
 
    return [x[1] for x in rf]

rfFeatures = random_forest_selection(X_train[var], Y_train,6)

print rfFeatures

['MGR_ID', 'ROLE_FAMILY_DESC', 'ROLE_DEPTNAME', 'ROLE_ROLLUP_2', 'ROLE_TITLE', 'RESOURCE']


<h3><span style="color: #c36b0f;">LogisticRegression</span></h3>

In [286]:
from sklearn.preprocessing import OneHotEncoder 
from scipy import sparse

#good_features=range(0,15283)
Xt=OneHotEncoder().fit_transform(X[['MGR_ID', 'ROLE_DEPTNAME', 'ROLE_FAMILY_DESC', 'ROLE_ROLLUP_2', 'RESOURCE', 'ROLE_TITLE']].astype(str))
print "Xt :", Xt.shape
#Xts=sparse.hstack([Xt[:,j] for j in good_features]).tocsr()
#print "Xts :", Xts.shape
X_train_sparce, X_test_sparce, Y_train_sparce, Y_test_sparce = train_test_split(Xt, Y, test_size=0.25,random_state=1)


Xt : (32769, 10479)


In [287]:
modele_LR = linear_model.LogisticRegression(C=2.601, n_jobs=4,intercept_scaling=2.11, random_state=1)
modele_LR.fit(X_train_sparce, Y_train_sparce)

LogisticRegression(C=2.601, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=2.11, max_iter=100, multi_class='ovr',
          n_jobs=4, penalty='l2', random_state=1, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [288]:
print "Resultat de validation : ",validation(modele_LR, X_test_sparce, Y_test_sparce)
print "Resultat de validation :  0.859862978103"

Resultat de validation :  0.859862978103
Resultat de validation :  0.859862978103


In [289]:
print "score E1",0.858838666775
print "score E2",0.859513577853

score E1 0.858838666775
score E2 0.859513577853


<h3><span style="color: #c36b0f;">Naive Bayes</span></h3>

In [290]:
modele_NB = naive_bayes.BernoulliNB(alpha=0.041)
%time modele_NB.fit(X_train_sparce, Y_train_sparce)

Wall time: 16 ms


BernoulliNB(alpha=0.041, binarize=0.0, class_prior=None, fit_prior=True)

In [291]:
print "Resultat de validation : ",validation(modele_NB, X_test_sparce, Y_test_sparce)
print "Resultat de validation :  0.839242216379"

Resultat de validation :  0.839242216379
Resultat de validation :  0.839242216379


In [292]:
print "score E1",0.83560110534
print "score E2",0.837708683176

score E1 0.83560110534
score E2 0.837708683176


# Sparse Soft Voting

In [293]:
from sklearn.ensemble import VotingClassifier

sclf = VotingClassifier(estimators=[('NB', modele_NB), ('LR', modele_LR)], voting='soft', 
                        weights=[1,9.11])

%time sclf = sclf.fit(X_train_sparce, Y_train_sparce)

Wall time: 759 ms


In [294]:
print "Resultat de validation = ",validation(sclf, X_test_sparce, Y_test_sparce)
print "Resultat de validation = ",0.861791175723,"weights=[1,9.11]"

Resultat de validation =  0.861791175723
Resultat de validation =  0.861791175723 weights=[1,9.11]


In [None]:
n_models=2

In [None]:
def fopt_pred(pars, data):
    return np.dot(data, pars)

def fopt(pars):
    fpr, tpr, thresholds = metrics.roc_curve(y_train, fopt_pred(pars, B_train))
    return -metrics.auc(fpr, tpr)

x0 = np.ones((n_models, 1)) / n_models
xopt = minimize(fopt, x0)
preds = fopt_pred(xopt, B_test)

# Soft VotingClassifier

In [None]:
from itertools import product
from sklearn.ensemble import VotingClassifier

# Training classifiers

#, ('LR', modele_LR), ('NB', modele_NB)

eclf = VotingClassifier(estimators=[('DT', Modele_3.best_estimator_), ('RF', modele_RF), ('ABC', modele_ABC), ('XT', modele_XT), ('GB', modele_GB)], voting='soft', 
                        weights=[1,1,1,1,1])

%time eclf = eclf.fit(X_train[var], Y_train)

In [None]:
print "Resultat de validation = ",validation(eclf, X_test[var], Y_test)

In [None]:
print "E1 weights=[1,4,5,3,2]"

# Submit

In [332]:
from itertools import product

test = pd.read_csv('test.csv', index_col=0)

clos=['MGR_ID', 'ROLE_DEPTNAME', 'ROLE_FAMILY_DESC', 'ROLE_ROLLUP_2', 'RESOURCE', 'ROLE_TITLE']

In [333]:
Xtest=OneHotEncoder().fit_transform(test[clos].astype(str))


In [334]:
Xtest.shape

(58921, 13403)

In [335]:
preds = sclf.predict_proba(Xtest)[:,1]

ValueError: Expected input with 10479 features, got 13403 instead

In [336]:
submissions = pd.DataFrame(data=preds, columns=["ACTION"], index = test.index)

NameError: name 'preds' is not defined

In [337]:
submissions.to_csv("sampleSubmission.csv")

NameError: name 'submissions' is not defined

In [109]:
#Approche filtre
def variance_selection(X,n_features):
    
    variance = [np.var(X.getcol(i).todense()) for i in range(X.shape[1])]
    variance = sorted( zip(variance,range(X.shape[1])) ,reverse=True)[:n_features]   
        
    return [x[1] for x in variance]

for n_features in range(1000,X.shape[1],100):
    print n_features
    good_features = variance_selection(X_train_sparce,n_features)
    print 'Selected features : ', good_features
    Xts=sparse.hstack([X_train_sparce[:,j] for j in good_features]).tocsr()
    gs = performance(Xts, Y_train_sparce)
    print gs.best_score_, gs.best_params_
    
    print '***************************\n'