In [1]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import (AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics, linear_model, naive_bayes

In [2]:
#Chargement du fichier dataProjet.csv dans le dataframe df, n'oubliez pas de modifier le chemin 
#pour tenir compte de l'endroit ou se trouve votre fichier
df = pd.read_csv("train.csv")

In [3]:
print "Count of unique values of each column in train set"
print df.apply(lambda x: len(x.unique()))
print

Count of unique values of each column in train set
ACTION                 2
RESOURCE            7518
MGR_ID              4243
ROLE_ROLLUP_1        128
ROLE_ROLLUP_2        177
ROLE_DEPTNAME        449
ROLE_TITLE           343
ROLE_FAMILY_DESC    2358
ROLE_FAMILY           67
ROLE_CODE            343
dtype: int64



In [72]:
df.RESOURCE.count()*2.5/100 <= np.array(df.MGR_ID.value_counts()==2 ,dtype=int).sum()

False

In [268]:
np.array(df.ROLE_FAMILY_DESC.value_counts()==2 ,dtype=int).sum()

371

In [4]:
#regroupe les modalités des variables rares, pour un seuil donné 
def regroupe(df, column, seuil):
    if (df[column].count()*2.5/100) <= (np.array(df[column].value_counts()==seuil ,dtype=int).sum()):
        #Valeurs les plus communes
        ss = pd.DataFrame(data=df[column].value_counts())
        selected = ss[column][ss[column]==1]
        column_num = df.columns.get_loc(column)
        for x in np.array(selected.index) :
            for line_num in df[column][df[column]==x].index.values:
                df.set_value(line_num, column_num, np.array(selected.index)[0], takeable=True)
        print column,"seuil=",seuil

In [5]:
for col in df.columns.values:
    for i in [1,2,3]:
        regroupe(df, col, i)
        

RESOURCE seuil= 1
RESOURCE seuil= 2
MGR_ID seuil= 1


## Echantillonnage de données

In [6]:
Y = df.ACTION
X = df.drop(['ACTION'], axis=1)

# diviser X et Y en training and testing
from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25,random_state=1)

df1 = pd.DataFrame(Y_train.value_counts())
df1['Percentage'] = 100*df1['ACTION']/len(Y_train)
pr1=100*len(Y_train)/(len(df)+0.0)
print 'train ',pr1,'%\n\n', df1

df2 = pd.DataFrame(Y_test.value_counts())
df2['Percentage'] = 100*df2['ACTION']/len(Y_test)
pr2=100*len(Y_test)/(len(df)+0.0)
print '\n\ntest',pr2,'% \n\n',df2

train  74.9977112515 %

   ACTION  Percentage
1   23142   94.165039
0    1434    5.834961


test 25.0022887485 % 

   ACTION  Percentage
1    7730   94.348834
0     463    5.651166


In [7]:
#fonction d'apprentissage avec cross validation et gridsearch sur le premier dataset
def performance(X,Y):
    kf = StratifiedKFold(Y, n_folds=3, random_state=1)    
    gs = GridSearchCV(clf, params, scoring=metric, cv=kf)
    gs.fit(X,Y)
    return gs

#fonction de prédiction sur le dataset de validation
def validation(clf1, Xv, Yv):
    preds = clf1.predict_proba(Xv)[:,1]
    return roc_auc_score(Yv, preds)

### Les arbres de décision

In [78]:
#CHOIX DU MODELE A UTILISER
#Les arbres de décision


clf = DecisionTreeClassifier(random_state=1)
params = {'max_depth':range(1,15),
          'min_samples_leaf' : range(1,15)} #pour le gridSearch    
metric = 'roc_auc'

In [79]:
Modele_1 = performance(X_train,Y_train)
print "Resultat sur l'ensembles des données"

print "Resultat d'apprentissage : ",Modele_1.best_score_, Modele_1.best_params_

print "Resultat de validation : ",validation(Modele_1.best_estimator_, X_test, Y_test)

Resultat sur l'ensembles des données
Resultat d'apprentissage :  0.728810104181 {'max_depth': 14, 'min_samples_leaf': 14}
Resultat de validation :  0.762003246726


### Resultat sans "ROLE_TITLE"

In [7]:
var=['RESOURCE', 'MGR_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2','ROLE_DEPTNAME', 'ROLE_FAMILY_DESC', 'ROLE_FAMILY', 'ROLE_CODE']
Modele_2 = performance(X_train[var],Y_train)
print "Resultat sur l'ensembles des données"

print "Resultat d'apprentissage : ",Modele_2.best_score_, Modele_2.best_params_

print "Resultat de validation : ",validation(Modele_2.best_estimator_, X_test[var], Y_test)

Resultat sur l'ensembles des données
Resultat d'apprentissage :  0.717632569738 {'max_depth': 13, 'min_samples_leaf': 13}
Resultat de validation :  0.779400473318


### Resultat sans "ROLE_CODE"

In [9]:
var=['RESOURCE', 'MGR_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2','ROLE_DEPTNAME', 'ROLE_TITLE', 'ROLE_FAMILY_DESC', 'ROLE_FAMILY']
Modele_3 = performance(X_train[var],Y_train)
print "Resultat sur l'ensembles des données"

print "Resultat d'apprentissage : ",Modele_3.best_score_, Modele_3.best_params_

print "Resultat de validation : ",validation(Modele_3.best_estimator_, X_test[var], Y_test)

Resultat sur l'ensembles des données
Resultat d'apprentissage :  0.732219923637 {'max_depth': 13, 'min_samples_leaf': 13}
Resultat de validation :  0.770324728485


## <span style="color:#b36c8f">On supprime la colonne 'ROLE_CODE' et on garde 'ROLE_TITLE' <br/>(celle qui nous donne une meilleure score)</span>

In [249]:
var=['RESOURCE', 'MGR_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2','ROLE_DEPTNAME', 'ROLE_TITLE', 'ROLE_FAMILY_DESC', 'ROLE_FAMILY']

<br/><br/><br/>

## RandomForestClassifier

In [11]:
modele_RF =RandomForestClassifier(n_estimators=1040, min_samples_split=9, n_jobs=2, random_state=42)
%time modele_RF.fit(X_train[var], Y_train)

CPU times: user 31.1 s, sys: 404 ms, total: 31.5 s
Wall time: 17.1 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=9,
            min_weight_fraction_leaf=0.0, n_estimators=1040, n_jobs=2,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [12]:
print "Resultat d'apprentissage : ",validation(modele_RF, X_train[var], Y_train)
print "Resultat de validation : ",validation(modele_RF, X_test[var], Y_test)

Resultat d'apprentissage :  0.994073036677
Resultat de validation :  0.867650230931


<br/><br/><br/>

## AdaBoostClassifier

In [13]:
modele_RFC = RandomForestClassifier(n_estimators=1000, min_samples_split=9, n_jobs=2, random_state=42)
modele_ABC = AdaBoostClassifier(base_estimator = modele_RFC, random_state=1, learning_rate=1.0)
%time modele_ABC.fit(X_train[var], Y_train)


CPU times: user 29min 16s, sys: 1min 9s, total: 30min 26s
Wall time: 16min 55s


AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=9,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=2,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
          learning_rate=1.0, n_estimators=50, random_state=1)

In [14]:
%time print "Resultat d'apprentissage' : ",validation(modele_ABC, X_train[var], Y_train)

Resultat d'apprentissage' :  0.99992924648
CPU times: user 4min 42s, sys: 1min 11s, total: 5min 54s
Wall time: 4min 25s


In [15]:
%time print "Resultat de validation : ",validation(modele_ABC, X_test[var], Y_test)

Resultat de validation :  0.864499481697
CPU times: user 2min 9s, sys: 53.1 s, total: 3min 2s
Wall time: 2min 44s


#### Resultat de validation  ( n_estimators ):
     2000 ==> 0.776515441507
     3000 ==> 0.783840133669
     5000 ==> 0.797278561829
    10000 ==> 0.8089289157
    20000 ==> 0.818141151554
    30000 ==> 0.82117804185       (0.946997567742)
    100000 ==> 0.82720110981      (0.963220976261)
    
#### Resultat de validation  (modele_RF ,  n_estimators ):
     1000 ==> 0.870251104362      (0.999861415912)
     

In [78]:
RandomForestClassifier?

<br/><br/><br/>

## ExtraTreesClassifier

In [16]:
modele_XT =ExtraTreesClassifier(n_estimators=1020, min_samples_split=8, n_jobs=2, random_state=5)
%time modele_XT.fit(X_train[var], Y_train)

CPU times: user 22.2 s, sys: 1.1 s, total: 23.3 s
Wall time: 14.2 s


ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=8,
           min_weight_fraction_leaf=0.0, n_estimators=1020, n_jobs=2,
           oob_score=False, random_state=5, verbose=0, warm_start=False)

In [255]:
print "Resultat d'apprentissage' : ",validation(modele_XT, X_train[var], Y_train)
print "Resultat de validation : ",validation(modele_XT, X_test[var], Y_test)

Resultat d'apprentissage' : 

NameError: name 'modele_XT' is not defined

In [31]:
(n_estimators=1020, min_samples_split=8, n_jobs=2, random_state=5)
Resultat d'apprentissage' :  0.995549263073
Resultat de validation :  0.860228164929

<br/><br/><br/>

## GradientBoostingClassifier

In [265]:
modele_GB =GradientBoostingClassifier(n_estimators=90, learning_rate=0.10, max_depth=20, min_samples_split=9, random_state=1)
%time modele_GB.fit(X_train[var], Y_train)

CPU times: user 54.3 s, sys: 140 ms, total: 54.5 s
Wall time: 54.8 s


GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=20, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=9,
              min_weight_fraction_leaf=0.0, n_estimators=90,
              presort='auto', random_state=1, subsample=1.0, verbose=0,
              warm_start=False)

In [266]:
print "Resultat d'apprentissage' : ",validation(modele_GB, X_train[var], Y_train)
print "Resultat de validation : ",validation(modele_GB, X_test[var], Y_test)

Resultat d'apprentissage' :  0.999927302867
Resultat de validation :  0.847177555679


    Resultat d'apprentissage' :  0.999927302867
    Resultat de validation :  0.851872455637



<br/><br/><br/><br/><h4><span style="color: #c36b0f;">Selection des variables : Random Forest</span></h4>

In [12]:
#Approche integree
def random_forest_selection(X,Y,n_features):
    
    params = {'max_depth':[13],
              'min_samples_leaf' : [13]}
    metric = 'roc_auc'
    
    clf = RandomForestClassifier(random_state=1, n_jobs=3)
    gs = GridSearchCV(clf, params, scoring=metric)
    gs.fit(X,Y)
    bestClf = gs.best_estimator_
    rf = bestClf.feature_importances_
    
    rf=zip(rf,X.columns)
    rf=sorted(rf,reverse=True)[:n_features]
 
    return [x[1] for x in rf]

rfFeatures = random_forest_selection(X_train[var], Y_train,8)

print rfFeatures

['MGR_ID', 'ROLE_DEPTNAME', 'ROLE_FAMILY_DESC', 'ROLE_ROLLUP_2', 'RESOURCE', 'ROLE_TITLE', 'ROLE_FAMILY', 'ROLE_ROLLUP_1']


<h3><span style="color: #c36b0f;">LogisticRegression</span></h3>

In [8]:
from sklearn.preprocessing import OneHotEncoder 
from scipy import sparse

#good_features=range(0,15283)
Xt=OneHotEncoder().fit_transform(X[['MGR_ID', 'ROLE_DEPTNAME', 'ROLE_FAMILY_DESC', 'ROLE_ROLLUP_2', 'RESOURCE', 'ROLE_TITLE']].astype(str))
print "Xt :", Xt.shape
#Xts=sparse.hstack([Xt[:,j] for j in good_features]).tocsr()
#print "Xts :", Xts.shape
X_train_sparce, X_test_sparce, Y_train_sparce, Y_test_sparce = train_test_split(Xt, Y, test_size=0.25,random_state=1)


Xt : (32769, 10479)


In [9]:
modele_LR = linear_model.LogisticRegression(C=2.601, n_jobs=2, intercept_scaling=2.11, random_state=1)
modele_LR.fit(X_train_sparce, Y_train_sparce)

LogisticRegression(C=2.601, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=2.11, max_iter=100, multi_class='ovr',
          n_jobs=2, penalty='l2', random_state=1, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [10]:
print "Resultat de validation : ",validation(modele_LR, X_test_sparce, Y_test_sparce)
print "Resultat de validation : ",0.859862978103, "C=2.601" ," intercept_scaling=2.11"

Resultat de validation :  0.859862978103
Resultat de validation :  0.859862978103 C=2.601  intercept_scaling=2.11


In [11]:
print "score ",0.858838666775

score  0.858838666775


In [25]:
from sklearn.cross_validation import cross_val_score

def getScoreClf(DF , Y , clf ):
    #clf = clone(clf)
    return max(cross_val_score(clf , DF , Y , scoring='roc_auc' ,  cv = 5 , n_jobs=-1))

In [26]:
print getScoreClf(X_test_sparce,Y_test_sparce,modele_LR)

0.841066004837


<h3><span style="color: #c36b0f;">Naive Bayes</span></h3>

In [13]:
modele_NB = naive_bayes.BernoulliNB(alpha=0.041)
%time modele_NB.fit(X_train_sparce, Y_train_sparce)

CPU times: user 12.1 ms, sys: 2.74 ms, total: 14.8 ms
Wall time: 14.1 ms


BernoulliNB(alpha=0.041, binarize=0.0, class_prior=None, fit_prior=True)

In [14]:
print "Resultat de validation : ",validation(modele_NB, X_test_sparce, Y_test_sparce)
print "Resultat de validation : ",0.839242216379, "  (alpha=0.041, binarize=0.0, class_prior=None, fit_prior=True)"

Resultat de validation :  0.839242216379
Resultat de validation :  0.839242216379   (alpha=0.041, binarize=0.0, class_prior=None, fit_prior=True)


# Soft VotingClassifier

In [None]:
from itertools import product
from sklearn.ensemble import VotingClassifier

# Training classifiers

#, ('LR', modele_LR), ('NB', modele_NB)

eclf = VotingClassifier(estimators=[('DT', Modele_3.best_estimator_), ('RF', modele_RF), ('ABC', modele_ABC), ('XT', modele_XT), ('GB', modele_GB)], voting='soft', 
                        weights=[1,4,5,3,2])

%time eclf = eclf.fit(X_train[var], Y_train)

In [None]:
print "Resultat de validation = ",validation(eclf, X_test[var], Y_test)

In [4]:
#regroupe les modalités des variables rares, pour un seuil donné 
def regroupe(df, column, seuil):
    
    #Valeurs les plus communes
    ss = pd.DataFrame(data=df[column].value_counts())
    selected = ss[column][ss[column]==1]
    column_num = df.columns.get_loc(column)
    for x in np.array(selected.index) :
        for line_num in df[column][df[column]==x].index.values:
            df.set_value(line_num, column_num, np.array(selected.index)[0], takeable=True)
        

In [5]:
regroupe(df, "RESOURCE", 1)

# Submit

In [233]:
test = pd.read_csv('test.csv', index_col=0)

In [234]:
preds = eclf.predict_proba(test[var])[:,1]

In [235]:
submissions = pd.DataFrame(data=preds, columns=["ACTION"], index = test.index)

In [236]:
submissions.to_csv("sampleSubmission.csv")

In [109]:
#Approche filtre
def variance_selection(X,n_features):
    
    variance = [np.var(X.getcol(i).todense()) for i in range(X.shape[1])]
    variance = sorted( zip(variance,range(X.shape[1])) ,reverse=True)[:n_features]   
        
    return [x[1] for x in variance]

for n_features in range(1000,X.shape[1],100):
    print n_features
    good_features = variance_selection(X_train_sparce,n_features)
    print 'Selected features : ', good_features
    Xts=sparse.hstack([X_train_sparce[:,j] for j in good_features]).tocsr()
    gs = performance(Xts, Y_train_sparce)
    print gs.best_score_, gs.best_params_
    
    print '***************************\n'