In [1]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics

In [2]:
# Load train.csv into the DataFrame df. Remember to adjust the path
# to wherever the file is stored on your machine.
df = pd.read_csv("train.csv")

In [3]:
# Preview the first 20 rows to check that the file was parsed as expected.
df.head(20)

Unnamed: 0,ACTION,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
0,1,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,1,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,1,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,1,42680,5905,117929,117930,119569,119323,123932,19793,119325
5,0,45333,14561,117951,117952,118008,118568,118568,19721,118570
6,1,25993,17227,117961,118343,123476,118980,301534,118295,118982
7,1,19666,4209,117961,117969,118910,126820,269034,118638,126822
8,1,31246,783,117961,118413,120584,128230,302830,4673,128231
9,1,78766,56683,118079,118080,117878,117879,304519,19721,117880


In [4]:
# Class balance of the target column ACTION: absolute counts plus the share of
# each class in percent of all rows.
# NOTE(review): the comment previously here spoke of a TARGET column and of
# "a little less than 4% unhappy customers". This dataset has no TARGET column,
# and the displayed output shows about 94.2% of rows with ACTION == 1 and about
# 5.8% with ACTION == 0, so that text looks copy-pasted from another notebook.
# Either way the dataset is strongly imbalanced.
data = pd.DataFrame(df.ACTION.value_counts())
data['Percentage'] = 100*data['ACTION']/df.shape[0]
data

Unnamed: 0,ACTION,Percentage
1,30872,94.210992
0,1897,5.789008


<h2><span style="color: #3366ff;">Nettoyage des données</span></h2>

<h3><span style="color: #03660f;"> Identifier les colonnes inutiles</span></h3>

In [5]:
#Identifier les colonnes inutiles (dont l'écart type est null)

Bruit=[]
for i in xrange(len(df.columns.values)):
    #pour chaque colonne "df[x]"
    if np.std(df[df.columns.values[i]]) == 0.0:
        #on recupère la liste des noms des colonnes "df.columns.values" et on la parcours
        Bruit.append(df.columns.values[i])
print 'Le nombre des colonnes inutiles est : ',len(Bruit)

#Les colonnes dont std = 0 , on va donc les supprimer parcequ'il est 
#n'apportera pas grand chose pour l'apprentissage (c'est que du bruit)

for i in xrange(len(Bruit)):
    df.drop(Bruit[i], axis=1, inplace=True)

print "done"

Le nombre des colonnes inutiles est :  0
done


In [6]:
def resemblance(column1 , column2, frame=None):
    """Return the fraction of rows in which two columns hold equal values.

    Parameters
    ----------
    column1, column2 : column labels
        The two columns to compare element by element.
    frame : pandas.DataFrame, optional
        Frame to read the columns from. When omitted, the notebook-level
        ``df`` is looked up at call time, so existing two-argument calls keep
        working unchanged.

    Returns
    -------
    float
        Number of matching rows divided by the total number of rows
        (1.0 means the two columns are identical row by row).
    """
    # Fall back to the global frame only when no explicit frame is supplied.
    source = df if frame is None else frame
    a = np.asarray(source[column1])
    b = np.asarray(source[column2])
    # The mean of the boolean mask is matching rows / total rows.
    return (a == b).mean()

In [7]:
# Candidate columns: every column of df except the first one.
Liste=df.columns.values[1:]
# Minimum fraction of equal rows (as returned by resemblance) for two columns
# to be treated as duplicates; 1.0 requires them to match on every row.
seuil=1.0

# resemble will receive the names of the dropped columns (filled further down);
# jump collects the positions, within Liste, of the columns to drop.
resemble=[]
jump = []
for i in xrange(len(Liste)):
    if i not in jump:
        # Column i is kept: every other column that matches it and is not yet
        # flagged gets flagged for removal.
        for j in xrange(len(Liste)):
            if i!=j and resemblance(Liste[i], Liste[j]) >= seuil:
                if j not in jump:
                    #resemble.append(Liste[j])
                    jump.append(j)
    else:
        # Column i is already flagged: gather the not-yet-flagged columns that
        # match it and, when there are at least two, flag all but the last one.
        tempo=[]
        for j in xrange(len(Liste)):
            if i!=j and resemblance(Liste[i], Liste[j]) >= seuil:
                if j not in jump:
                    tempo.append(j)
        if len(tempo) >= 2:
            #resemble.extend(tempo[:-1])
            jump.extend(tempo[:-1])
print "Nombre des colonnes qui se ressemblent : ", len(jump)

def double(jump):
    """Return every entry of ``jump`` that repeats an earlier entry.

    The first occurrence of a value is never reported; each later occurrence
    is, in input order. An empty result means all entries are distinct.
    """
    entries = list(jump)
    # An entry is a repeat exactly when it already appears before its position.
    return [value for pos, value in enumerate(entries) if value in entries[:pos]]

#Delete les colonnes dont les valeurs se ressemblent
if len(double(jump))==0:
    for i in xrange(len(jump)):
        df.drop([Liste[jump[i]]], axis=1, inplace=True)
        resemble.append(Liste[jump[i]])
    print "..... droped"
else:
    print "ERROR..."


print "done"

Nombre des colonnes qui se ressemblent :  0
..... droped
done


In [8]:
import seaborn as sns

# Pairwise correlations between the columns of df.
# NOTE(review): iloc[1:,:] skips the first *row* of df, not a column. If the
# intent was to leave a column out of the correlation matrix, this slice does
# not do that. Confirm before relying on these numbers.
cor_mat = df.iloc[1:,:].corr()

# Keep only notable correlations and discard the ones equal to 1.0
# (which removes the self-correlations on the diagonal).
threshold = 0.1
important_corrs = (cor_mat[abs(cor_mat) > threshold][cor_mat != 1.0]) \
    .unstack().dropna().to_dict()
# Every pair appears twice, as (a, b) and (b, a). Sorting the key and passing
# the (pair, value) tuples through a set keeps a single entry per pair.
unique_important_corrs = pd.DataFrame(
    list(set([(tuple(sorted(key)), important_corrs[key]) \
    for key in important_corrs])), columns=['attribute pair', 'correlation'])
# Order rows by decreasing absolute correlation. argsort returns positions,
# so they are applied with the positional indexer .iloc rather than .ix.
descending = abs(unique_important_corrs['correlation']).argsort()[::-1]
unique_important_corrs = unique_important_corrs.iloc[descending.values]
unique_important_corrs

Unnamed: 0,attribute pair,correlation
1,"(ROLE_FAMILY, ROLE_FAMILY_DESC)",-0.180576
3,"(ROLE_FAMILY_DESC, ROLE_TITLE)",0.170687
2,"(ROLE_CODE, ROLE_TITLE)",0.155918
0,"(ROLE_CODE, ROLE_FAMILY)",-0.148617
4,"(MGR_ID, ROLE_FAMILY)",-0.118315


In [9]:
# Number of columns left in df after the cleaning steps above.
print " Le nombre des variables restante est: ",len(df.columns.values)

 Le nombre des variables restante est:  10


<br/><br/><br/>

```python
##### cor_mat = df.iloc[:,:-1].corr()

f, ax = plt.subplots(figsize=(15, 12))
##### Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(cor_mat,linewidths=.5, ax=ax);

##### only important correlations and not auto-correlations
threshold = 0.95
important_corrs = (cor_mat[abs(cor_mat) > threshold][cor_mat != 1.0]) \
    .unstack().dropna().to_dict()
unique_important_corrs = pd.DataFrame(
    list(set([(tuple(sorted(key)), important_corrs[key]) \
    for key in important_corrs])), columns=['attribute pair', 'correlation'])
##### sorted by absolute value
unique_important_corrs = unique_important_corrs.ix[
    abs(unique_important_corrs['correlation']).argsort()[::-1]]
unique_important_corrs
```

<br/><br/><br/>

In [10]:
def matric_relation(df, q):
    """Build a matrix of per-group distinct-value counts between columns.

    The first column of ``df`` is skipped. For every ordered pair of distinct
    remaining columns ``(j, i)``, the rows are grouped by ``j``, the number of
    distinct values of ``i`` is counted inside each group, and the
    ``q``-quantile of those counts is stored at ``matric.loc[j, i]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; ``df.columns[1:]`` are the columns compared.
    q : float
        Quantile level, expected to lie in [0, 1].

    Returns
    -------
    pandas.DataFrame or None
        Square frame indexed and labelled by ``df.columns[1:]`` whose diagonal
        is left empty (NaN). When ``q`` is outside [0, 1] an error message is
        printed and ``None`` is returned.
    """
    # Guard clause: reject anything that is not a valid quantile level.
    if not (q<=1 and q>=0):
        print("ERREUR : Entrer une valeur comprise entre 0 ≤ q ≤ 1")
        return None

    # Names of the DataFrame columns, without the first column.
    Col_Name = df.columns[1:].values
    matric = pd.DataFrame(index=list(Col_Name), columns=list(Col_Name))

    for j in Col_Name:
        # The grouping depends only on j, so build it once per row of the matrix.
        grouped = df.groupby(j)
        for i in Col_Name:
            if i != j:
                distinct_per_group = grouped[i].apply(lambda x: len(np.unique(x)))
                matric.loc[j, i] = distinct_per_group.quantile(q)
    return matric


In [11]:
# q = 1.0 takes the maximum over groups: cell (row j, column i) shows the
# largest number of distinct values of i found within a single value of j.
matric_relation(df,1.0)

Unnamed: 0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
RESOURCE,,587.0,53.0,68.0,193.0,121.0,246.0,36.0,121.0
MGR_ID,97.0,,7.0,7.0,8.0,8.0,11.0,5.0,8.0
ROLE_ROLLUP_1,5609.0,1745.0,,16.0,330.0,241.0,1146.0,61.0,241.0
ROLE_ROLLUP_2,1482.0,348.0,9.0,,76.0,104.0,265.0,37.0,104.0
ROLE_DEPTNAME,332.0,304.0,40.0,45.0,,46.0,136.0,17.0,46.0
ROLE_TITLE,2375.0,592.0,55.0,65.0,163.0,,153.0,1.0,1.0
ROLE_FAMILY_DESC,3128.0,705.0,47.0,57.0,171.0,19.0,,8.0,19.0
ROLE_FAMILY,4085.0,946.0,87.0,113.0,193.0,26.0,291.0,,26.0
ROLE_CODE,2375.0,592.0,55.0,65.0,163.0,1.0,153.0,1.0,


In [12]:
def Columns_equal(df, quanti):
    """List the column pairs that map one-to-one onto each other.

    For each quantile level ``q`` in ``quanti`` (processed in the given order)
    the matrix from ``matric_relation(df, q)`` is computed. A pair of distinct
    columns is retained when the matrix holds 1 in both directions for it.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; its first column is skipped.
    quanti : iterable of float
        Quantile levels to try. A level outside [0, 1] prints an error message
        and is skipped.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Column 1"``, ``"Column 2"`` and ``"quantile"``, with one row
        per retained pair (stored in the orientation first encountered).
        ``"quantile"`` is the largest level at which the pair was retained.
    """
    duplicate = pd.DataFrame(columns=["Column 1","Column 2","quantile"])
    # Names of the DataFrame columns, without the first column.
    Col_Name = df.columns[1:].values

    for q in quanti:
        print(q)  # progress trace, one line per quantile level
        if not (q<=1 and q>=0):
            print("ERREUR : Entrer une valeur comprise entre 0 ≤ q ≤ 1")
            continue

        mat = matric_relation(df, q)
        for i in Col_Name:
            for j in Col_Name:
                if i == j or not (mat.loc[j, i] == 1 and mat.loc[i, j] == 1):
                    continue

                # Look for the pair among the rows already recorded, in either
                # orientation. Only the (i, j) orientation refreshes the stored
                # quantile; the mirrored visit (j, i) happens in this same pass.
                known = False
                for k in range(len(duplicate)):
                    first = duplicate.iat[k, 0]
                    second = duplicate.iat[k, 1]
                    if first == j and second == i:
                        known = True
                    if first == i and second == j:
                        known = True
                        if duplicate.iat[k, 2] < q:
                            duplicate.iat[k, 2] = q

                if not known:
                    duplicate.loc[len(duplicate)] = [i, j, q]
    return duplicate

In [15]:
# Search for one-to-one column pairs at several quantile levels.
tt=Columns_equal(df, [1.0, 0.99, 0.98, 0.7])

1.0
0.99
0.98
0.7


In [16]:
# Display the column pairs found by Columns_equal.
tt

Unnamed: 0,Column 1,Column 2,quantile
0,ROLE_TITLE,ROLE_CODE,1.0
1,ROLE_ROLLUP_1,ROLE_ROLLUP_2,0.7


## <span style="color:#b36c8f">On supprime la colonne 'ROLE_CODE' et on garde 'ROLE_TITLE' <br/>(celle qui nous donne un meilleur score)</span>

In [13]:
# Feature columns selected for encoding: every column of df except the target
# ACTION and ROLE_CODE.
var=['RESOURCE', 'MGR_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2','ROLE_DEPTNAME', 'ROLE_TITLE', 'ROLE_FAMILY_DESC', 'ROLE_FAMILY']

In [14]:
print "Count of unique values of each column in train set"
print df.apply(lambda x: len(x.unique()))
print

Count of unique values of each column in train set
ACTION                 2
RESOURCE            7518
MGR_ID              4243
ROLE_ROLLUP_1        128
ROLE_ROLLUP_2        177
ROLE_DEPTNAME        449
ROLE_TITLE           343
ROLE_FAMILY_DESC    2358
ROLE_FAMILY           67
ROLE_CODE            343
dtype: int64



In [15]:
# Merge the rare categories of a column into a single one, for a given threshold.
def regroupe(df, column, seuil):
    """Collapse the rare values of ``column`` into one value, in place.

    A value is rare when it occurs at most ``seuil`` times in ``df[column]``.
    Every row holding a rare value is overwritten with the first rare value in
    ``value_counts()`` order. With ``seuil=1`` this merges all values that
    occur exactly once, which is what the previous implementation did for any
    ``seuil`` because the argument was ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame modified in place.
    column : column label
        Column whose rare values are merged.
    seuil : int
        Largest number of occurrences for a value to count as rare.

    Returns
    -------
    None
    """
    counts = df[column].value_counts()
    rare = counts[counts <= seuil].index
    if len(rare) == 0:
        # Nothing is rare enough: leave the frame untouched.
        return
    # Label-based boolean assignment, so it stays correct whatever index df has.
    df.loc[df[column].isin(rare), column] = rare[0]
        

In [188]:
# Merge the RESOURCE values that occur only once into a single value (modifies df in place).
regroupe(df, "RESOURCE", 1)

<br/><br/><br/>
<h5> Les arbres de décision</h5>
<h5> AdaBoostClassifier</h5>
<p>An AdaBoost classifier is a meta-estimator that begins by fitting a classifier on the original dataset and then fits additional copies of the classifier on the same dataset but where the weights of incorrectly classified instances are adjusted such that subsequent classifiers focus more on difficult cases.</p>
<h5> RandomForestClassifier</h5>
<h5> ExtraTreesClassifier</h5>
<p>In practice, RFs are often more compact than ETs. ETs are generally cheaper to train from a computational point of view but <b>can grow much bigger</b>.</p>
<h5> GradientBoostingClassifier :</h5>
<p>The way GBMs work is by starting with a rough prediction and then building a <b>series of decision trees</b>, with each tree in the series trying to correct the prediction error of the tree before it.</p><br/>
<h5 style="color: #c36b0f;"> LogisticRegression</h5>
<h5 style="color: #c36b0f;"> Naive Bayes</h5>
<br/><br/><br/>

## Echantillonnage de données

In [16]:
Y = df.ACTION
X = df.drop(['ACTION'], axis=1)

# diviser X et Y en training and testing
from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25,random_state=1)

df1 = pd.DataFrame(Y_train.value_counts())
df1['Percentage'] = 100*df1['ACTION']/len(Y_train)
pr1=100*len(Y_train)/(len(df)+0.0)
print 'train ',pr1,'%\n\n', df1

df2 = pd.DataFrame(Y_test.value_counts())
df2['Percentage'] = 100*df2['ACTION']/len(Y_test)
pr2=100*len(Y_test)/(len(df)+0.0)
print '\n\ntest',pr2,'% \n\n',df2

train  74.9977112515 %

   ACTION  Percentage
1   23142   94.165039
0    1434    5.834961


test 25.0022887485 % 

   ACTION  Percentage
1    7730   94.348834
0     463    5.651166


In [257]:
# One-hot encode the selected feature columns (cast to str first); the
# recorded output shows the result is a sparse matrix.
# NOTE(review): in the cells visible here the models are fitted on X_train,
# not on data_sparce. Confirm whether this encoding is meant to be used.
from sklearn.preprocessing import OneHotEncoder 
data_sparce=OneHotEncoder().fit_transform(df[var].astype(str))
# NOTE(review): a bare expression that is not the last line of a cell displays
# nothing, so the next line has no visible effect.
data_sparce
print type(data_sparce)

<32769x15283 sparse matrix of type '<type 'numpy.float64'>'
	with 262152 stored elements in Compressed Sparse Row format>

<h3><span style="color: #3366ff;">CHOIX DU MODELE A UTILISER</span></h3>

In [13]:
# Prediction / scoring function for the validation dataset.
def validation(clf1, Xv, Yv):
    """Return the ROC AUC of a fitted classifier on held-out data.

    The score passed to ``roc_auc_score`` is column 1 of
    ``clf1.predict_proba(Xv)``; ``Yv`` holds the true labels.
    """
    positive_scores = clf1.predict_proba(Xv)[:, 1]
    auc = roc_auc_score(Yv, positive_scores)
    return auc

## Arbres de décision

In [254]:
# Model choice: decision trees.

# Scoring metric used by the grid search.
metric = 'roc_auc'

# Hyper-parameter grid for the grid search: both values range over 1..13.
params = {
    'max_depth': range(1,14),
    'min_samples_leaf': range(1,14),
}

# Base estimator, seeded for reproducibility.
clf = DecisionTreeClassifier(random_state=1)


In [255]:
# Training function: cross-validated grid search on the first dataset.
def performance(X,Y):
    """Run a 3-fold stratified grid search and return the fitted search object.

    Uses the notebook-level ``clf`` (estimator), ``params`` (grid) and
    ``metric`` (scoring) at call time.
    """
    folds = StratifiedKFold(Y, n_folds=3, random_state=1)
    search = GridSearchCV(clf, params, scoring=metric, cv=folds)
    search.fit(X, Y)
    return search

# Prediction / scoring function for the validation dataset.
# NOTE(review): a function with this name and the same body is already defined
# in an earlier cell of this notebook; one definition is enough.
def validation(clf1, Xv, Yv):
    """Return the ROC AUC of a fitted classifier on held-out data.

    The score passed to ``roc_auc_score`` is column 1 of
    ``clf1.predict_proba(Xv)``; ``Yv`` holds the true labels.
    """
    positive_scores = clf1.predict_proba(Xv)[:, 1]
    auc = roc_auc_score(Yv, positive_scores)
    return auc

gs = performance(X_train,Y_train)
print "Resultat d'apprentissage : ",gs.best_score_, gs.best_params_

print "Resultat de validation : ",validation(gs.best_estimator_, X_test, Y_test)

Resultat d'apprentissage :  0.716071984535 {'max_depth': 13, 'min_samples_leaf': 13}
Resultat de validation :  0.779412627585


Resultat d'apprentissage :  0.746896548108 {'max_depth': 20, 'min_samples_leaf': 32}<br/>
Resultat de validation :  0.775431336774<br/><br/>
Resultat d'apprentissage :  0.716071984535 {'max_depth': 13, 'min_samples_leaf': 13}<br/>
Resultat de validation :  0.779412627585