In [1]:
%matplotlib inline

In [49]:
import datetime
import pandas as pd
import numpy as np

In [50]:
import sklearn.preprocessing
import sklearn.ensemble
from sklearn.pipeline import Pipeline
import sklearn.model_selection
import sklearn_pandas
from sklearn.preprocessing import Imputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import model_selection , metrics   #Additional scklearn functions
from sklearn.model_selection import GridSearchCV   #Perforing grid search
import scipy.sparse

In [65]:
train = pd.read_pickle('../data/merged_data/train.pkl')
#test = pd.read_pickle('../data/merged_data/test.pkl')

In [52]:
#train = train.sample(frac=0.5)

In [53]:
# preprocess
y_train = train['target'].values

# to try:
* h2O random forest: https://github.com/roamanalytics/roamresearch/blob/master/BlogPosts/Categorical_variables_in_tree_models/tree_categorical_variables.py
* optimize aucroc with keras and batch methods
* logistic L2, L1

In [54]:
drop = ['INSTANCE_ID', #400k modalities
        'INCIDENT_NUMBER']
drop_atm = ['AUTEUR_INCIDENT', # 2088 modalities
            'TYPE_VOIE',
            'NATURE_CODE', # 313 modalities, need to be splitted in 5 modalities
#            'MARQUE_LIB', # 167 modalities
#            'OPTION', # 80 modalities, extract options
            'MODELE_CODE', # 10k modalities --> sparse ?
            'COMMENTAIRE_BI', # NLP
#             'RESOURCE_ID', # 4033 modalities
            'CODE_POSTAL', # 5800 modalities (only get first 2 numbers ?)
            'L2_ORGA_CODE_POSTAL', # 147 modalities (might be redondent with L2_ORGANISATION_ID)
#            'L2_ORGANISATION_ID' #151 modalities
            'L2_ORGA_VILLE', # 146, might be redondent with other organisation variables
            'RACHAT_CODE' # 312 modalities (try binarising ?)         
#            'CODE_INSTALLATION' # 17 modalities
           ]

In [55]:
train.drop(drop + drop_atm + ['target'], axis=1, inplace=True)
#test.drop(drop + drop_atm, axis=1, inplace=True)

In [56]:
categoricals = train.columns[train.dtypes == 'category']
quantitative = ['NB_PASSAGE', 'POINTS_FIDEL', 'CONTRAT_TARIF', 'PRIX_FACTURE']
dates = train.columns[train.dtypes == 'datetime64[ns]']

### imputation of missing data

TODO: try imputing test based on test values, not train <br>
TODO: try diffrent strategy on imputing datas from contract since missing are present only in test set

In [57]:
replace = train[categoricals].mode()
replace_values = {k:v.iloc[0] for k,v in replace.items()}

In [58]:
train.fillna(replace_values, inplace=True)
#test.fillna(replace_values, inplace=True)

In [59]:
replace_quanti = train[quantitative].mean()
train.fillna(replace_quanti, inplace=True)
#test.fillna(replace_quanti, inplace=True)

In [60]:
#replace_dates = train[dates].mean()
train[dates] = train[dates].fillna(method='pad')
#test[dates] = test[dates].fillna(method='pad')

### Feature ingineering

In [14]:
# feature engineering

# NATURE_CODE has always the same semantic. elements might be extracted.

# (history) temps depuis dernière visite (pas forcément dispo sur le test)
# (history) déjà eu une casse sur ce matériel
# (history) temps depuis dernière casse
# (history) la dernière visite date de moins de 6 mois
# (history) nb interventions faires par la ressource
# (history) temps depuis la première intervention de la ressource
# (contract history) nb de fois que le contrat a été mis à jour sur les X dernières années

In [15]:
# get features from dates

# todo: use dt series accessor
def add_dates_features(data):
    data['age_installation'] = (data['CRE_DATE_GZL'] - data['INSTALL_DATE']).dt.days // 365
    data['mois_appel'] = data['CRE_DATE_GZL'].map(lambda x: x.month)
    data['joursemaine_appel'] = data['CRE_DATE_GZL'].map(lambda x: x.isoweekday()) #integer, might be considered categorical
    data['jour_appel'] = data['CRE_DATE_GZL'].map(lambda x: x.day)
    data['mois_intervention'] = data['SCHEDULED_START_DATE'].map(lambda x: x.month)
    data['joursemaine_intervention'] = data['SCHEDULED_START_DATE'].map(lambda x: x.isoweekday()) #integer, might be considered categorical
    data['jour_intervention'] = data['SCHEDULED_START_DATE'].map(lambda x: x.day)
    data['duree_avant_intervention'] = (data['SCHEDULED_START_DATE'] - data['CRE_DATE_GZL']).dt.days
    data['duree_prevue'] = (data['SCHEDULED_END_DATE'] - data['SCHEDULED_START_DATE']).dt.days
    data['temps_depuis_debut_contrat'] = (data['CRE_DATE_GZL'] - data['DATE_DEBUT']).dt.days
    data['temps_jusqua_fin_contrat'] = (data['CRE_DATE_GZL'] - data['DATE_FIN']).dt.days  #souvent nan ? (mettre 0)
    data['temps_depuis_maj_contrat'] = (data['CRE_DATE_GZL'] - data['UPD_DATE']).dt.days 

    data.drop(['CRE_DATE_GZL', 'INSTALL_DATE', 'SCHEDULED_START_DATE', 'SCHEDULED_END_DATE', 'DATE_DEBUT', 'DATE_FIN', 'UPD_DATE'], axis=1, inplace=True)
    return data



# get features from text fields
    # nb mots
    # présence de chiffres
    # mot en particulier
    # vide ou pas
    # Sequence mining + afd
    
# variables continues
    # créer des seuils pour introduire de la non linéarité ?


In [16]:
train = add_dates_features(train)
#test = add_dates_features(test)

### Group rare modalities

In [77]:
class RareModalitiesGrouper(BaseEstimator, TransformerMixin):
    '''Group rare modalities from categorical variables in a pandas dataframe in a RARE modality'''

    def __init__(self, columns, min_occurences):
        self.columns = columns
        self.min_occurences = min_occurences
        self.rare_modalities_dict = dict()
        
    def fit(self, df, y=None):
        for column in self.columns:
            value_counts = df[column].value_counts()
            rare_modalities = value_counts.where(value_counts < self.min_occurences).dropna().index
            self.rare_modalities_dict[column] = list(rare_modalities)
        return self

    def transform(self, df):
        _df = df.copy()
        for column in self.columns:
            mask = _df[column].isin(self.rare_modalities_dict[column])
            
            try:
                _df[column] = _df[column].cat.add_categories(['RARE'])
            except ValueError as e:
                print('Handled value error exception in column ', column, ': ', e)
            finally:    
                _df.loc[mask, column] = 'RARE'
                _df[column] = _df[column].cat.remove_unused_categories()
        return _df

In [78]:
rmg = RareModalitiesGrouper(['RESOURCE_ID'], 200)

In [79]:
rmg.fit(train)

RareModalitiesGrouper(columns=['RESOURCE_ID'], min_occurences=200)

In [80]:
train = rmg.transform(train)

## One hot encoding

In [22]:
class PandasOneHotEncoder(BaseEstimator, TransformerMixin):
    '''OneHotEncoder based on pandas.get_dummies which handle new modalities in transform step by ignoring them'''

    def __init__(self, columns, drop_first=False, sparse=False):
        self.columns = columns
        self.categories_dict = dict()
        self.drop_first = drop_first
        self.sparse = sparse
        
    def fit(self, df, y=None):
        self.categories_dict = {column: df[column].cat.categories for column in self.columns}
        return self

    def transform(self, df):
        _df = df.copy()
        for column in self.columns:
            _df[column] = _df[column].cat.set_categories(self.categories_dict[column])
            
        _df_encoded = pd.get_dummies(_df, columns=self.columns, drop_first=self.drop_first, sparse=self.sparse)
        #return scipy.sparse.csr_matrix(_df_encoded)
        return _df_encoded

### Normalize variance (mean + variance on numerical features)
note: we do not normalize mean because we are working with a sparse matrix here

In [24]:
#ssc = StandardScaler(with_mean=False)
#train_encoded = ssc.fit_transform(train_encoded)

# Modelisation

### split train / val

In [26]:
# split train in train, cv (will be replaced by cross validation for parameters tuning)
# stratify ?
X_train_train, X_train_val, y_train_train, y_train_val = sklearn.model_selection.train_test_split(train, y_train, random_state=101)

### Pipelines

In [27]:
pipeline = Pipeline([
    ('group_modalities', RareModalitiesGrouper(columns=categoricals, min_occurences=1000)),
    ('one_hot_encoding', PandasOneHotEncoder(columns=categoricals)),
    ('standardization', StandardScaler(with_mean=False)),
    ('dimensionality_reduction', IncrementalPCA(200)),
    #need to make a sparse matrix in output of standard scaler
    ('clf', XGBClassifier(learning_rate=0.1, 
                      n_estimators=140, 
                      max_depth=20,
                      min_child_weight=1, 
                      gamma=0, 
                      subsample=0.8, 
                      colsample_bytree=0.8,
                      objective= 'binary:logistic', 
                      nthread=7, 
                      scale_pos_weight=1, 
                      eval_metric='auc',
                      seed=29))
])

In [28]:
#TODO: try with std and PCA to see computation time
#TODO: finish feature engineering (without history)
#Try random forest with cross entropy
#Find best random forest and best xgboost models tester : (scale weights pos=sum(negative instances) / sum(positive instances))

In [29]:
%%time 
pipeline.fit(X=X_train_train, y=y_train_train)

CPU times: user 6h 21min 22s, sys: 14min 25s, total: 6h 35min 47s
Wall time: 1h 43min 37s


Pipeline(memory=None,
     steps=[('group_modalities', RareModalitiesGrouper(columns=Index(['INCIDENT_TYPE_ID', 'TYPE_BI', 'MILLESIME', 'PROBLEM_CODE',
       'ORIGINE_INCIDENT', 'GRAVITE', 'TYPE_OCC', 'MARQUE_LIB', 'USAGE_LOCAL',
       'PAYS', 'STOP_PHONING', 'CODE_GEN_EQUIPEMENT', 'CODE_FONCTION',
       'CODE_ENERGIE', 'C...tate=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=29, silent=True, subsample=0.8))])

In [30]:
pred = pipeline.predict(X_train_val)

In [31]:
sklearn.metrics.roc_auc_score(y_train_val, pred)

0.5708658049811135

### Gridsearch

In [None]:
from sklearn.linear_model import LogisticRegression
param_grid = dict(reduce_dim=[None, PCA(5), PCA(10)],
                  clf=[SVC(), LogisticRegression()],
                  clf__C=[0.1, 10, 100])
grid_search = GridSearchCV(pipe, param_grid=param_grid)

#### dimensionnality reduction
Failed because of memory error (16GO RAM): LDA, MCA, PCA, SparsePCA, SelectKBest <br>
Succeded: IncrementalPCA <br>
To Try on server: LDA, MCA

In [51]:
#LDA
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
#lda = LinearDiscriminantAnalysis(n_components=20)
#X_train_train = lda.fit_transform(X_train_train, y=y_train_train)
#X_train_val = lda.transform(X_train_val, y=y_train_val)

### SVM

In [None]:
from sklearn import svm
clf = svm.LinearSVC(class_weight='balanced')
clf.fit(X_train_train_lda, y_train_train[-100000:])

In [None]:
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB(alpha=.01)
clf.fit(X_train_train[16:], y_train_train)

## Keras simple NN

## Logistic regression L2, L1