In [1]:
import datetime
import pandas as pd
import numpy as np
import datetime

import sklearn
from catboost import CatBoostClassifier, Pool, cv
from sklearn import model_selection , metrics   #Additional scklearn functions

In [24]:
# Only the column names and dtypes of `train` are used before the model is
# fitted, so a five-row sample is enough here; the full file is read again
# further down when predictions are made.
train = pd.read_pickle('../data/merged_data/train.pkl').head(5)
history = pd.read_pickle('../data/merged_data/history.pkl')

In [25]:
# Discard the rows whose MILLESIME is one of the three listed values.
# NOTE(review): the comparison is against strings such as '2012.0' -- confirm
# that MILLESIME really stores strings and not floats.
history = history[~history['MILLESIME'].isin(['2012.0', '2013.0', '2014.0'])]

In [26]:
def impute_selected_variables(df, test, categ, quanti, dates):
    """Impute missing values in ``df`` and, optionally, in ``test``.

    The fill values (per-column mode for ``categ``, per-column mean for
    ``quanti``) are computed on ``df`` only and then applied to both frames.
    Date columns are forward-filled within each frame separately. Neither
    input frame is modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Reference frame; the fill values are derived from it.
    test : pandas.DataFrame or None
        Optional second frame, imputed with the values derived from ``df``.
    categ : list of str
        Columns filled with their most frequent value in ``df``.
    quanti : list of str
        Columns filled with their mean in ``df``.
    dates : list of str
        Columns forward-filled (each gap takes the previous row's value).
        Leading missing values stay missing: there is nothing to propagate.

    Returns
    -------
    tuple
        ``(imputed_df, imputed_test)``; ``imputed_test`` is ``None`` when
        ``test`` is ``None``.
    """
    _df = df.copy()
    _test = test.copy() if test is not None else None

    # mode() pads shorter columns with NaN, so row 0 holds one mode per column.
    # Columns without any observed value are skipped instead of raising.
    modes = _df[categ].mode()
    replace_values = {
        col: values.iloc[0]
        for col, values in modes.items()
        if len(values) > 0 and pd.notna(values.iloc[0])
    }
    replace_quanti = _df[quanti].mean()

    def _fill(frame):
        # Same order as before: categorical modes first, then numeric means.
        if replace_values:
            frame = frame.fillna(replace_values)
        if len(replace_quanti) > 0:
            frame = frame.fillna(replace_quanti)
        if len(dates) > 0:
            # ffill() replaces the deprecated fillna(method='pad').
            frame[dates] = frame[dates].ffill()
        return frame

    _df = _fill(_df)
    if _test is not None:
        # The test frame forward-fills its OWN dates; previously the dates of
        # ``df`` were written into it by index alignment.
        _test = _fill(_test)

    return _df, _test

In [27]:
#replace = train[categoricals].mode()
#replace_values = {k:v.iloc[0] for k,v in replace.items()}
def impute_contract_variables(df, categ=None, quanti=None, dates=None):
    """Fill missing contract variables with explicit sentinel values.

    Categorical/text columns receive the label ``'NAN'``, numeric columns
    ``-9999`` and date columns ``1970-01-01``. The input frame is not
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the contract columns.
    categ, quanti, dates : list of str, optional
        Columns to treat as categorical, numeric and date respectively.
        When omitted, the notebook-level lists ``categ_contract``,
        ``quanti_contract`` and ``date_contract`` are used, which is the
        previous behaviour.

    Returns
    -------
    pandas.DataFrame
        Imputed copy of ``df``.
    """
    categ = categ_contract if categ is None else categ
    quanti = quanti_contract if quanti is None else quanti
    dates = date_contract if dates is None else dates

    _df = df.copy()

    for var in categ:
        column = _df[var]
        # A categorical column only accepts values that are declared
        # categories. The dtype is checked explicitly rather than relying on
        # the exception type raised by fillna(), which changed between pandas
        # releases (ValueError in older ones, TypeError in recent ones).
        if (isinstance(column.dtype, pd.api.types.CategoricalDtype)
                and 'NAN' not in column.cat.categories):
            column = column.cat.add_categories(['NAN'])
        _df[var] = column.fillna('NAN')

    if len(quanti) > 0:
        quanti = list(quanti)
        _df[quanti] = _df[quanti].fillna(-9999)
    if len(dates) > 0:
        dates = list(dates)
        _df[dates] = _df[dates].fillna(datetime.datetime(1970, 1, 1))
    return _df

In [28]:
# Column families, derived from the dtypes of the train sample.
categoricals = train.columns[train.dtypes.eq('category')].tolist()
# The quantitative variables are listed by hand rather than inferred.
quantitative = [
    'NB_PASSAGE',
    'POINTS_FIDEL',
    'CONTRAT_TARIF',
    'PRIX_FACTURE',
]
dates = train.columns[train.dtypes.eq('datetime64[ns]')].tolist()

In [29]:
# Columns treated as contract variables: they are imputed separately, with
# sentinel values, by impute_contract_variables().
contract_variables = [
    'UPD_DATE',
    'DATE_DEBUT',
    'DATE_FIN',
    'STS_CODE',
    'OPTION',
    'FORMULE',
    'CONTRAT_TARIF',
    'PRIX_FACTURE',
]

In [30]:
# Non-contract variables: imputed with mode / mean / forward-fill.
categ_to_impute = list(set(categoricals).difference(contract_variables))
quanti_to_impute = list(set(quantitative).difference(contract_variables))
date_to_impute = list(set(dates).difference(contract_variables))

# Contract variables: imputed with sentinel values.
categ_contract = list(set(categoricals) & set(contract_variables))
quanti_contract = list(set(quantitative) & set(contract_variables))
date_contract = list(set(dates) & set(contract_variables))

# `history` is imputed with statistics computed on `history` itself (no
# second frame is passed), then its contract variables are filled separately.
history, _ = impute_selected_variables(
    df=history,
    test=None,
    categ=categ_to_impute,
    quanti=quanti_to_impute,
    dates=date_to_impute,
)
history = impute_contract_variables(history)

In [31]:
def keep_train_features(history, train):
    """Restrict ``history`` to the feature columns of ``train`` and add the label.

    Parameters
    ----------
    history : pandas.DataFrame
        Must contain every column of ``train`` except ``'target'``, plus
        ``'MOTIF_ANNULATION_CODE'``.
    train : pandas.DataFrame
        Only its column names are used.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected columns, in the same order as in ``train``,
        followed by a boolean ``'canceled'`` column that is True where
        ``MOTIF_ANNULATION_CODE`` is not null.
    """
    # Keep train's own column order (de-duplicated). Going through a set made
    # the feature order depend on hash iteration order, i.e. vary across runs.
    cols_to_keep = list(dict.fromkeys(col for col in train.columns if col != 'target'))
    history_light = history[cols_to_keep].copy()
    history_light['canceled'] = history['MOTIF_ANNULATION_CODE'].notna()

    return history_light

In [32]:
def drop_variables(df):
    """Return ``df`` without the identifier and high-cardinality columns.

    Every listed column must be present in ``df``. The per-column notes are
    the author's original remarks (modality counts are not verified here).

    Columns that were considered but are deliberately kept:
    AUTEUR_INCIDENT (2088 modalities), MARQUE_LIB (167), OPTION (80, options
    to be extracted), COMMENTAIRE_BI (400k, NLP), RESOURCE_ID (4033),
    L2_ORGANISATION_ID (151), CODE_INSTALLATION (17).
    """
    discarded = (
        'INSTANCE_ID',
        'INCIDENT_NUMBER',
        'TYPE_VOIE',
        'NATURE_CODE',          # 313 modalities, should be regrouped into 5
        'MODELE_CODE',          # 10k modalities
        'CODE_POSTAL',          # 5800 modalities (keep only the first 2 digits?)
        'L2_ORGA_CODE_POSTAL',  # 147 modalities (may be redundant with L2_ORGANISATION_ID)
        'L2_ORGA_VILLE',        # 146 modalities (may be redundant with other organisation variables)
        'RACHAT_CODE',          # 312 modalities (try binarising?)
    )
    return df.drop(columns=list(discarded))

In [33]:
def commentaire_bi(df, min_count=100):
    """Replace the free-text ``COMMENTAIRE_BI`` column by derived features.

    The comment is upper-cased, then summarised by five new columns:

    * ``COMMENTAIRE_BI_common``: the comment itself when it occurs more than
      ``min_count`` times in ``df``, otherwise the label ``"Rare"``;
    * ``nb_char_commentaire``: number of characters;
    * ``nb_mots_commentaire``: number of whitespace-separated words;
    * ``has_number_commentaire``: True when the comment contains a digit;
    * ``is_empty_commentaire``: True when the comment is exactly ``'.'``.

    The original ``COMMENTAIRE_BI`` column is removed from the result and the
    input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``COMMENTAIRE_BI`` without missing
        values (``len()`` is applied to every entry).
    min_count : int, default 100
        A comment is kept as its own category only when its frequency is
        strictly greater than this threshold.

    Returns
    -------
    pandas.DataFrame
    """
    _df = df.copy()

    comments = _df['COMMENTAIRE_BI'].str.upper()
    frequencies = comments.value_counts()
    common_comments = frequencies[frequencies > min_count].index

    # NOTE(review): the set of "common" comments is recomputed on whichever
    # frame is passed in, so it can differ between the frame the model is
    # fitted on and the frames it later scores -- confirm this is intended.
    _df['COMMENTAIRE_BI_common'] = comments.where(comments.isin(common_comments), "Rare")

    _df['nb_char_commentaire'] = [len(txt) for txt in comments]
    _df['nb_mots_commentaire'] = [len(txt.split()) for txt in comments]
    _df['has_number_commentaire'] = [any(char.isdigit() for char in txt) for txt in comments]
    _df['is_empty_commentaire'] = [(txt == '.') for txt in comments]

    return _df.drop(columns='COMMENTAIRE_BI')

In [34]:
def add_dates_features(data):
    """Derive calendar and duration features, then drop the raw date columns.

    Expects the datetime columns ``CRE_DATE_GZL``, ``INSTALL_DATE``,
    ``SCHEDULED_START_DATE``, ``SCHEDULED_END_DATE``, ``DATE_DEBUT``,
    ``DATE_FIN`` and ``UPD_DATE``. All durations are whole days.

    Unlike the previous version, the caller's frame is not modified: like the
    other helpers of this notebook, the function returns a new DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        ``data`` without the seven date columns, followed by the derived
        features.
    """
    date_columns = ['CRE_DATE_GZL', 'INSTALL_DATE', 'SCHEDULED_START_DATE',
                    'SCHEDULED_END_DATE', 'DATE_DEBUT', 'DATE_FIN', 'UPD_DATE']
    created = data['CRE_DATE_GZL']
    scheduled_start = data['SCHEDULED_START_DATE']

    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    features = data.drop(columns=date_columns)

    # Installation age in (approximate) years: whole days floor-divided by 365.
    features['age_installation'] = (created - data['INSTALL_DATE']).dt.days // 365

    # Calendar parts of the call date. Weekdays are ISO numbers (Monday=1 ...
    # Sunday=7); they are integers but might be treated as categorical.
    features['mois_appel'] = created.dt.month
    features['joursemaine_appel'] = created.dt.dayofweek + 1
    features['jour_appel'] = created.dt.day

    # Same calendar parts for the scheduled intervention start.
    features['mois_intervention'] = scheduled_start.dt.month
    features['joursemaine_intervention'] = scheduled_start.dt.dayofweek + 1
    features['jour_intervention'] = scheduled_start.dt.day

    features['duree_avant_intervention'] = (scheduled_start - created).dt.days
    features['duree_prevue'] = (data['SCHEDULED_END_DATE'] - scheduled_start).dt.days
    features['temps_depuis_debut_contrat'] = (created - data['DATE_DEBUT']).dt.days
    # Computed as call date minus contract end, so the value is negative while
    # the contract end lies in the future. Original author's note: often NaN?
    # (consider filling with 0).
    features['temps_jusqua_fin_contrat'] = (created - data['DATE_FIN']).dt.days
    features['temps_depuis_maj_contrat'] = (created - data['UPD_DATE']).dt.days

    return features

In [35]:
# Feature pipeline for the history frame: keep train's columns plus the
# 'canceled' label, drop unused columns, then derive comment and date features.
# Rows are not filtered here (no dropna is applied).
history = (
    history
    .pipe(keep_train_features, train)
    .pipe(drop_variables)
    .pipe(commentaire_bi)
    .pipe(add_dates_features)
)

In [36]:
# Feature matrix (everything except the label) and boolean label.
X = history.drop(columns='canceled')
y = history['canceled']

In [15]:
# Drop the notebook's reference to the prepared frame: X and y are the only
# objects used from here on.
del history

## predict on train and test

In [16]:
# Ratio of non-canceled to canceled rows, later passed to CatBoost as
# scale_pos_weight. The counts use the vectorised Series.sum().
pos_neg_ratio = y.eq(False).sum() / y.eq(True).sum()

In [17]:
# Every category / bool / object column is declared to CatBoost as a
# categorical feature, identified by its positional index in X.
categoricals = X.select_dtypes(include=['category', 'bool', 'object']).columns
categorical_features_indices = list(map(X.columns.get_loc, categoricals))

In [18]:
# Classifier fitted on the whole history frame. scale_pos_weight receives the
# negative/positive count ratio computed above.
model = CatBoostClassifier(
    learning_rate=0.5,
    iterations=50,
    scale_pos_weight=pos_neg_ratio,
    eval_metric="AUC",
)

In [19]:
model.fit(
    X=X,
    y=y,
    cat_features=categorical_features_indices,
    logging_level='Verbose',  # prints one progress line per boosting iteration
)

0:	total: 10.3s	remaining: 8m 26s
1:	total: 25.2s	remaining: 10m 4s
2:	total: 40.6s	remaining: 10m 35s
3:	total: 55.8s	remaining: 10m 41s
4:	total: 1m 8s	remaining: 10m 19s
5:	total: 1m 21s	remaining: 9m 54s
6:	total: 1m 34s	remaining: 9m 41s
7:	total: 1m 47s	remaining: 9m 25s
8:	total: 2m	remaining: 9m 9s
9:	total: 2m 10s	remaining: 8m 40s
10:	total: 2m 22s	remaining: 8m 25s
11:	total: 2m 35s	remaining: 8m 11s
12:	total: 2m 46s	remaining: 7m 54s
13:	total: 2m 58s	remaining: 7m 39s
14:	total: 3m 10s	remaining: 7m 25s
15:	total: 3m 23s	remaining: 7m 11s
16:	total: 3m 35s	remaining: 6m 57s
17:	total: 3m 50s	remaining: 6m 50s
18:	total: 4m 5s	remaining: 6m 40s
19:	total: 4m 18s	remaining: 6m 28s
20:	total: 4m 34s	remaining: 6m 19s
21:	total: 4m 46s	remaining: 6m 4s
22:	total: 4m 57s	remaining: 5m 49s
23:	total: 5m 12s	remaining: 5m 38s
24:	total: 5m 25s	remaining: 5m 25s
25:	total: 5m 38s	remaining: 5m 12s
26:	total: 5m 54s	remaining: 5m 1s
27:	total: 6m 7s	remaining: 4m 48s
28:	total: 6m

<catboost.core.CatBoostClassifier at 0x7fda47794400>

In [20]:
# Remove the training matrices from the namespace. Any cell below that still
# reads X or y has to be run before this one, or rebuild them first.
del X, y

In [38]:
# Reload the full train set (only a five-row sample was kept earlier) and run
# it through the same preparation steps as the history frame.
train = pd.read_pickle('../data/merged_data/train.pkl')
train = train.drop('target', axis=1)

# NOTE(review): the imputation statistics are computed on `train` itself, not
# reused from `history` -- confirm this is intended.
train, _ = impute_selected_variables(train, None, categ_to_impute, quanti_to_impute, date_to_impute)
train = impute_contract_variables(train)

train = drop_variables(train)
train = commentaire_bi(train)
train = add_dates_features(train)
# Align the columns with the features the model was fitted on. The names are
# read from the fitted model because X has already been deleted above, so
# `X.columns` raises NameError when the notebook is run top to bottom.
train = train[model.feature_names_]

In [49]:
# Predicted class for every train row.
canceled_pred = model.predict(train)
# Second column of predict_proba -- presumably the probability of the
# positive class (canceled == True); verify against model.classes_.
canceled_proba_pred = model.predict_proba(train)[:, 1]

In [53]:
# Persist both predictions as two CSV columns (with header, without index).
features_canceled = pd.DataFrame(
    dict(canceled_pred=canceled_pred, canceled_proba_pred=canceled_proba_pred)
)
features_canceled.to_csv('features_canceled_train.csv', header=True, index=False)

In [50]:
# Load the test set and run it through the same preparation steps as the
# history frame.
test = pd.read_pickle('../data/merged_data/test.pkl')

# NOTE(review): the imputation statistics are computed on `test` itself, not
# reused from `history` -- confirm this is intended.
test, _ = impute_selected_variables(test, None, categ_to_impute, quanti_to_impute, date_to_impute)
test = impute_contract_variables(test)

test = drop_variables(test)
test = commentaire_bi(test)
test = add_dates_features(test)
# Align the columns with the features the model was fitted on. The names are
# read from the fitted model because X has already been deleted above, so
# `X.columns` raises NameError when the notebook is run top to bottom.
test = test[model.feature_names_]

In [54]:
# Predicted class for every test row.
canceled_pred = model.predict(test)
# Second column of predict_proba -- presumably the probability of the
# positive class (canceled == True); verify against model.classes_.
canceled_proba_pred = model.predict_proba(test)[:, 1]

In [55]:
# Persist both predictions as two CSV columns (with header, without index).
features_canceled = pd.DataFrame(
    dict(canceled_pred=canceled_pred, canceled_proba_pred=canceled_proba_pred)
)
features_canceled.to_csv('features_canceled_test.csv', header=True, index=False)

## train and validate

In [None]:
# Hold out a validation part of (X, y) with scikit-learn's default split
# proportion; the seed is fixed so the split is repeatable.
# This cell needs X and y to still be defined.
X_train_train, X_train_val, y_train_train, y_train_val = model_selection.train_test_split(
    X, y, random_state=101
)

In [None]:
# The full matrices are no longer needed once the train/validation parts exist.
del X, y

In [None]:
# Ratio of non-canceled to canceled rows in the training part, used as
# scale_pos_weight. The counts use the vectorised Series.sum().
pos_neg_ratio = y_train_train.eq(False).sum() / y_train_train.eq(True).sum()

In [None]:
# Display the class ratio that will be passed to the model as scale_pos_weight.
pos_neg_ratio

In [None]:
# Every category / bool / object column is declared to CatBoost as a
# categorical feature, identified by its positional index in the training part.
categoricals = X_train_train.select_dtypes(include=['category', 'bool', 'object']).columns
categorical_features_indices = list(map(X_train_train.columns.get_loc, categoricals))

In [None]:
# Classifier for the train/validation experiment: fixed seed, iteration-based
# overfitting detector (od_type / od_wait), and the best iteration on the
# evaluation set is kept (use_best_model).
model = CatBoostClassifier(
    random_seed=42,
    iterations=50,
    scale_pos_weight=pos_neg_ratio,
    eval_metric="AUC",
    use_best_model=True,
    od_type='Iter',
    od_wait=40,
)

In [None]:
model.fit(
    X=X_train_train,
    y=y_train_train,
    eval_set=(X_train_val, y_train_val),  # validation part, scored with eval_metric
    cat_features=categorical_features_indices,
    logging_level='Verbose',  # verbose text logging is enabled
)