In [1]:
import datetime
import pandas as pd
import numpy as np
import datetime

import sklearn
from catboost import CatBoostClassifier, Pool, cv
from sklearn import model_selection , metrics   #Additional scklearn functions
from sklearn.preprocessing import StandardScaler

In [2]:
import os
import sys

# Put the parent of the current working directory on the import path
# (presumably where the `src` package imported in a later cell lives).
module_path = os.path.abspath(os.pardir)

if module_path not in sys.path:
    # Appending only once keeps the cell idempotent when it is re-run.
    sys.path.append(module_path)

In [3]:
from importlib import reload
import src.home_service
# Re-execute the module so that edits made to it are picked up without
# restarting the kernel.
reload(src.home_service)
# NOTE(review): the star import hides where names come from. The helpers used
# in later cells (impute_selected_variables, impute_contract_variables,
# commentaire_bi, RareModalitiesGrouper, add_dates_features) are presumably
# defined in src.home_service -- confirm and consider importing them explicitly.
from src.home_service import *

In [4]:
# Load the merged train set, keeping only its first five rows, and the full
# history set. Pickle files execute code on load: only read trusted files.
# NOTE(review): the truncated `train` is later used for its column names and
# dtypes, but also to fit the rare-modality grouper -- fitting on 5 rows looks
# like a debugging leftover; confirm.
train = pd.read_pickle('../data/merged_data/train.pkl').iloc[:5, :]
history = pd.read_pickle('../data/merged_data/history.pkl')

In [5]:
# Discard the rows whose MILLESIME is one of the listed values. The values are
# matched as strings ('2012.0', ...), not as numbers.
history = history[~history['MILLESIME'].isin(['2012.0', '2013.0', '2014.0'])]

In [6]:
def drop_variables(df):
    """Return a new frame equal to ``df`` without a fixed list of columns.

    Every listed column must exist in ``df``; ``DataFrame.drop`` raises
    ``KeyError`` otherwise. ``df`` itself is left untouched.
    """
    to_remove = (
        'INSTANCE_ID',
        'INCIDENT_NUMBER',
        'TYPE_VOIE',
        'NATURE_CODE',          # 313 modalities, needs to be split into 5 modalities
        'MODELE_CODE',          # 10k modalities
        'CODE_POSTAL',          # 5800 modalities (only keep the first 2 digits?)
        'L2_ORGA_CODE_POSTAL',  # 147 modalities (might be redundant with L2_ORGANISATION_ID)
        'L2_ORGA_VILLE',        # 146 modalities, might be redundant with other organisation variables
        'RACHAT_CODE',          # 312 modalities (try binarising?)
    )
    # Columns considered for removal but currently kept:
    #   AUTEUR_INCIDENT    - 2088 modalities
    #   MARQUE_LIB         - 167 modalities
    #   OPTION             - 80 modalities, extract options
    #   COMMENTAIRE_BI     - NLP, 400k modalities
    #   RESOURCE_ID        - 4033 modalities
    #   L2_ORGANISATION_ID - 151 modalities
    #   CODE_INSTALLATION  - 17 modalities
    return df.drop(columns=list(to_remove))

In [7]:
# Column groups by dtype, restricted to the columns that `train` and `history`
# have in common. Going through sets means the resulting lists carry no
# particular column order.
history_columns = set(history.columns)
categoricals = list(set(train.columns[train.dtypes == 'category']) & history_columns)
dates = list(set(train.columns[train.dtypes == 'datetime64[ns]']) & history_columns)
# The quantitative columns are listed by hand (not derived from the dtypes).
quantitative = ['NB_PASSAGE', 'POINTS_FIDEL', 'CONTRAT_TARIF', 'PRIX_FACTURE']

In [8]:
# Columns treated as contract variables; they are imputed separately from the
# other columns further down.
contract_variables = [
    'UPD_DATE',
    'DATE_DEBUT',
    'DATE_FIN',
    'STS_CODE',
    'OPTION',
    'FORMULE',
    'CONTRAT_TARIF',
    'PRIX_FACTURE',
]

In [9]:
contract_set = set(contract_variables)

# Non-contract columns of each kind.
categ_to_impute = list(set(categoricals).difference(contract_set))
quanti_to_impute = list(set(quantitative).difference(contract_set))
date_to_impute = list(set(dates).difference(contract_set))

# Contract columns of each kind.
categ_contract = list(set(categoricals) & contract_set)
quanti_contract = list(set(quantitative) & contract_set)
date_contract = list(set(dates) & contract_set)

# Contract and non-contract variables are imputed separately.
# Only `history` is imputed here: `None` is passed as the second argument and
# the second return value is discarded (presumably the slot for a second frame
# filled with the same values -- confirm in src.home_service).
history, _ = impute_selected_variables(history, None, categ_to_impute, quanti_to_impute, date_to_impute)
history = impute_contract_variables(history, categ_contract, quanti_contract, date_contract)

In [10]:
def keep_train_features(history, train):
    """Restrict ``history`` to the feature columns of ``train`` and add the label.

    Parameters
    ----------
    history : pandas.DataFrame
        Must contain every column of ``train`` (except ``'target'``) as well
        as a ``'MOTIF_ANNULATION_CODE'`` column.
    train : pandas.DataFrame
        Only its column names are used.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected ``history`` columns, in the order in which they
        appear in ``train``, followed by a boolean ``'canceled'`` column that
        is True where ``'MOTIF_ANNULATION_CODE'`` is not null.

    Raises
    ------
    KeyError
        If a required column is missing from ``history``.
    """
    canceled = history['MOTIF_ANNULATION_CODE'].notnull()
    # Keep the column order of `train` instead of going through a set: set
    # iteration order over strings changes between interpreter runs, which
    # made the feature order (and everything derived from it) non-reproducible.
    # dict.fromkeys removes duplicates while preserving first-seen order.
    cols_to_keep = list(dict.fromkeys(col for col in train.columns if col != 'target'))
    history_light = history[cols_to_keep].copy()
    history_light['canceled'] = canceled

    return history_light

In [11]:
# Keep only the train feature columns plus the 'canceled' label, then remove
# the columns listed in drop_variables.
history = keep_train_features(history, train)
history = drop_variables(history)
# commentaire_bi is a project helper not shown here; presumably it rewrites
# the COMMENTAIRE_BI column -- TODO confirm.
history = commentaire_bi(history)

# NOTE(review): the grouper is fitted on `train` (cut down to 5 rows when it
# was loaded, and not passed through commentaire_bi) but applied to `history`
# -- confirm this is intended.
rmg = RareModalitiesGrouper(['COMMENTAIRE_BI'], 200)
rmg.fit(train)

history = rmg.transform(history)
#history = nature_code_split(history)
history = add_dates_features(history)
#history.dropna()

In [12]:
# Standardise the numeric columns of `history` in place. The scaler is fitted
# on `history` itself; boolean columns such as 'canceled' are not selected by
# the 'number' dtype filter.
nums = history.select_dtypes(include='number').columns
sc = StandardScaler()

history.loc[:, nums] = sc.fit_transform(history.loc[:, nums])

In [13]:
# Features are every column except the label; the label is the 'canceled' flag.
X = history.drop(columns='canceled')
y = history['canceled']

In [14]:
# Drop the name of the full frame; only X and y are used from here on.
del history

## Fit on history, then predict on train and test

In [15]:
# Despite its name this is (number of False labels) / (number of True labels);
# it is passed as scale_pos_weight when the model is built.
pos_neg_ratio = (y == False).sum() / (y == True).sum()

In [16]:
# Positions of the non-numeric feature columns, as needed by `cat_features`.
# Note that this rebinds `categoricals` (previously a list of column names
# shared by train and history) to the categorical columns of X.
categoricals = X.select_dtypes(include=['category', 'bool', 'object']).columns
categorical_features_indices = list(map(X.columns.get_loc, categoricals))

In [17]:
# Short, aggressive run: 50 boosting iterations with a learning rate of 0.5.
# The positive class is re-weighted by the False/True label count ratio.
model = CatBoostClassifier(
    iterations=50,
    learning_rate=0.5,
    scale_pos_weight=pos_neg_ratio,
    eval_metric="AUC",
)

In [18]:
# Fit on the whole history set (no evaluation set is supplied here).
model.fit(
    X,
    y,
    cat_features=categorical_features_indices,
    logging_level='Verbose',  # prints one timing line per iteration
)

0:	total: 17.3s	remaining: 14m 8s
1:	total: 34.8s	remaining: 13m 55s
2:	total: 49.7s	remaining: 12m 58s
3:	total: 1m 9s	remaining: 13m 24s
4:	total: 1m 27s	remaining: 13m 11s
5:	total: 1m 46s	remaining: 13m 1s
6:	total: 2m 2s	remaining: 12m 34s
7:	total: 2m 23s	remaining: 12m 30s
8:	total: 2m 38s	remaining: 12m 1s
9:	total: 2m 57s	remaining: 11m 48s
10:	total: 3m 11s	remaining: 11m 19s
11:	total: 3m 29s	remaining: 11m 4s
12:	total: 3m 47s	remaining: 10m 47s
13:	total: 4m 3s	remaining: 10m 26s
14:	total: 4m 20s	remaining: 10m 7s
15:	total: 4m 37s	remaining: 9m 49s
16:	total: 4m 55s	remaining: 9m 33s
17:	total: 5m 13s	remaining: 9m 17s
18:	total: 5m 36s	remaining: 9m 8s
19:	total: 5m 50s	remaining: 8m 46s
20:	total: 6m 6s	remaining: 8m 26s
21:	total: 6m 24s	remaining: 8m 9s
22:	total: 6m 43s	remaining: 7m 54s
23:	total: 7m 2s	remaining: 7m 37s
24:	total: 7m 23s	remaining: 7m 23s
25:	total: 7m 44s	remaining: 7m 8s
26:	total: 8m 8s	remaining: 6m 56s
27:	total: 8m 27s	remaining: 6m 38s
28:	

<catboost.core.CatBoostClassifier at 0x7f3802533e48>

In [19]:
# Remember the feature column order used for fitting; the train and test
# frames are re-indexed with it before prediction.
cols_order = X.columns

In [20]:
# The fitting data is no longer needed for prediction.
# NOTE(review): the cells under "train and validate" further down still
# reference X and y, so they fail with NameError when the notebook is run
# top to bottom after this cell.
del X, y

In [21]:
# Load the train set from its '_preproc_' pickle and remove the label. The
# imputation / drop_variables / commentaire_bi / add_dates_features steps are
# intentionally not re-applied here (presumably the pickle already contains
# their result -- confirm).
train = (
    pd.read_pickle('../data/merged_data/train_preproc_.pickle')
    .drop(columns='target')
)

# Same columns, in the same order, as the frame the model was fitted on.
train = train[cols_order]

In [22]:
# Second column of predict_proba, kept as the predicted probability, and the
# hard class prediction, both on the train set.
canceled_proba_pred = model.predict_proba(train)[:, 1]
canceled_pred = model.predict(train)

In [23]:
# Store both predictions as two columns of a csv file written to the current
# working directory (header row, no index column).
features_canceled = pd.DataFrame(
    {
        'canceled_pred': canceled_pred,
        'canceled_proba_pred': canceled_proba_pred,
    }
)
features_canceled.to_csv('features_canceled_train.csv', header=True, index=False)

In [24]:
# Load the test set from its '_preproc_' pickle and remove the
# 'canceled_proba_pred' column it already contains. As for the train set, the
# imputation / drop_variables / commentaire_bi / add_dates_features steps are
# intentionally not re-applied here (presumably already done -- confirm).
test = (
    pd.read_pickle('../data/merged_data/test_preproc_.pickle')
    .drop(columns='canceled_proba_pred')
)

# Same columns, in the same order, as the frame the model was fitted on.
test = test[cols_order]

In [25]:
# Second column of predict_proba, kept as the predicted probability, and the
# hard class prediction, both on the test set.
canceled_proba_pred = model.predict_proba(test)[:, 1]
canceled_pred = model.predict(test)

In [26]:
# Store both predictions as two columns of a csv file written to the current
# working directory (header row, no index column).
features_canceled = pd.DataFrame(
    {
        'canceled_pred': canceled_pred,
        'canceled_proba_pred': canceled_proba_pred,
    }
)
features_canceled.to_csv('features_canceled_test.csv', header=True, index=False)

## Train and validate (hold-out split)

In [None]:
# Hold-out split of the history features / labels with a fixed seed.
# NOTE(review): X and y are deleted by an earlier cell; they have to be
# rebuilt before this cell can run.
(
    X_train_train,
    X_train_val,
    y_train_train,
    y_train_val,
) = model_selection.train_test_split(X, y, random_state=101)

In [None]:
# Only the split parts are used from here on.
del X, y

In [None]:
# Despite its name this is (number of False labels) / (number of True labels)
# on the training part of the split; it is passed as scale_pos_weight below.
pos_neg_ratio = (y_train_train == False).sum() / (y_train_train == True).sum()

In [None]:
# Display the class ratio computed on the training part of the split.
pos_neg_ratio

In [None]:
# Positions of the non-numeric feature columns of the training split, as
# needed by `cat_features`.
categoricals = X_train_train.select_dtypes(include=['category', 'bool', 'object']).columns
categorical_features_indices = list(map(X_train_train.columns.get_loc, categoricals))

In [None]:
# Validation model: fixed seed, overfitting detector of type 'Iter' with a
# wait of 40 iterations, and the best iteration on the eval set is kept.
# The positive class is re-weighted by the False/True label count ratio.
model = CatBoostClassifier(
    iterations=50,
    random_seed=42,
    eval_metric="AUC",
    use_best_model=True,
    od_type='Iter',
    od_wait=40,
    scale_pos_weight=pos_neg_ratio,
)

In [None]:
# Fit on the training part of the split and evaluate on the held-out part.
model.fit(
    X_train_train,
    y_train_train,
    eval_set=(X_train_val, y_train_val),
    cat_features=categorical_features_indices,
    logging_level='Verbose',  # prints progress for every iteration
)