In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Import data

In [3]:
data_repo = "/home/julien/src/DSC_HOME_SERVICE/data/"
import_params = {'sep': '|', 'encoding': 'latin-1'}

In [4]:
date_cols = ['CRE_DATE', 'UPD_DATE', 'DATE_RESILIATION', 'DATE_DEBUT', 'DATE_FIN']
contract_history = pd.read_csv(data_repo + 'contract_history.csv', dtype={'NUM_CAMPAGNE': object}, parse_dates=date_cols, **import_params)

In [84]:
contract_history.dtypes

STS_CODE                         object
CONTRACT_NUMBER                   int64
CONTRACT_MODIFICATEUR            object
CRE_DATE                 datetime64[ns]
UPD_DATE                 datetime64[ns]
DATE_RESILIATION         datetime64[ns]
DATE_DEBUT               datetime64[ns]
DATE_FIN                 datetime64[ns]
INSTANCE_ID                       int64
FORMULE                          object
OPTION                           object
CONTRAT_TARIF                     int64
PRIX_FACTURE                      int64
CONDITION_REGLEMENT              object
MOTIF_RESILIATION                object
RENOUVELLEMENT_AGENCE            object
PRIX_FORMULE                      int64
PRIX_OPTION                       int64
NUM_CAMPAGNE                     object
dtype: object

In [49]:
# normalization to put everything in float and convert "N" to nan
contract_history['NUM_CAMPAGNE'] = contract_history['NUM_CAMPAGNE'].map(lambda x: float(x) if x not in ['N', ''] else np.nan)

In [38]:
equipment = pd.read_csv(data_repo + 'equipment.csv', dtype={'CODE_POSTAL': object}, **import_params)

In [40]:
equipment.dtypes

INSTANCE_ID                 int64
LOCATION_ID                 int64
ORGANISATION_ID             int64
PARTY_ID_OCC                int64
TYPE_OCC                   object
INSTALL_DATE               object
RACHAT_CODE                object
RACHAT_LIB                 object
RACHAT_DATE                object
NATURE_CODE                object
MARQUE_CODE               float64
MARQUE_LIB                 object
MODELE_CODE                object
MODELE_LIB                 object
USAGE_LOCAL                object
LOCALISATION_ORGANISME     object
COMPLEMENT_RUE             object
CODE_POSTAL                object
ESCALIER                   object
ETAGE                      object
NUMERO                     object
RUE                        object
PAYS                       object
TYPE_VOIE                  object
VILLE                      object
POINTS_FIDEL              float64
STOP_PHONING               object
CODE_GEN_EQUIPEMENT        object
CODE_FONCTION              object
CODE_ENERGIE  

In [44]:
code_eau_chaude = pd.read_csv(data_repo + 'nature_code_eau_chaude.csv', sep='|')
code_energie = pd.read_csv(data_repo + 'nature_code_energie.csv', sep='|')
code_fonction = pd.read_csv(data_repo + 'nature_code_fonction.csv', sep='|')
code_installation = pd.read_csv(data_repo + 'nature_code_installation.csv', sep='|')
code_specification = pd.read_csv(data_repo + 'nature_code_specification.csv', sep='|')

In [5]:
date_cols = ['CRE_DATE_GZL', 'SCHEDULED_START_DATE', 'SCHEDULED_END_DATE']
date_cols_history = ['ACTUAL_START_DATE', 'ACTUAL_END_DATE']

intervention_test = pd.read_csv(data_repo + 'intervention_test.csv', sep='|', encoding='latin-1', parse_dates=date_cols)
intervention_history = pd.read_csv(data_repo + 'intervention_history.csv', sep='|', encoding='latin-1', parse_dates=date_cols+date_cols_history)
intervention_train = pd.read_csv(data_repo + 'intervention_train.csv', sep='|', encoding='latin-1', parse_dates=date_cols)

In [134]:
intervention_test.dtypes

INSTANCE_ID                      int64
INCIDENT_NUMBER                  int64
INCIDENT_TYPE_ID                 int64
INCIDENT_TYPE_NAME              object
TYPE_BI                         object
NB_PASSAGE                     float64
MILLESIME                      float64
PROBLEM_CODE                    object
PROBLEM_DESC                   float64
AUTEUR_INCIDENT                  int64
ORIGINE_INCIDENT                object
COMMENTAIRE_BI                  object
SS_TR_FLAG                      object
TYPE_UT                         object
GRAVITE                         object
RESOURCE_ID                    float64
SCHEDULED_START_DATE    datetime64[ns]
SCHEDULED_END_DATE      datetime64[ns]
CRE_DATE_GZL            datetime64[ns]
dtype: object

In [52]:
intervention_train.dtypes

INSTANCE_ID               int64
INCIDENT_NUMBER           int64
INCIDENT_TYPE_ID          int64
INCIDENT_TYPE_NAME       object
TYPE_BI                  object
NB_PASSAGE              float64
MILLESIME               float64
PROBLEM_CODE             object
PROBLEM_DESC            float64
AUTEUR_INCIDENT           int64
ORIGINE_INCIDENT         object
COMMENTAIRE_BI           object
SS_TR_FLAG               object
TYPE_UT                  object
GRAVITE                  object
RESOURCE_ID             float64
SCHEDULED_START_DATE     object
SCHEDULED_END_DATE       object
CRE_DATE_GZL             object
target                    int64
dtype: object

In [14]:
intervention_history.dtypes

INSTANCE_ID                        int64
INCIDENT_NUMBER                    int64
DATE_SAISIE_RETOUR                object
INCIDENT_TYPE_ID                   int64
INCIDENT_TYPE_NAME                object
INCIDENT_STATUS_ID                 int64
INCIDENT_STATUS_NAME              object
INCIDENT_STATUS_CODE              object
INCIDENT_CLOSED_FLAG              object
TYPE_BI                           object
NB_PASSAGE                       float64
MILLESIME                        float64
PROBLEM_CODE                      object
PROBLEM_DESC                     float64
MOTIF_A_REVOIR                   float64
AUTEUR_INCIDENT                    int64
ORIGINE_INCIDENT                  object
COMMENTAIRE_BI                    object
SS_TR_FLAG                        object
TYPE_UT                           object
GRAVITE                           object
COMMENTAIRE_FIN_INTERV            object
MOTIF_ANNULATION_CODE             object
MOTIF_ANNULATION_DESC             object
RESOURCE_ID     

In [62]:
organisation = pd.read_csv(data_repo + 'organisation.csv', **import_params)

In [63]:
organisation.dtypes

L1_ORGANISATION_ID      int64
L1_NAME                object
L2_ORGANISATION_ID      int64
L2_NAME                object
ADRESSE                object
L2_ORGA_CODE_POSTAL     int64
L2_ORGA_VILLE          object
CIA                     int64
ORGANISATION_CODE      object
dtype: object

In [None]:
# what's the point of separating train and history ?

In [62]:
print(intervention_history.shape)
print(intervention_train.shape)
print(intervention_test.shape)

(5171308, 30)
(1048030, 20)
(226770, 19)


In [63]:
intervention_test['CRE_DATE_GZL'].min(), intervention_test['CRE_DATE_GZL'].max()

('2014-01-06', '2017-10-21')

In [60]:
intervention_train['CRE_DATE_GZL'].min(), intervention_train['CRE_DATE_GZL'].max()

('2014-01-06', '2017-10-21')

In [61]:
intervention_history['CRE_DATE_GZL'].min(), intervention_history['CRE_DATE_GZL'].max()

('2014-01-02', '2018-04-26')

## Are the observation in train and test also in history ? : 
train = yes
<br>
test = no --> impossible to build features based on history
<br>
note: 
* neither of tuples ('INSTANCE_ID', 'CRE_DATE_GZL', 'INCIDENT_NUMBER') nor ('INSTANCE_ID', 'CRE_DATE_GZL') in test set are present in history
* 'INSTANCE_ID' in test set are presents in history

** to create features based on history, we have to join test and history to gather the maximum of informations (if we can build intereting features based on that join **

In [34]:
# common observations in history and train ?
intervention_train.reset_index(inplace=True)
intervention_train.set_index(['INSTANCE_ID', 'CRE_DATE_GZL', 'INCIDENT_NUMBER'], inplace=True)

In [35]:
intervention_test.reset_index(inplace=True)
intervention_test.set_index(['INSTANCE_ID', 'CRE_DATE_GZL', 'INCIDENT_NUMBER'], inplace=True)

In [36]:
intervention_history.reset_index(inplace=True)
intervention_history.set_index(['INSTANCE_ID', 'CRE_DATE_GZL', 'INCIDENT_NUMBER'], inplace=True)

In [37]:
intervention_history.columns

Index(['index', 'DATE_SAISIE_RETOUR', 'INCIDENT_TYPE_ID', 'INCIDENT_TYPE_NAME',
       'INCIDENT_STATUS_ID', 'INCIDENT_STATUS_NAME', 'INCIDENT_STATUS_CODE',
       'INCIDENT_CLOSED_FLAG', 'TYPE_BI', 'NB_PASSAGE', 'MILLESIME',
       'PROBLEM_CODE', 'PROBLEM_DESC', 'MOTIF_A_REVOIR', 'AUTEUR_INCIDENT',
       'ORIGINE_INCIDENT', 'COMMENTAIRE_BI', 'SS_TR_FLAG', 'TYPE_UT',
       'GRAVITE', 'COMMENTAIRE_FIN_INTERV', 'MOTIF_ANNULATION_CODE',
       'MOTIF_ANNULATION_DESC', 'RESOURCE_ID', 'SCHEDULED_START_DATE',
       'SCHEDULED_END_DATE', 'ACTUAL_START_DATE', 'ACTUAL_END_DATE'],
      dtype='object')

In [38]:
intervention_train.sort_index(inplace=True)
intervention_test.sort_index(inplace=True)
intervention_history.sort_index(inplace=True)
intervention_train.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,index,INCIDENT_TYPE_ID,INCIDENT_TYPE_NAME,TYPE_BI,NB_PASSAGE,MILLESIME,PROBLEM_CODE,PROBLEM_DESC,AUTEUR_INCIDENT,ORIGINE_INCIDENT,COMMENTAIRE_BI,SS_TR_FLAG,TYPE_UT,GRAVITE,RESOURCE_ID,SCHEDULED_START_DATE,SCHEDULED_END_DATE,target
INSTANCE_ID,CRE_DATE_GZL,INCIDENT_NUMBER,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
10003,2016-12-19,80032876,538879,10005,Entretien,Entretien,1.0,2017.0,,,1513,Automatique,.,N,UT01,-,102107471.0,2017-01-19,2017-01-19,0
10003,2017-01-19,81119814,381986,10005,Entretien,Entretien,1.0,2017.0,,,33678,Téléphone,ARM VE,N,UT01,-,100000747.0,2017-02-02,2017-02-02,0
10003,2017-02-01,81505320,254996,10005,Entretien,Entretien,1.0,2017.0,,,1488,Téléphone,A-M / VISITE / REPORT AG. 02/02,N,UT01,-,102107471.0,2017-03-01,2017-03-01,0
10015,2015-11-12,68333220,776854,10005,Entretien,Entretien,1.0,2015.0,,,24270,Automatique,.,N,UT01,-,101895406.0,2015-11-26,2015-11-26,0
10015,2016-07-25,75830488,814224,10005,Entretien,Entretien,1.0,2016.0,,,1513,Automatique,.,N,UT01,-,102967936.0,2016-09-16,2016-09-16,0
10017,2016-12-23,80266556,164690,10005,Entretien,Entretien,1.0,2017.0,,,24270,Automatique,A-M / VISITE D ENTRETIEN,N,UT01,-,100000115.0,2017-02-13,2017-02-13,0
10021,2017-03-20,83022676,480769,10005,Entretien,Entretien,1.0,2017.0,,,1486,Téléphone,MATIN VE A CONFIRMER /MESSAGE REPONDEUR LE 20....,N,UT01,-,103379105.0,2017-03-29,2017-03-29,0
10021,2017-03-30,83345306,10698,10005,Entretien,Entretien,2.0,2017.0,,,1486,Téléphone,am ve,N,UT01,-,103379105.0,2017-04-03,2017-04-03,0
10023,2016-02-03,70993496,955900,10005,Entretien,Entretien,1.0,2016.0,,,1513,Automatique,.,N,UT01,-,101835384.0,2016-02-17,2016-02-17,0
10023,2016-02-08,71169146,326080,10005,Entretien,Entretien,1.0,2016.0,,,1483,Téléphone,DES 14H -- VISITE ENTRETIEN,N,UT01,-,102072460.0,2016-03-02,2016-03-02,1


In [31]:
# are the observations in train in history ?
commmon_intervention_train = intervention_train.index.intersection(intervention_history.index)

In [32]:
commmon_intervention_test = intervention_test.index.intersection(intervention_history.index)

In [33]:
# train file is an extract from history file with target variable added
print(commmon_intervention_train.values.shape)
print(commmon_intervention_test.values.shape)

(4255404,)
(1339681,)


In [29]:
history_with_target = intervention_history.join(intervention_train['target'], how='left')

In [25]:
history_with_target[['target', 'time_delta', 'ACTUAL_START_DATE', 'ACTUAL_END_DATE', 'INCIDENT_TYPE_NAME', 'SCHEDULED_START_DATE', 'SCHEDULED_END_DATE',  'INCIDENT_STATUS_NAME', 'TYPE_BI', 'MOTIF_ANNULATION_CODE', 'MOTIF_ANNULATION_DESC']][0:50]

NameError: name 'history_with_target' is not defined

## cleaning data

In [39]:
# common observations in history and train ?
intervention_train.reset_index(inplace=True)
intervention_train.set_index(['INSTANCE_ID', 'CRE_DATE_GZL'], inplace=True)

In [40]:
intervention_test.reset_index(inplace=True)
intervention_test.set_index(['INSTANCE_ID', 'CRE_DATE_GZL'], inplace=True)

In [41]:
intervention_history.reset_index(inplace=True)
intervention_history.set_index(['INSTANCE_ID', 'CRE_DATE_GZL'], inplace=True)

In [42]:
# duplicates
#intervention_train.drop_duplicates(inplace=True)
intervention_train.index.duplicated()

array([False, False, False, ..., False, False, False], dtype=bool)

In [43]:
# several events can be created the same day for the same instance in the train test but not in the test set. 
# Sometime they look like duplicates and sometimes not.
# they can have a different target, apparently depending on the scheduled date
# as I connot establish a clear rule to apply on them, I will keep all lines except pure duplicates (index and values)

# todo? : extract feature: nb of lines for each couple (instance_id, creation_date) ? 
#    --> not useful since it's always only on in test set
intervention_train[intervention_train.index.duplicated(keep=False)]

Unnamed: 0_level_0,Unnamed: 1_level_0,INCIDENT_NUMBER,index,INCIDENT_TYPE_ID,INCIDENT_TYPE_NAME,TYPE_BI,NB_PASSAGE,MILLESIME,PROBLEM_CODE,PROBLEM_DESC,AUTEUR_INCIDENT,ORIGINE_INCIDENT,COMMENTAIRE_BI,SS_TR_FLAG,TYPE_UT,GRAVITE,RESOURCE_ID,SCHEDULED_START_DATE,SCHEDULED_END_DATE,target
INSTANCE_ID,CRE_DATE_GZL,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
10150,2017-07-13,86270628,547368,10003,Dépannage,Dépannage,1.0,,,,42405,Téléphone,PLUS D EAU CHAUDE / 14H00-18H00,N,UT02,PP,100000146.0,2017-07-13,2017-07-13,1
10150,2017-07-13,86283522,437576,10003,Dépannage,Dépannage,1.0,,,,36927,Téléphone,pièce OKéchangeur MTS61302409S 1H00 MATIN,N,UT02,PP,100000146.0,2017-07-15,2017-07-15,1
10218,2017-04-04,83463744,155703,10003,Dépannage,Dépannage,1.0,,,,1614,Téléphone,s est mise à vibrer hier soir a tout coupé car...,N,UT02,PP,101453303.0,2017-04-04,2017-04-04,0
10218,2017-04-04,83486896,200778,10003,Dépannage,Dépannage,1.0,,,,1614,Téléphone,+EC,N,UT02,PP,101453303.0,2017-04-04,2017-04-04,0
10270,2015-09-11,66311942,1047707,10005,Entretien,Entretien,1.0,2015.0,,,1614,Téléphone,VE//REPORT DU 17.09,N,UT01,-,100000747.0,2015-09-17,2015-09-17,0
10270,2015-09-11,66311982,345120,10005,Entretien,Entretien,1.0,2015.0,,,1614,Téléphone,14H//REPORT VE DU 17.09,N,UT01,-,100000747.0,2015-10-07,2015-10-07,0
10282,2016-12-22,80204534,883195,10003,Dépannage,Dépannage,,,,,1614,Téléphone,NVX PROPRIET MR KERUZORE 0781588939//DEP PAYAN...,N,UT02,PP,102072460.0,2016-12-26,2016-12-26,0
10282,2016-12-22,80207526,987422,10003,Dépannage,Dépannage,1.0,,,,1614,Téléphone,IMPERATIF PRENDRE CONTRAT SIGNE SUR PLACE/PBL ...,N,UT02,PP,102072460.0,2016-12-26,2016-12-26,1
10449,2016-07-05,75380652,553361,10005,Entretien,Entretien,1.0,2016.0,,,21231,Téléphone,VE,N,UT01,-,103080996.0,2016-07-05,2016-07-05,0
10449,2016-07-05,75400802,563673,10003,Dépannage,Dépannage,1.0,,,,1489,Téléphone,A.M - EN SECURITE,N,UT02,PP,103080996.0,2016-07-05,2016-07-05,0


In [44]:
intervention_test[intervention_test.index.duplicated(keep=False)]

Unnamed: 0_level_0,Unnamed: 1_level_0,INCIDENT_NUMBER,index,INCIDENT_TYPE_ID,INCIDENT_TYPE_NAME,TYPE_BI,NB_PASSAGE,MILLESIME,PROBLEM_CODE,PROBLEM_DESC,AUTEUR_INCIDENT,ORIGINE_INCIDENT,COMMENTAIRE_BI,SS_TR_FLAG,TYPE_UT,GRAVITE,RESOURCE_ID,SCHEDULED_START_DATE,SCHEDULED_END_DATE
INSTANCE_ID,CRE_DATE_GZL,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1


In [136]:
# check NA
intervention_train.isnull().sum()

INSTANCE_ID                   0
INCIDENT_NUMBER               0
INCIDENT_TYPE_ID              0
INCIDENT_TYPE_NAME            0
TYPE_BI                       0
NB_PASSAGE                27046
MILLESIME                292829
PROBLEM_CODE            1041360
PROBLEM_DESC            1048030
AUTEUR_INCIDENT               0
ORIGINE_INCIDENT            253
COMMENTAIRE_BI              515
SS_TR_FLAG                    0
TYPE_UT                       0
GRAVITE                       0
RESOURCE_ID                2350
SCHEDULED_START_DATE          0
SCHEDULED_END_DATE            0
CRE_DATE_GZL                  0
target                        0
dtype: int64

In [137]:
def nan_check(series):
    print(series.name + '\n')
    print(series.isnull().sum())
    print(series.unique())
    print(series.value_counts())
    print('\n' * 3)

In [138]:
columns_with_nan = intervention_train.isnull().any()

In [139]:
intervention_train.loc[:, columns_with_nan].head(100)

Unnamed: 0,NB_PASSAGE,MILLESIME,PROBLEM_CODE,PROBLEM_DESC,ORIGINE_INCIDENT,COMMENTAIRE_BI,RESOURCE_ID
0,1.0,2017.0,,,Téléphone,.,101629356.0
1,1.0,2017.0,,,Téléphone,1er rdv,100206988.0
2,2.0,2013.0,,,Téléphone,16H45-VE,102156488.0
3,1.0,2014.0,,,Téléphone,.,102334599.0
4,1.0,2016.0,,,Téléphone,VTE 2016 APRES 9H,102460713.0
5,1.0,2017.0,,,Téléphone,ARM : SERENITE DV. RECUP.CONTRAT. VERIF TYPE A...,102983952.0
6,1.0,,,,Téléphone,PAS D EAU CHAUDE + PAS DE PRESSION VU LE 26.0...,100058901.0
7,1.0,2016.0,,,Téléphone,.,100663106.0
8,1.0,2017.0,,,Téléphone,ARM VE,102814875.0
9,1.0,2017.0,,,Automatique,.,103362101.0


In [52]:
intervention_train.loc[:, columns_with_nan].apply(nan_check, axis=0)

NB_PASSAGE

27046
[1.0 2.0 nan 3.0 4.0 5.0 6.0]
1.0    952087
2.0     64767
3.0      3957
4.0       166
5.0         6
6.0         1
Name: NB_PASSAGE, dtype: int64




MILLESIME

292829
[2017.0 2015.0 2016.0 nan 2014.0 2013.0 2018.0 2012.0]
2016.0    379902
2017.0    281321
2015.0     71502
2014.0     20021
2013.0      2408
2012.0        36
2018.0        11
Name: MILLESIME, dtype: int64




PROBLEM_CODE

1041360
[nan 'FE' 'EC' 'CH' 'AB' 'SC' 'TA' 'PU' 'AL' 'FG' 'PR' 'DJ' 'VM' 'VE']
FE    1517
CH    1307
EC    1301
SC    1217
AB     492
PR     363
TA     157
AL      84
FG      80
VE      74
DJ      45
PU      30
VM       3
Name: PROBLEM_CODE, dtype: int64




PROBLEM_DESC

1048030
[nan]
Series([], Name: PROBLEM_DESC, dtype: int64)




ORIGINE_INCIDENT

253
['Automatique' 'Téléphone' 'Courrier' 'Venue client' 'Mail' nan 'Répondeur'
 'Télé-alarme' 'Internet']
Téléphone       673242
Automatique     372626
Répondeur          851
Courrier           466
Mail               263
Venue client     

Name: RESOURCE_ID, Length: 4032, dtype: int64






NB_PASSAGE          None
MILLESIME           None
PROBLEM_CODE        None
PROBLEM_DESC        None
ORIGINE_INCIDENT    None
COMMENTAIRE_BI      None
RESOURCE_ID         None
dtype: object

**NB_PASSAGE** : nan will be replaced by value 1
<br>
**MILLESIME** : extract year from creation event date
<br>
**PROBLEM_CODE** : added a modality 'NC' for 'not communicated'
<br>
**PROBLEM_DESC** : variable is always nan, it will be removed
<br>
**ORIGINE_INCIDENT** : modality 'telephone' will be applied (could run some chi square test to see if there is any different between telephone and automatique regarding target variable)
<br>
**COMMENTAIRE_BI** : nan will be replaced by '.'
<br>
**RESOURCE_ID** : nan will be replaced by 0

In [7]:
# rewrite in a function using fillna (improve readability)

intervention_train.loc[intervention_train.NB_PASSAGE.isnull(), 'NB_PASSAGE'] = 1
intervention_train.loc[intervention_train.PROBLEM_CODE.isnull(), 'PROBLEM_CODE'] = 'NC'
intervention_train.loc[intervention_train.ORIGINE_INCIDENT.isnull(), 'ORIGINE_INCIDENT'] = 'NAN'
intervention_train.loc[intervention_train.COMMENTAIRE_BI.isnull(), 'COMMENTAIRE_BI'] = '.'
intervention_train.loc[intervention_train.RESOURCE_ID.isnull(), 'RESOURCE_ID'] = 0

In [128]:
intervention_train.index

RangeIndex(start=0, stop=1048030, step=1)

In [168]:
intervention_train.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1048030 entries, (10003, 2016-12-19 00:00:00) to (57307101, 2017-10-19 00:00:00)
Data columns (total 18 columns):
INCIDENT_NUMBER         1048030 non-null int64
INCIDENT_TYPE_ID        1048030 non-null int64
INCIDENT_TYPE_NAME      1048030 non-null object
TYPE_BI                 1048030 non-null object
NB_PASSAGE              1048030 non-null float64
MILLESIME               1048030 non-null int64
PROBLEM_CODE            1048030 non-null object
PROBLEM_DESC            0 non-null float64
AUTEUR_INCIDENT         1048030 non-null int64
ORIGINE_INCIDENT        1048030 non-null object
COMMENTAIRE_BI          1048030 non-null object
SS_TR_FLAG              1048030 non-null object
TYPE_UT                 1048030 non-null object
GRAVITE                 1048030 non-null object
RESOURCE_ID             1048030 non-null int64
SCHEDULED_START_DATE    1048030 non-null datetime64[ns]
SCHEDULED_END_DATE      1048030 non-null datetime64[ns]
target       

In [95]:
#TODO: create a function to do that
# not used here
nb_missing = sum(intervention_train.MILLESIME.isnull())
distrib = intervention_train.MILLESIME.value_counts()
weigths = distrib / distrib.sum()

fill_values = np.random.choice(distrib.index, 
                                 nb_missing,
                                 p=weigths)

intervention_train.loc[intervention_train.MILLESIME.isnull(), 'MILLESIME'] = fill_values

pd.Series(fill_values).value_counts()

In [8]:
#fill the missing millesime by year of creation date
missing_millesime = intervention_train.MILLESIME.isnull()
intervention_train.loc[missing_millesime, 'MILLESIME'] = intervention_train.loc[missing_millesime, 'CRE_DATE_GZL'].apply(lambda x: x.year)

In [9]:
intervention_train.drop('PROBLEM_DESC', axis=1, inplace=True)

In [10]:
# check no more missing values
intervention_train.isnull().sum()

INSTANCE_ID             0
INCIDENT_NUMBER         0
INCIDENT_TYPE_ID        0
INCIDENT_TYPE_NAME      0
TYPE_BI                 0
NB_PASSAGE              0
MILLESIME               0
PROBLEM_CODE            0
AUTEUR_INCIDENT         0
ORIGINE_INCIDENT        0
COMMENTAIRE_BI          0
SS_TR_FLAG              0
TYPE_UT                 0
GRAVITE                 0
RESOURCE_ID             0
SCHEDULED_START_DATE    0
SCHEDULED_END_DATE      0
CRE_DATE_GZL            0
target                  0
dtype: int64

In [11]:
# change type of int variables (from float to int, possible because no more nans)
intervention_train['MILLESIME'] = intervention_train['MILLESIME'].astype(int)
intervention_train['RESOURCE_ID'] = intervention_train['RESOURCE_ID'].astype(int)

### check ORIGINE_INCIDENT is not independant from target (it is not)

on the full table

In [114]:
import scipy

In [112]:
full_contingency = np.array(pd.crosstab(index=intervention_train['target'], columns=intervention_train['ORIGINE_INCIDENT']))

In [117]:
#origine incident is not independant from target
p_value = scipy.stats.chi2_contingency(full_contingency)[1]
print(p_value)

0.0


In [122]:
telephone_or_auto = intervention_train['ORIGINE_INCIDENT'].isin(['Téléphone', 'Automatique'])

In [123]:
intervention_train_extract = intervention_train.loc[telephone_or_auto, :]

only on auto or telephone modalities

In [124]:
limited_contingency =  np.array(pd.crosstab(index=intervention_train_extract['target'], columns=intervention_train_extract['ORIGINE_INCIDENT']))

In [142]:
p_value = scipy.stats.chi2_contingency(limited_contingency)[1]
print(p_value)

0.0


Origine_incident is correlated to target.
Question: what to do in test set ?
- option 1: infer from a model
- option 2: create a specific modality <---
- option 3: fill with specific value
- option 4: fill randomly

In [127]:
sum(intervention_test.ORIGINE_INCIDENT.isnull())

68

## Recreate target and complete training data with history

In [10]:
intervention_history.columns

Index(['INSTANCE_ID', 'INCIDENT_NUMBER', 'DATE_SAISIE_RETOUR',
       'INCIDENT_TYPE_ID', 'INCIDENT_TYPE_NAME', 'INCIDENT_STATUS_ID',
       'INCIDENT_STATUS_NAME', 'INCIDENT_STATUS_CODE', 'INCIDENT_CLOSED_FLAG',
       'TYPE_BI', 'NB_PASSAGE', 'MILLESIME', 'PROBLEM_CODE', 'PROBLEM_DESC',
       'MOTIF_A_REVOIR', 'AUTEUR_INCIDENT', 'ORIGINE_INCIDENT',
       'COMMENTAIRE_BI', 'SS_TR_FLAG', 'TYPE_UT', 'GRAVITE',
       'COMMENTAIRE_FIN_INTERV', 'MOTIF_ANNULATION_CODE',
       'MOTIF_ANNULATION_DESC', 'RESOURCE_ID', 'SCHEDULED_START_DATE',
       'SCHEDULED_END_DATE', 'ACTUAL_START_DATE', 'ACTUAL_END_DATE',
       'CRE_DATE_GZL'],
      dtype='object')

In [7]:
# delete canceled appointments from the training ?
# TODO: put a boolean in a preprocessing to do so or not
intervention_history.sort_values(['INSTANCE_ID', 'CRE_DATE_GZL', 'INCIDENT_NUMBER'], inplace=True)

In [8]:
groups = intervention_history.groupby('INSTANCE_ID')

In [9]:
get_time_delta = lambda s: s.diff(1).shift(-1)

In [18]:
#time_delta = groups.CRE_DATE_GZL.transform(get_time_delta)

In [19]:
#intervention_history['time_delta'] = time_delta

In [20]:
#intervention_history.reset_index(inplace=True)

In [11]:
intervention_history['time_delta'] = groups.ACTUAL_START_DATE.transform(get_time_delta)

In [14]:
intervention_history['next_incident_type'] = groups.INCIDENT_TYPE_NAME.transform(lambda s: s.shift(-1))

In [23]:
intervention_history['target_test'] = intervention_history['time_delta'].between(pd.Timedelta('1 days'), pd.Timedelta('182 days')) \
                                        & (intervention_history['next_incident_type'] != 'Entretien') \
                                        & intervention_history['MOTIF_ANNULATION_CODE'].isnull()

In [29]:
intervention_train.reset_index(inplace=True)
intervention_train.set_index(['INSTANCE_ID', 'CRE_DATE_GZL', 'INCIDENT_NUMBER'], inplace=True)

In [30]:
intervention_history.reset_index(inplace=True)
intervention_history.set_index(['INSTANCE_ID', 'CRE_DATE_GZL', 'INCIDENT_NUMBER'], inplace=True)

In [31]:
intervention_train.sort_index(inplace=True)
intervention_history.sort_index(inplace=True)

In [32]:
history_with_target = intervention_history.join(intervention_train['target'], how='left')

In [36]:
history_with_target.loc[~history_with_target.target.isnull(), ['target', 'target_test','time_delta', 'next_incident_type', 'MOTIF_ANNULATION_CODE', 'ACTUAL_START_DATE', 'ACTUAL_END_DATE', 'INCIDENT_TYPE_NAME', 'SCHEDULED_START_DATE', 'SCHEDULED_END_DATE',  'INCIDENT_STATUS_NAME', 'TYPE_BI', 'MOTIF_ANNULATION_CODE', 'MOTIF_ANNULATION_DESC']][50:100]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,target,target_test,time_delta,next_incident_type,MOTIF_ANNULATION_CODE,ACTUAL_START_DATE,ACTUAL_END_DATE,INCIDENT_TYPE_NAME,SCHEDULED_START_DATE,SCHEDULED_END_DATE,INCIDENT_STATUS_NAME,TYPE_BI,MOTIF_ANNULATION_CODE,MOTIF_ANNULATION_DESC
INSTANCE_ID,CRE_DATE_GZL,INCIDENT_NUMBER,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10135,2017-09-04,87371802,1.0,True,10 days,Dépannage,,2017-09-05,2017-09-05,Dépannage,2017-09-05,2017-09-05,3-A revoir,Dépannage,,
10135,2017-09-11,87530552,0.0,False,179 days,Entretien,,2017-09-15,2017-09-15,Dépannage,2017-09-15,2017-09-15,1-Réalisé,Dépannage,,
10145,2016-03-01,71826460,0.0,False,NaT,Entretien,A,NaT,NaT,Entretien,2016-04-05,2016-04-05,7-Annulé Savelys,Entretien,A,Annulation RDV
10145,2016-03-31,72726524,0.0,False,NaT,,,2016-05-02,2016-05-02,Entretien,2016-05-02,2016-05-02,1-Réalisé,Entretien,,
10149,2015-05-15,63191770,0.0,False,NaT,,,2015-06-18,2015-06-18,Entretien,2015-06-18,2015-06-18,1-Réalisé,Entretien,,
10150,2016-05-12,73912140,1.0,True,141 days,Dépannage,,2016-06-15,2016-06-15,Entretien,2016-06-15,2016-06-15,1-Réalisé,Entretien,,
10150,2016-11-02,78499460,0.0,False,200 days,Entretien,,2016-11-03,2016-11-03,Dépannage,2016-11-03,2016-11-03,1-Réalisé,Dépannage,,
10150,2017-04-19,83929378,1.0,True,52 days,Dépannage,,2017-05-22,2017-05-22,Entretien,2017-05-22,2017-05-22,1-Réalisé,Entretien,,
10150,2017-07-13,86270628,1.0,True,2 days,Dépannage,,2017-07-13,2017-07-13,Dépannage,2017-07-13,2017-07-13,3-A revoir,Dépannage,,
10150,2017-07-13,86283522,1.0,False,NaT,,,2017-07-15,2017-07-15,Dépannage,2017-07-15,2017-07-15,3-A revoir,Dépannage,,


Target test is not perfect but good enough for the moment.<br>
No need to spend more time on it as long as I am not sure I can use history (i.e. do other tables only have obs from training or the full data ?)

If improvment is needed: 
- delete last obs of each instance_id
- check for better handling of cancelled obs (mb take them out before computing time delta)

## Check if obs from other tables are in train / test / history
Yes, those tables are in train, test and history <br>
That means we can create features on them and use history to train on more datas

In [51]:
print(len(set(equipment.INSTANCE_ID) & set(intervention_train.INSTANCE_ID)))
print(len(set(equipment.INSTANCE_ID) & set(intervention_test.INSTANCE_ID)))
print(len(set(equipment.INSTANCE_ID) & set(intervention_history.INSTANCE_ID)))

462663
226770
795394


In [52]:
print(len(set(contract_history.INSTANCE_ID) & set(intervention_train.INSTANCE_ID)))
print(len(set(contract_history.INSTANCE_ID) & set(intervention_test.INSTANCE_ID)))
print(len(set(contract_history.INSTANCE_ID) & set(intervention_history.INSTANCE_ID)))

462663
221930
793033


In [54]:
print(len(set(intervention_train.INSTANCE_ID)))
print(len(set(intervention_test.INSTANCE_ID)))
print(len(set(intervention_history.INSTANCE_ID)))

462663
226770
795394


## merge everything

In [None]:
# intervention
# equipment
# organization



### Equipment

In [55]:
equipment.head()

Unnamed: 0,INSTANCE_ID,LOCATION_ID,ORGANISATION_ID,PARTY_ID_OCC,TYPE_OCC,INSTALL_DATE,RACHAT_CODE,RACHAT_LIB,RACHAT_DATE,NATURE_CODE,...,TYPE_VOIE,VILLE,POINTS_FIDEL,STOP_PHONING,CODE_GEN_EQUIPEMENT,CODE_FONCTION,CODE_ENERGIE,CODE_INSTALLATION,CODE_SPECIFICATION,CODE_EAU_CHAUDE
0,1240047,8227277,1143,1644459,PROPRIETAIRE,2003-02-01,,,,G-1-C-T-2,...,AV,SOLLIES PONT,1000.0,Y,G1,G,1.0,C,T,2
1,1240050,8227956,1143,1644453,PROPRIETAIRE,2007-06-01,,,,G-1-C-I-1,...,AV,SOLLIES PONT,1000.0,,G1,G,1.0,C,I,1
2,1240051,8234246,1143,1644840,PROPRIETAIRE,2001-06-01,,,,G-1-C-I-3,...,TRA,TOULON,1000.0,,G1,G,1.0,C,I,3
3,1240275,8235019,1143,1652047,PROPRIETAIRE,2003-11-01,,,,G-1-C-H-1,...,RUE,LA SEYNE SUR MER,1000.0,,G1,G,1.0,C,H,1
4,1240277,8234808,1143,1651224,PROPRIETAIRE,1996-07-01,,,,G-1-C-H-1,...,BD,LA SEYNE SUR MER,1000.0,,G1,G,1.0,C,H,1


In [56]:
equipment.columns

Index(['INSTANCE_ID', 'LOCATION_ID', 'ORGANISATION_ID', 'PARTY_ID_OCC',
       'TYPE_OCC', 'INSTALL_DATE', 'RACHAT_CODE', 'RACHAT_LIB', 'RACHAT_DATE',
       'NATURE_CODE', 'MARQUE_CODE', 'MARQUE_LIB', 'MODELE_CODE', 'MODELE_LIB',
       'USAGE_LOCAL', 'LOCALISATION_ORGANISME', 'COMPLEMENT_RUE',
       'CODE_POSTAL', 'ESCALIER', 'ETAGE', 'NUMERO', 'RUE', 'PAYS',
       'TYPE_VOIE', 'VILLE', 'POINTS_FIDEL', 'STOP_PHONING',
       'CODE_GEN_EQUIPEMENT', 'CODE_FONCTION', 'CODE_ENERGIE',
       'CODE_INSTALLATION', 'CODE_SPECIFICATION', 'CODE_EAU_CHAUDE'],
      dtype='object')

In [58]:
useless_variables = ['CODE_POSTAL', 'ETAGE', 'PAYS', 'POINTS_FIDEL', 'STOP_PHONING']
equipment.drop(useless_variables, axis=1, inplace=True)

### Contract
Some aggregation should be used in feature ingineering

In [85]:
useless_variables = [ 'CONTRACT_NUMBER', 'CONTRACT_MODIFICATEUR', 'CRE_DATE', 
       'CONDITION_REGLEMENT', 'MOTIF_RESILIATION', 'RENOUVELLEMENT_AGENCE',
       'PRIX_FORMULE', 'PRIX_OPTION', 'NUM_CAMPAGNE', 'DATE_RESILIATION']
contract_history.drop(useless_variables, axis=1, inplace=True)

In [74]:
contract_history.head(50)

Unnamed: 0,STS_CODE,UPD_DATE,DATE_RESILIATION,DATE_DEBUT,DATE_FIN,INSTANCE_ID,FORMULE,OPTION,CONTRAT_TARIF,PRIX_FACTURE
0,TERMINATED,2015-10-12 12:26:24,2015-04-10 00:00:00,2015-04-10 00:00:00,2016-04-09 00:00:00,15447048,SECURITE*,,77,77
1,TERMINATED,2015-03-19 04:12:36,2015-03-19 00:00:00,2015-03-19 00:00:00,2016-03-18 00:00:00,13172466,SECURITE*,,164,253
2,EXPIRED,2016-01-01 02:43:12,,2015-01-01 00:00:00,2015-12-31 00:00:00,37101538,SECURITE*,,162,0
3,TERMINATED,2015-03-21 04:17:27,2015-03-20 00:00:00,2015-03-24 00:00:00,2016-03-23 00:00:00,2028002,INTEGRAL***,,242,259
4,ACTIVE_KI,2015-05-10 00:16:44,,2015-03-24 00:00:00,2016-03-23 00:00:00,8774167,INTEGRAL***,,242,242
5,EXPIRED,2016-03-24 00:33:26,,2015-03-24 00:00:00,2016-03-23 00:00:00,8774167,INTEGRAL***,,242,242
6,TERMINATED,2015-03-01 05:24:54,2015-03-01 00:00:00,2015-03-01 00:00:00,2016-02-29 00:00:00,2923109,SECURITE*,RAMONAGE FIOUL,252,195
7,ACTIVE_KI,2015-09-08 04:08:18,,2015-09-08 00:00:00,2016-09-07 00:00:00,37253750,SECURITE*,,142,0
8,EXPIRED,2016-09-08 01:39:00,,2015-09-08 00:00:00,2016-09-07 00:00:00,37253750,SECURITE*,,142,0
9,TERMINATED,2015-03-29 03:07:21,2015-03-29 00:00:00,2015-03-29 00:00:00,2016-03-28 00:00:00,1933995,SECURITE*,,164,204


In [88]:
contract_history.isnull().sum()

STS_CODE         0
UPD_DATE         0
DATE_DEBUT       0
DATE_FIN         0
INSTANCE_ID      0
FORMULE          0
OPTION           0
CONTRAT_TARIF    0
PRIX_FACTURE     0
dtype: int64

In [87]:
contract_history.OPTION.fillna("NO", inplace=True)

In [89]:
contract_history.STS_CODE.value_counts()

ACTIVE_KI          1640274
FUTUR_RENOUVELE    1311850
EXPIRED             874995
TERMINATED          652049
PROPOSED            148072
SIGNED               52239
FUTUR                41413
QA_HOLD                109
HOLD                    95
Name: STS_CODE, dtype: int64

In [90]:
# find appropriate contract at the moment of the call
groups = contract_history[['INSTANCE_ID', 'UPD_DATE']].merge(intervention_train[['INSTANCE_ID', 'CRE_DATE_GZL']])

In [92]:
groups = groups.loc[groups.UPD_DATE <=  groups.CRE_DATE_GZL, :].groupby(['INSTANCE_ID', 'CRE_DATE_GZL'])

In [95]:
update_date = groups.max()

In [100]:
intervention_train = intervention_train.merge(update_date, left_on=['INSTANCE_ID', 'CRE_DATE_GZL'], right_index=True)

In [None]:
intervention_train = intervention_train.merge(contract_history, on='UPD_DATE')

In [None]:
# feature ingeneering:
# dates: compute timedelta between update_date and creation_date
# computation timedelta between creation_date and creation_date

### Organisation

Probably useless ...

In [65]:
organisation.columns

Index(['L1_ORGANISATION_ID', 'L1_NAME', 'L2_ORGANISATION_ID', 'L2_NAME',
       'ADRESSE', 'L2_ORGA_CODE_POSTAL', 'L2_ORGA_VILLE', 'CIA',
       'ORGANISATION_CODE'],
      dtype='object')

In [None]:
useless_variables = ['L1_ORGANISATION_ID', 'L2_ORGANISATION_ID', 'CIA']
equipment.drop(useless_variables, axis=1, inplace=True)

## merge

In [None]:
def prepare_data(data_):
    data = data_.merge(equipement_df, how='left', on='INSTANCE_ID').merge(orga_df, how='left', left_on='ORGANISATION_ID', right_on='L2_ORGANISATION_ID')
    contrat_history_s = data[['INCIDENT_NUMBER', 'INSTANCE_ID','CRE_DATE_GZL']].merge(contrat_history_df).query('CRE_DATE_GZL>=UPD_DATE_date')
    contrat_history_s = contrat_history_s.sort_values(['INCIDENT_NUMBER', 'UPD_DATE'], ascending=[True,False]).drop_duplicates(keep='first', subset=['INCIDENT_NUMBER'])
    data = data.merge(contrat_history_s, how='left').merge(nature_code_eau_chaude, how='left')
    data = data.merge(nature_code_energie, how='left').merge(nature_code_fonction, how='left')
    data = data.merge(nature_code_installation, how='left').merge(nature_code_specification, how='left')
    print(data.shape[0], data.INCIDENT_NUMBER.nunique(), data.INSTANCE_ID.nunique())
    return data

In [None]:
drop_columns = ['CRE_DATE_date', 'UPD_DATE_date']

In [None]:
train_data = prepare_data(intervention_train_df).drop(drop_columns, axis=1)

In [None]:
test_data = prepare_data(intervention_test_df).drop(drop_columns, axis=1)

In [None]:
history_data = prepare_data(intervention_history_df).drop(drop_columns, axis=1)

## Check unclear variables

In [66]:
intervention_train.columns

Index(['INSTANCE_ID', 'CRE_DATE_GZL', 'INCIDENT_NUMBER', 'index',
       'INCIDENT_TYPE_ID', 'INCIDENT_TYPE_NAME', 'TYPE_BI', 'NB_PASSAGE',
       'MILLESIME', 'PROBLEM_CODE', 'PROBLEM_DESC', 'AUTEUR_INCIDENT',
       'ORIGINE_INCIDENT', 'COMMENTAIRE_BI', 'SS_TR_FLAG', 'TYPE_UT',
       'GRAVITE', 'RESOURCE_ID', 'SCHEDULED_START_DATE', 'SCHEDULED_END_DATE',
       'target'],
      dtype='object')

## Univariate analysis

In [None]:
df.target.value_counts()


In [None]:
sns.countplot(x="deck", data=titanic, palette="Greens_d");

## Bivariate analysis

In [None]:
sns.pairplot(intervention_train, hue='target')

In [None]:
sns.barplot(x="MILLESIME", y="target", data=df)

In [None]:
sns.barplot(x="MILLESIME", y="target", hue="PROBLEM_CODE", data=df)

In [None]:
#type BI et type UT are the same
pd.crosstab(train.TYPE_BI, train.TYPE_UT)

In [None]:
#CIA, ORGANISATION_CODE and L2_organization_id are the same
pd.crosstab(train.CIA, train.L2_ORGANISATION_ID)

## first model

In [None]:
# preprocessing for categorical variables and dates

In [None]:
# svm

## build output file