In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import pickle

## Import data

In [5]:
data_repo = "/home/julien/src/DSC_HOME_SERVICE/data/"
IMPORT_PARAMS = {'sep': '|', 'encoding': 'latin-1'}

In [6]:
def import_contract_history(file):
    date_cols = ['CRE_DATE', 'UPD_DATE', 'DATE_RESILIATION', 'DATE_DEBUT', 'DATE_FIN']
    contract_history = pd.read_csv(file, dtype={'NUM_CAMPAGNE': object}, parse_dates=date_cols, **IMPORT_PARAMS)

    
    for c in ['CRE_DATE', 'UPD_DATE']:
        contract_history['{}_date'.format(c)] = pd.to_datetime(contract_history[c].map(lambda x: datetime.datetime.date(x)))
    
    contract_history['NUM_CAMPAGNE'] = contract_history['NUM_CAMPAGNE'].map(lambda x: x if x not in ['N', ''] else "NAN")
    contract_history.OPTION.fillna("NO", inplace=True)
    
    useless_variables = [ 'CONTRACT_NUMBER', 'CONTRACT_MODIFICATEUR', 'CRE_DATE', 
                       'CONDITION_REGLEMENT', 'MOTIF_RESILIATION', 'RENOUVELLEMENT_AGENCE',
                       'PRIX_FORMULE', 'PRIX_OPTION', 'NUM_CAMPAGNE', 'DATE_RESILIATION']
    
    contract_history.drop(useless_variables, axis=1, inplace=True)
    
    return contract_history

def import_equipment(file):
    return pd.read_csv(file, dtype={'CODE_POSTAL': object}, parse_dates=['INSTALL_DATE', 'RACHAT_DATE'], **IMPORT_PARAMS)

def import_organisation(file):
    organisation = pd.read_csv(file, **IMPORT_PARAMS)
    useless_variables = ['L1_NAME', 'L2_NAME', 'ADRESSE']
    organisation.drop(useless_variables, axis=1, inplace=True)
    return organisation

def preproc_intervention(data):
    data.NB_PASSAGE.fillna(1, inplace=True)
    data.PROBLEM_CODE.fillna('NC', inplace=True)
    data.ORIGINE_INCIDENT.fillna('NAN', inplace=True)
    data.COMMENTAIRE_BI.fillna('.', inplace=True)
    data.RESOURCE_ID.fillna(0, inplace=True)
    
    missing_millesime = data.MILLESIME.isnull()
    data.loc[missing_millesime, 'MILLESIME'] = data.loc[missing_millesime, 'CRE_DATE_GZL'].apply(lambda x: x.year)
    
    data.drop('PROBLEM_DESC', axis=1, inplace=True)
    data.drop_duplicates(inplace=True)
    
    data['MILLESIME'] = data['MILLESIME'].astype(str)
    data['RESOURCE_ID'] = data['RESOURCE_ID'].astype(str)

def merge_data(data_, 
               equipement_df, 
               orga_df, 
               contrat_history_df, 
               nature_code_eau_chaude, 
               nature_code_energie, 
               nature_code_fonction, 
               nature_code_installation, 
               nature_code_specification):
    
    data = data_.merge(equipement_df, how='left', on='INSTANCE_ID').merge(orga_df, how='left', left_on='ORGANISATION_ID', right_on='L2_ORGANISATION_ID')
    contrat_history_s = data[['INCIDENT_NUMBER', 'INSTANCE_ID','CRE_DATE_GZL']].merge(contrat_history_df).query('CRE_DATE_GZL>=UPD_DATE_date')
    contrat_history_s = contrat_history_s.sort_values(['INCIDENT_NUMBER', 'UPD_DATE'], ascending=[True,False]).drop_duplicates(keep='first', subset=['INCIDENT_NUMBER'])
    data = data.merge(contrat_history_s, how='left').merge(nature_code_eau_chaude, how='left')
    data = data.merge(nature_code_energie, how='left').merge(nature_code_fonction, how='left')
    data = data.merge(nature_code_installation, how='left').merge(nature_code_specification, how='left')
    
    drop_columns = ['CRE_DATE_date', 'UPD_DATE_date']
    data.drop(drop_columns, axis=1)
    
    print(data.shape[0], data.INCIDENT_NUMBER.nunique(), data.INSTANCE_ID.nunique())
    return data

def import_all(file, history=False):
    contract_histo = import_contract_history(data_repo + 'contract_history.csv')
    equipment = import_equipment(data_repo + 'equipment.csv')
    organisation = import_organisation(data_repo + 'organisation.csv')
    
    code_eau_chaude = pd.read_csv(data_repo + 'nature_code_eau_chaude.csv', sep='|')
    code_energie = pd.read_csv(data_repo + 'nature_code_energie.csv', sep='|')
    code_fonction = pd.read_csv(data_repo + 'nature_code_fonction.csv', sep='|')
    code_installation = pd.read_csv(data_repo + 'nature_code_installation.csv', sep='|')
    code_specification = pd.read_csv(data_repo + 'nature_code_specification.csv', sep='|')
    
    date_cols = ['CRE_DATE_GZL', 'SCHEDULED_START_DATE', 'SCHEDULED_END_DATE']
    
    if history: 
        merged_list = []
        chunksize = 10 ** 6
        date_cols += ['ACTUAL_START_DATE', 'ACTUAL_END_DATE']
        
        for chunk in pd.read_csv(file, sep='|', encoding='latin-1', parse_dates=date_cols, chunksize=chunksize):
            preproc_intervention(chunk)
        
            merged_list.append(merge_data(chunk, 
                                          equipment, 
                                          organisation, 
                                          contract_histo,
                                          code_eau_chaude,
                                          code_energie,
                                          code_fonction,
                                          code_installation,
                                          code_specification))
    
        return pd.concat(merged_list)
    
    else:    
        intervention = pd.read_csv(file, sep='|', encoding='latin-1', parse_dates=date_cols)
        preproc_intervention(intervention)

        return merge_data(intervention, 
                          equipment, 
                          organisation, 
                          contract_histo,
                          code_eau_chaude,
                          code_energie,
                          code_fonction,
                          code_installation,
                          code_specification)

In [25]:
train = import_all(data_repo + 'intervention_train.csv')

1048030 1048030 462663


In [26]:
train.to_csv('../data/merged_data/train.csv')

In [24]:
test = import_all(data_repo + 'intervention_test.csv')

226770 226770 226770


In [25]:
test.to_csv('../data/merged_data/test.csv')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->['INCIDENT_TYPE_NAME', 'TYPE_BI', 'MILLESIME', 'PROBLEM_CODE', 'ORIGINE_INCIDENT', 'COMMENTAIRE_BI', 'SS_TR_FLAG', 'TYPE_UT', 'GRAVITE', 'RESOURCE_ID', 'TYPE_OCC', 'RACHAT_CODE', 'RACHAT_LIB', 'NATURE_CODE', 'MARQUE_LIB', 'MODELE_CODE', 'MODELE_LIB', 'USAGE_LOCAL', 'LOCALISATION_ORGANISME', 'COMPLEMENT_RUE', 'CODE_POSTAL', 'ESCALIER', 'ETAGE', 'NUMERO', 'RUE', 'PAYS', 'TYPE_VOIE', 'VILLE', 'STOP_PHONING', 'CODE_GEN_EQUIPEMENT', 'CODE_FONCTION', 'CODE_ENERGIE', 'CODE_INSTALLATION', 'CODE_SPECIFICATION', 'CODE_EAU_CHAUDE', 'L2_ORGA_VILLE', 'ORGANISATION_CODE', 'STS_CODE', 'FORMULE', 'OPTION', 'EAU_CHAUDE', 'ENERGIE', 'FONCTION', 'INSTALLATION', 'SPECIFICATION']]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
#Memory error, try to improve code or do it in several slices
history = import_all(data_repo + 'intervention_history.csv', history=True)

1000000 1000000 526538
1000000 1000000 526139
1000000 1000000 526484
1000000 1000000 526741
1000000 1000000 526670
171308 171308 150181


In [11]:
history.to_csv('../data/merged_data/history.csv')

In [47]:
history_extract = history.iloc[:100000]

In [22]:
history_extract = create_target_for_history(history_extract)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [53]:
history_extract = add_train_target_value_in_history(history_extract, train)

In [54]:
history_extract.head(30)

Unnamed: 0,INSTANCE_ID,CRE_DATE_GZL,INCIDENT_NUMBER,DATE_SAISIE_RETOUR,INCIDENT_TYPE_ID,INCIDENT_TYPE_NAME,INCIDENT_STATUS_ID,INCIDENT_STATUS_NAME,INCIDENT_STATUS_CODE,INCIDENT_CLOSED_FLAG,...,CONTRAT_TARIF,PRIX_FACTURE,CRE_DATE_date,UPD_DATE_date,EAU_CHAUDE,ENERGIE,FONCTION,INSTALLATION,SPECIFICATION,target
0,10021,2017-03-30,83345306,2017-04-04,10005,Entretien,2,1-Réalisé,CLOSED,Y,...,300.0,281.0,2017-01-14,2017-03-20,ACCUMULEE BALLON INTEGRE,GAZ NAT,CHAUDIERE MURALE,-,VENTOUSE HORIZONTALE,0.0
1,10033,2016-02-16,71417672,2016-02-22,10004,Dépannage+Entretien,115,9-A revoir - VE faite,,Y,...,,,NaT,NaT,INSTANTANEE,GAZ NAT,CHAUDIERE MURALE,-,VENTOUSE HORIZONTALE,
2,10066,2014-02-03,49753080,2014-02-05,10005,Entretien,115,9-A revoir - VE faite,,Y,...,,,NaT,NaT,MICRO ACCUMULEE,GAZ NAT,CHAUDIERE MURALE,-,VENTOUSE HORIZONTALE,
3,10078,2015-04-16,62433670,2015-04-17,10005,Entretien,101,7-Annulé Savelys,CLEAR,Y,...,,,NaT,NaT,MICRO ACCUMULEE,GAZ NAT,CHAUDIERE MURALE,-,TIRAGE NATUREL,
4,10111,2014-10-10,56489162,2014-11-03,10005,Entretien,2,1-Réalisé,CLOSED,Y,...,,,NaT,NaT,INSTANTANEE,GAZ NAT,CHAUFFE-BAIN,-,TIRAGE NATUREL,
5,10123,2015-07-01,64572422,2015-07-07,10009,Mise en service,2,1-Réalisé,CLOSED,Y,...,,,NaT,NaT,,GAZ NAT,ACCUMULATEUR / BALLON,-,TIRAGE NATUREL,
6,10135,2018-03-12,93443044,2018-03-22,10005,Entretien,2,1-Réalisé,CLOSED,Y,...,256.0,282.0,2017-12-10,2018-02-10,ACCUMULEE BALLON INTEGRE,GAZ NAT,CHAUDIERE MURALE,-,TIRAGE NATUREL,
7,10151,2014-05-07,52485420,2014-06-06,10005,Entretien,2,1-Réalisé,CLOSED,Y,...,,,NaT,NaT,INSTANTANEE,GAZ NAT,CHAUDIERE MURALE,-,VENTOUSE HORIZONTALE,
8,10161,2014-04-29,52302668,2014-05-02,10003,Dépannage,2,1-Réalisé,CLOSED,Y,...,,,NaT,NaT,INSTANTANEE,GAZ NAT,CHAUDIERE MURALE,CONDENSATION,VENTOUSE HORIZONTALE,
9,10162,2015-05-21,63361706,2015-05-26,10003,Dépannage,51,3-A revoir,WAITING,Y,...,,,NaT,NaT,INSTANTANEE,GAZ NAT,CHAUDIERE MURALE,-,TIRAGE NATUREL,


In [55]:
'target' in history_extract.columns

True

## Recreate targetby training value when possible and computed value otherwize

In [52]:
def create_target_for_history(history):
    history.sort_values(['INSTANCE_ID', 'CRE_DATE_GZL', 'INCIDENT_NUMBER'], inplace=True)
    groups = history.groupby('INSTANCE_ID')
    
    get_time_delta = lambda s: s.diff(1).shift(-1)
    history['time_delta'] = groups.ACTUAL_START_DATE.transform(get_time_delta)
    history['next_incident_type'] = groups.INCIDENT_TYPE_NAME.transform(lambda s: s.shift(-1))
    history['target_test'] = history['time_delta'].between(pd.Timedelta('1 days'), pd.Timedelta('182 days')) \
                                        & (history['next_incident_type'] != 'Entretien') \
                                        & history['MOTIF_ANNULATION_CODE'].isnull()
            
    return history

def add_train_target_value_in_history(history, train):
    history_copy = history.copy()
    train_copy = train.copy()
    
    #train_copy.reset_index(inplace=True)
    train_copy.set_index(['INSTANCE_ID', 'CRE_DATE_GZL', 'INCIDENT_NUMBER'], inplace=True)

    #history_copy.reset_index(inplace=True)
    history_copy.set_index(['INSTANCE_ID', 'CRE_DATE_GZL', 'INCIDENT_NUMBER'], inplace=True)

    train_copy.sort_index(inplace=True)
    history_copy.sort_index(inplace=True)

    history_copy = history_copy.join(train_copy['target'], how='left')
    history_copy.reset_index(inplace=True)
    return history_copy

In [7]:
# delete canceled appointments from the training ?
# TODO: put a boolean in a preprocessing to do so or not
intervention_history.sort_values(['INSTANCE_ID', 'CRE_DATE_GZL', 'INCIDENT_NUMBER'], inplace=True)

In [8]:
groups = intervention_history.groupby('INSTANCE_ID')

In [9]:
get_time_delta = lambda s: s.diff(1).shift(-1)

In [18]:
#time_delta = groups.CRE_DATE_GZL.transform(get_time_delta)

In [19]:
#intervention_history['time_delta'] = time_delta

In [20]:
#intervention_history.reset_index(inplace=True)

In [11]:
intervention_history['time_delta'] = groups.ACTUAL_START_DATE.transform(get_time_delta)

In [14]:
intervention_history['next_incident_type'] = groups.INCIDENT_TYPE_NAME.transform(lambda s: s.shift(-1))

In [23]:
intervention_history['target_test'] = intervention_history['time_delta'].between(pd.Timedelta('1 days'), pd.Timedelta('182 days')) \
                                        & (intervention_history['next_incident_type'] != 'Entretien') \
                                        & intervention_history['MOTIF_ANNULATION_CODE'].isnull()

In [29]:
intervention_train.reset_index(inplace=True)
intervention_train.set_index(['INSTANCE_ID', 'CRE_DATE_GZL', 'INCIDENT_NUMBER'], inplace=True)

intervention_history.reset_index(inplace=True)
intervention_history.set_index(['INSTANCE_ID', 'CRE_DATE_GZL', 'INCIDENT_NUMBER'], inplace=True)

intervention_train.sort_index(inplace=True)
intervention_history.sort_index(inplace=True)

history_with_target = intervention_history.join(intervention_train['target'], how='left')

In [36]:
history_with_target.loc[~history_with_target.target.isnull(), ['target', 'target_test','time_delta', 'next_incident_type', 'MOTIF_ANNULATION_CODE', 'ACTUAL_START_DATE', 'ACTUAL_END_DATE', 'INCIDENT_TYPE_NAME', 'SCHEDULED_START_DATE', 'SCHEDULED_END_DATE',  'INCIDENT_STATUS_NAME', 'TYPE_BI', 'MOTIF_ANNULATION_CODE', 'MOTIF_ANNULATION_DESC']][50:100]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,target,target_test,time_delta,next_incident_type,MOTIF_ANNULATION_CODE,ACTUAL_START_DATE,ACTUAL_END_DATE,INCIDENT_TYPE_NAME,SCHEDULED_START_DATE,SCHEDULED_END_DATE,INCIDENT_STATUS_NAME,TYPE_BI,MOTIF_ANNULATION_CODE,MOTIF_ANNULATION_DESC
INSTANCE_ID,CRE_DATE_GZL,INCIDENT_NUMBER,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10135,2017-09-04,87371802,1.0,True,10 days,Dépannage,,2017-09-05,2017-09-05,Dépannage,2017-09-05,2017-09-05,3-A revoir,Dépannage,,
10135,2017-09-11,87530552,0.0,False,179 days,Entretien,,2017-09-15,2017-09-15,Dépannage,2017-09-15,2017-09-15,1-Réalisé,Dépannage,,
10145,2016-03-01,71826460,0.0,False,NaT,Entretien,A,NaT,NaT,Entretien,2016-04-05,2016-04-05,7-Annulé Savelys,Entretien,A,Annulation RDV
10145,2016-03-31,72726524,0.0,False,NaT,,,2016-05-02,2016-05-02,Entretien,2016-05-02,2016-05-02,1-Réalisé,Entretien,,
10149,2015-05-15,63191770,0.0,False,NaT,,,2015-06-18,2015-06-18,Entretien,2015-06-18,2015-06-18,1-Réalisé,Entretien,,
10150,2016-05-12,73912140,1.0,True,141 days,Dépannage,,2016-06-15,2016-06-15,Entretien,2016-06-15,2016-06-15,1-Réalisé,Entretien,,
10150,2016-11-02,78499460,0.0,False,200 days,Entretien,,2016-11-03,2016-11-03,Dépannage,2016-11-03,2016-11-03,1-Réalisé,Dépannage,,
10150,2017-04-19,83929378,1.0,True,52 days,Dépannage,,2017-05-22,2017-05-22,Entretien,2017-05-22,2017-05-22,1-Réalisé,Entretien,,
10150,2017-07-13,86270628,1.0,True,2 days,Dépannage,,2017-07-13,2017-07-13,Dépannage,2017-07-13,2017-07-13,3-A revoir,Dépannage,,
10150,2017-07-13,86283522,1.0,False,NaT,,,2017-07-15,2017-07-15,Dépannage,2017-07-15,2017-07-15,3-A revoir,Dépannage,,


Target test is not perfect but good enough for the moment.<br>
No need to spend more time on it as long as I am not sure I can use history (i.e. do other tables only have obs from training or the full data ?)

If improvment is needed: 
- delete last obs of each instance_id
- check for better handling of cancelled obs (mb take them out before computing time delta)

### Equipment

In [58]:
useless_variables = ['CODE_POSTAL', 'ETAGE', 'PAYS', 'POINTS_FIDEL', 'STOP_PHONING']
equipment.drop(useless_variables, axis=1, inplace=True)

In [None]:
# feature ingeneering on contracts:
# dates: compute timedelta between update_date and creation_date
# computation timedelta between creation_date and creation_date

## Check unclear variables

In [66]:
intervention_train.columns

Index(['INSTANCE_ID', 'CRE_DATE_GZL', 'INCIDENT_NUMBER', 'index',
       'INCIDENT_TYPE_ID', 'INCIDENT_TYPE_NAME', 'TYPE_BI', 'NB_PASSAGE',
       'MILLESIME', 'PROBLEM_CODE', 'PROBLEM_DESC', 'AUTEUR_INCIDENT',
       'ORIGINE_INCIDENT', 'COMMENTAIRE_BI', 'SS_TR_FLAG', 'TYPE_UT',
       'GRAVITE', 'RESOURCE_ID', 'SCHEDULED_START_DATE', 'SCHEDULED_END_DATE',
       'target'],
      dtype='object')

## Univariate analysis

In [None]:
df.target.value_counts()


In [None]:
sns.countplot(x="target", data=train_data, palette="Greens_d");

## Bivariate analysis

In [None]:
sns.pairplot(intervention_train, hue='target')

In [None]:
sns.barplot(x="MILLESIME", y="target", data=df)

In [None]:
sns.barplot(x="MILLESIME", y="target", hue="PROBLEM_CODE", data=df)

In [6]:
from sklearn.metrics import roc_auc_score

print(roc_auc_score([1, 0, 0, 1, 1], [0.4, 0.1, 0.1, 0.6, 0.7]))

print(roc_auc_score([1, 0, 0, 1, 1], [1, 0, 0, 1, 1]))



1.0
1.0
