In [2]:
import pandas as pd
import numpy as np
import re
import datetime
import csv

In [3]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [4]:
pd.set_option("display.max_columns", 70)
pd.set_option("display.max_rows", 400)

## Chargement des données

In [5]:
orga_df = pd.read_csv('data/cleaned/organisation.csv',
                      sep='|',
                      encoding='Latin-1')

In [6]:
equipement_df = pd.read_csv('data/cleaned/equipment.csv',
                           sep='|',
                           encoding='Latin-1', parse_dates=['INSTALL_DATE', 'RACHAT_DATE'])

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
contrat_history_df = pd.read_csv('data/processed/contract_history.csv',
                           sep='|',
                           encoding='Latin-1', parse_dates=['CRE_DATE', 'UPD_DATE', 'DATE_RESILIATION', 'DATE_DEBUT', 'DATE_FIN'])

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
for c in ['CRE_DATE', 'UPD_DATE']:
    contrat_history_df['{}_date'.format(c)] = pd.to_datetime(contrat_history_df[c].map(lambda x: datetime.datetime.date(x)))

In [11]:
contrat_instance = contrat_history_df[['CONTRACT_NUMBER', 'INSTANCE_ID']].drop_duplicates()

In [12]:
intervention_test_df = pd.read_csv('data/final/intervention_test.csv',                              
                           sep='|',
                           encoding='Latin-1',
                           parse_dates=['SCHEDULED_START_DATE', 'SCHEDULED_END_DATE', 'CRE_DATE_GZL'])
print(intervention_test_df.shape, intervention_test_df.INCIDENT_NUMBER.nunique(), intervention_test_df.INSTANCE_ID.nunique())

(226770, 19) 226770 226770


In [13]:
intervention_train_df = pd.read_csv('data/final/intervention_train.csv',                              
                           sep='|',
                           encoding='Latin-1',
                           parse_dates=['SCHEDULED_START_DATE', 'SCHEDULED_END_DATE', 'CRE_DATE_GZL'])
print(intervention_train_df.shape, intervention_train_df.INCIDENT_NUMBER.nunique(), intervention_train_df.INSTANCE_ID.nunique())

(1048030, 20) 1048030 462663


In [14]:
intervention_history_df = pd.read_csv('data/processed/intervention_history.csv',                              
                           sep='|',
                           encoding='Latin-1',
                           parse_dates=['DATE_SAISIE_RETOUR', 'SCHEDULED_START_DATE', 'SCHEDULED_END_DATE', 'ACTUAL_START_DATE', 'ACTUAL_END_DATE', 'CRE_DATE_GZL'])
print(intervention_history_df.shape, intervention_history_df.INCIDENT_NUMBER.nunique(), intervention_history_df.INSTANCE_ID.nunique())

(5171308, 30) 5171308 795394


In [15]:
test_target = pd.read_csv('data/final/private_test_target.csv', sep='|',  encoding='Latin-1')

In [16]:
nature_code_eau_chaude = pd.read_csv('data/final/nature_code_eau_chaude.csv', sep='|', encoding='Latin-1')

In [17]:
nature_code_energie = pd.read_csv('data/final/nature_code_energie.csv', sep='|', encoding='Latin-1')

In [18]:
nature_code_fonction = pd.read_csv('data/final/nature_code_fonction.csv', sep='|', encoding='Latin-1')

In [19]:
nature_code_installation = pd.read_csv('data/final/nature_code_installation.csv', sep='|', encoding='Latin-1')

In [20]:
nature_code_specification = pd.read_csv('data/final/nature_code_specification.csv', sep='|', encoding='Latin-1')

# Préparation des X_test et X_train

In [21]:
weekf = lambda x: x.isocalendar()[1]
dayf = lambda x: x.isocalendar()[2]

In [22]:
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [23]:
def prepare_data(data_):
    data = data_.merge(equipement_df, how='left', on='INSTANCE_ID').merge(orga_df, how='left', left_on='ORGANISATION_ID', right_on='L2_ORGANISATION_ID')
    contrat_history_s = data[['INCIDENT_NUMBER', 'INSTANCE_ID','CRE_DATE_GZL']].merge(contrat_history_df).query('CRE_DATE_GZL>=UPD_DATE_date')
    contrat_history_s = contrat_history_s.sort_values(['INCIDENT_NUMBER', 'UPD_DATE'], ascending=[True,False]).drop_duplicates(keep='first', subset=['INCIDENT_NUMBER'])
    data = data.merge(contrat_history_s, how='left').merge(nature_code_eau_chaude, how='left')
    data = data.merge(nature_code_energie, how='left').merge(nature_code_fonction, how='left')
    data = data.merge(nature_code_installation, how='left').merge(nature_code_specification, how='left')
    print(data.shape[0], data.INCIDENT_NUMBER.nunique(), data.INSTANCE_ID.nunique())
    return data

In [24]:
drop_columns = ['CRE_DATE_date', 'UPD_DATE_date']

In [25]:
train_data = prepare_data(intervention_train_df).drop(drop_columns, axis=1)

1048030 1048030 462663


In [26]:
test_data = prepare_data(intervention_test_df).drop(drop_columns, axis=1)

226770 226770 226770


In [27]:
date_field = 'CRE_DATE_GZL'

In [28]:
date_columns = ['INSTALL_DATE', 'RACHAT_DATE', 'SCHEDULED_START_DATE', 'SCHEDULED_END_DATE', 'CRE_DATE', 'UPD_DATE', 'DATE_DEBUT', 'DATE_FIN', 'DATE_RESILIATION']
date_delta_colums = [['SCHEDULED_START_DATE', 'SCHEDULED_END_DATE'],]

In [30]:
def prepare_X(data):
    # type: (object) -> object
    X = data[[c for c in data.columns if 'visite' not in c]]

    for i,c in enumerate(date_delta_colums):
        X['date_delta_{}'.format(i)] = (X[c[1]] - X[c[0]]).dt.days

    for c in data.columns:
        if c != date_field and c in date_columns:
            X[c] = (X[c] - X[date_field]).dt.days

    X['WEEK'] = X[date_field].map(weekf)
    X['DAY'] = X[date_field].map(dayf)
    X.drop(date_field, axis=1, inplace=True)

    to_label = []
    for c in [k for k,v in X.dtypes.to_dict().items() if v=='object']:
        try:
            X[c] = pd.to_numeric(X[c])
            print(c)
        except:
            to_label.append(c)

    for c in to_label:
        le = preprocessing.LabelEncoder()
        X[c] = le.fit_transform(X[c].astype(str))

    print(X.dtypes.unique())

    return X.fillna(-1)

In [31]:
X_train = prepare_X(train_data.drop('target', axis=1))

CODE_POSTAL
CODE_ENERGIE
ORGANIZATION_CODE
NUM_CAMPAGNE
[dtype('int64') dtype('float64')]


In [32]:
X_test = prepare_X(test_data)

CODE_POSTAL
CODE_ENERGIE
ORGANIZATION_CODE
NUM_CAMPAGNE
[dtype('int64') dtype('float64')]


# Modèle

In [33]:
rf = RandomForestClassifier(n_estimators=100, max_depth=20, n_jobs=-1)

In [34]:
rf.fit(X_train, train_data['target'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [35]:
predictions = rf.predict_proba(X_test)[:,1]

In [36]:
roc_auc_score(test_target, predictions)

0.7210125800276627

# Feature ranking

In [33]:
import numpy as np
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(20):
    print("%d. feature %s (%f)" % (f + 1, X_test.columns[indices[f]], importances[indices[f]]))

Feature ranking:
1. feature NB_PASSAGE (0.065637)
2. feature INCIDENT_TYPE_NAME (0.047651)
3. feature INCIDENT_TYPE_ID (0.042017)
4. feature COMMENTAIRE_BI (0.035492)
5. feature SCHEDULED_END_DATE (0.033230)
6. feature GRAVITE (0.030845)
7. feature TYPE_BI (0.028783)
8. feature SCHEDULED_START_DATE (0.025776)
9. feature INSTALL_DATE (0.022583)
10. feature INCIDENT_NUMBER (0.021888)
11. feature TYPE_UT (0.020554)
12. feature RUE (0.019418)
13. feature PARTY_ID_OCC (0.019333)
14. feature MILLESIME (0.019017)
15. feature LOCATION_ID (0.018585)
16. feature RESOURCE_ID (0.018558)
17. feature CONTRACT_NUMBER (0.018429)
18. feature PRIX_FACTURE (0.018362)
19. feature INSTANCE_ID (0.018212)
20. feature CRE_DATE (0.018115)


# Ecriture de la prediction benchmark

In [34]:
#pd.DataFrame(predictions).to_csv('data/final/benchmark_submission.csv', sep="|", quoting=csv.QUOTE_NONNUMERIC, encoding='latin1', index=False)

# Modèle à l'interieur du dataset de train, pour vérifier qu'il n'y a pas de biais entre test et train

In [35]:
X_train_, X_test_, y_train_, y_test_ = train_test_split(X_train, train_data['target'], test_size=0.33, random_state=42)
X_train_.fillna(-1, inplace=True)
X_test_.fillna(-1, inplace=True)
rf.fit(X_train_, y_train_)
predictions_ = rf.predict_proba(X_test_)[:,1]
roc_auc_score(y_test_, predictions_)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


0.7369438332706875

# FIN