# Prédiction de l'obtention d'un brevet
### Samuel Guilhem-Ducléon

In [2]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# We extract the csv files

train_fname = 'train.csv'
test_fname = 'test.csv'
dataframe = pd.read_csv(train_fname, sep=';')
df_test = pd.read_csv(test_fname, sep=';')

In [4]:
dataframe.head(5)

Unnamed: 0,VOIE_DEPOT,COUNTRY,SOURCE_BEGIN_MONTH,APP_NB,APP_NB_PAYS,APP_NB_TYPE,FISRT_APP_COUNTRY,FISRT_APP_TYPE,LANGUAGE_OF_FILLING,FIRST_CLASSE,...,oecd_NB_BACKWARD_NPL,IDX_ORIGIN,SOURCE_IDX_ORI,IDX_RADIC,SOURCE_IDX_RAD,VARIABLE_CIBLE,PRIORITY_MONTH,FILING_MONTH,PUBLICATION_MONTH,BEGIN_MONTH
0,EURO-PCT,US,D0,1,1,1,US,COMPAGNY,en,A61K9/48,...,0.0,1.0,IMPUT,1.0,IMPUT,GRANTED,09/1995,08/1996,03/1997,09/1995
1,EURO-PCT,DE,D0,1,1,1,DE,COMPAGNY,de,C08G65/26,...,0.0,1.0,IMPUT,1.0,IMPUT,GRANTED,11/1999,11/2000,05/2001,11/1999
2,VOIE DIRECTE,JP,D0,1,1,1,JP,COMPAGNY,en,G11B15/07,...,0.125,0.897778,CALC,1.0,IMPUT,GRANTED,06/1992,05/1993,12/1993,06/1992
3,VOIE DIRECTE,GB,D0,1,1,1,GB,COMPAGNY,en,F25B41/04,...,0.0,1.0,IMPUT,1.0,IMPUT,GRANTED,05/1994,05/1995,11/1995,05/1994
4,VOIE DIRECTE,DE,D0,1,1,1,DE,COMPAGNY,de,H01F17/06,...,0.0,0.625,CALC,0.25,CALC,GRANTED,12/1988,11/1989,06/1990,12/1988


In [5]:
print('n_samples : %d, n_variables : %d' % dataframe.shape)

n_samples : 259431, n_variables : 50


In [6]:
numerical = ['APP_NB', 'APP_NB_PAYS', 'APP_NB_TYPE', 'NB_CLASSES', 'NB_ROOT_CLASSES', 'NB_SECTORS', 'NB_FIELDS', 
        'INV_NB', 'INV_NB_PAYS',
        'INV_NB_TYPE', 'cited_n', 'cited_nmiss', 'cited_age_min', 'cited_age_median', 'cited_age_max', 'cited_age_mean',
        'cited_age_std', 'NB_BACKWARD_NPL', 'NB_BACKWARD_XY', 'NB_BACKWARD_I', 'NB_BACKWARD_AUTRE', 'NB_BACKWARD_PL', 
        'NB_BACKWARD', 'pct_NB_IPC', 'pct_NB_IPC_LY', 'oecd_NB_ROOT_CLASSES', 'oecd_NB_BACKWARD_PL', 
        'oecd_NB_BACKWARD_NPL', 'IDX_ORIGIN', 'IDX_RADIC']

categories = ['SOURCE_BEGIN_MONTH', 'VOIE_DEPOT', 'LANGUAGE_OF_FILLING', 'COUNTRY', 'FISRT_APP_COUNTRY', 
        'FISRT_APP_TYPE',
        'TECHNOLOGIE_SECTOR', 'TECHNOLOGIE_FIELD', 'FISRT_INV_COUNTRY', 'FISRT_INV_TYPE', 'SOURCE_CITED_AGE', 
        'SOURCE_IDX_ORI', 'SOURCE_IDX_RAD', 'MAIN_IPC', 'FIRST_CLASSE']

dates = ['PRIORITY_MONTH', 'FILING_MONTH', 'PUBLICATION_MONTH', 'BEGIN_MONTH']


# We convert dates to integer: month + year * 12

def convert(s):
    if not pd.isnull(s):
        r = s.split('/')
        return int(r[0]) + 12 * int(r[1])
    else:
        return np.nan


for column in dates:
    dataframe[column] = dataframe[column].apply(convert)
    df_test[column] = df_test[column].apply(convert)
    
# We replace the missing values in the numerical features by the mean of the column

for num in numerical:
    dataframe[num][dataframe[num] == np.nan] = dataframe[num].mean()
    df_test[num][df_test[num] == np.nan] = df_test[num].mean()

# Normalization for numerical values

dataframe[numerical] = (dataframe[numerical] - dataframe[numerical].mean()) / dataframe[numerical].std()
df_test[numerical] = (df_test[numerical] - df_test[numerical].mean()) / df_test[numerical].std()

# We group the numerical and dates features

X = np.concatenate([dataframe[numerical].values, dataframe[dates].values], axis = 1)
X_test = np.concatenate([df_test[numerical].values, df_test[dates].values], axis = 1)


# We transform each categorial variable into indicator variables

for cat in categories:
    counts = pd.value_counts(pd.concat([dataframe[cat], df_test[cat]], axis = 0))

    if cat == 'FIRST_CLASSE':
        ratio = 20
    elif cat == 'MAIN_IPC':
        ratio = 20
    else:
        ratio = 1000
        
    # We keep the most relevant categories
    
    columns_to_keep = counts[counts > counts[0]/ratio].index

    mask = ~dataframe[cat].isin(columns_to_keep)
    
    # We replace the less relevant categories by "x" : category "other"
    
    if cat == 'SOURCE_BEGIN_MONTH': # so that X and X_test have the same number of features
        dataframe[cat][mask] = "D0"
    else:
        dataframe[cat][mask] = "x"
    
    mask_test = ~df_test[cat].isin(columns_to_keep)
    df_test[cat][mask_test] = "x"
    
    # get_dummies create one feature for each category
    
    X_test = np.concatenate([X_test, pd.get_dummies(df_test[cat]).astype(np.int8).values], axis = 1)
    X = np.concatenate([X, pd.get_dummies(dataframe[cat]).astype(np.int8).values], axis = 1)

# The label

y = dataframe.VARIABLE_CIBLE == 'GRANTED'

X_real_test = X_test


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
# We check that the number of features is the same

print(X.shape)
print(X_test.shape)

(259431, 767)
(129715, 767)


In [8]:
from sklearn.cross_validation import train_test_split

# CV Set and train set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [9]:
# Right format for xgboost

import xgboost as xgb
dtrain = xgb.DMatrix(X_train, label = y_train)
dtest = xgb.DMatrix(X_test, label = y_test)

In [None]:
param = {'max_depth':7, 'eta':0.1, 'lambda' : 2, 'min_child_weight': 1, 'alpha': 2, 'silent':1, 'objective':'binary:logistic' }
# 'bst:min_child_weight': 0.5, 'bst:subsample': 0.5, 'colsample_bytree' : 0.8 
param['nthread'] = 4
param['eval_metric'] = 'auc'

evallist  = [(dtest,'eval'), (dtrain,'train')]

num_round = 604
bst = xgb.train( param, dtrain, num_round, evallist )

[0]	eval-auc:0.656285	train-auc:0.662818
[1]	eval-auc:0.664910	train-auc:0.672242
[2]	eval-auc:0.667440	train-auc:0.673971
[3]	eval-auc:0.669805	train-auc:0.676350
[4]	eval-auc:0.671737	train-auc:0.678322
[5]	eval-auc:0.672905	train-auc:0.680150
[6]	eval-auc:0.675350	train-auc:0.683233
[7]	eval-auc:0.676825	train-auc:0.684732
[8]	eval-auc:0.677930	train-auc:0.686328
[9]	eval-auc:0.678980	train-auc:0.687748
[10]	eval-auc:0.680854	train-auc:0.690005
[11]	eval-auc:0.681733	train-auc:0.691245
[12]	eval-auc:0.682986	train-auc:0.692646
[13]	eval-auc:0.685782	train-auc:0.695517
[14]	eval-auc:0.687063	train-auc:0.697064
[15]	eval-auc:0.687507	train-auc:0.697758
[16]	eval-auc:0.687986	train-auc:0.698715
[17]	eval-auc:0.688908	train-auc:0.699662
[18]	eval-auc:0.689714	train-auc:0.700844
[19]	eval-auc:0.690929	train-auc:0.702122
[20]	eval-auc:0.691359	train-auc:0.702877
[21]	eval-auc:0.692169	train-auc:0.704017
[22]	eval-auc:0.692683	train-auc:0.704889
[23]	eval-auc:0.693463	train-auc:0.705864
[2

In [None]:
# GridSearch: very long to run

#from xgboost.sklearn import XGBClassifier
#from sklearn.grid_search import GridSearchCV
#from sklearn.metrics import roc_auc_score

#max_depth = [8,10,12]

#gsearch1 = GridSearchCV(verbose = 10, estimator = XGBClassifier(reg_lambda = 4, reg_alpha = 4, learning_rate = 0.1, n_estimators = 200, max_depth = 5, min_child_weight = 1, gamma = 0, subsample = 0.8, colsample_bytree = 0.8, objective = 'binary:logistic', nthread = 8, scale_pos_weight = 1, seed = 42), param_grid = dict(max_depth = max_depth), scoring = 'roc_auc', n_jobs = 8, iid = False, cv = 3)
#gsearch1.fit(X_train, y_train)
#print(gsearch1.grid_scores_)
#print(gsearch1.best_params_)
#print(gsearch1.best_score_)


In [11]:
# We predict the values of the test set online

drealtest = xgb.DMatrix(X_real_test)
y_pred = bst.predict(drealtest)

In [12]:
from sklearn.metrics import roc_auc_score

print('Score sur le cv : %s' % roc_auc_score(y_test, bst.predict(dtest)))
print('Score sur le train : %s' % roc_auc_score(y_train, bst.predict(dtrain)))

Score (optimiste) sur le test : 0.714275401162
Score (optimiste) sur le train : 0.81605391662


In [13]:
np.savetxt('y_pred.txt', y_pred, fmt='%s')