In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.ensemble import VotingClassifier, BaggingRegressor, AdaBoostRegressor
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


In [2]:
train_data = pd.read_csv('train_data.csv',index_col = 0)
test_kaggle = pd.read_csv('test_kaggle.csv',index_col=0)

In [3]:
train_data.columns

Index(['person', 'label', 'Bom', 'Bom - Sem Touch ID', 'Excelente',
       'Muito Bom', 'Novo', 'visitas_dom', 'visitas_lun', 'visitas_mar',
       'visitas_mier', 'visitas_jue', 'visitas_vier', 'visitas_sab',
       'conversiones_mean', 'vio_mas_de_5_veces', 'vio_color_mas_vendido',
       'returns_por_persona_mean', 'solicito_brand_listing',
       'productos_vistos_mean', 'busca_palabra_popular', '128GB', '16GB',
       '256GB', '32GB', '4GB', '512MB', '64GB', '8GB', 'periodo_y',
       'ad campaign hit_mean', 'brand listing_mean', 'checkout_mean',
       'conversion_mean', 'generic listing_mean', 'lead_mean',
       'search engine hit_mean', 'searched products_mean', 'staticpage_mean',
       'viewed product_mean', 'visited site_mean', 'tiempo_ultima_conversion',
       'tiempo_ultimo_checkout', 'tiempo_ultima_visita', 'tiempo_ultimo_lead',
       'tiempo_entre_conversiones_mean_log_x',
       'tiempo_entre_conversiones_mean_log_y',
       'tiempo_entre_checkout_mean_log', 'vio_pro

In [3]:
features = ['Bom', 'Bom - Sem Touch ID', 'Excelente',
       'Muito Bom', 'Novo', 'visitas_dom', 'visitas_lun', 'visitas_jue', 'visitas_vier',
       'conversiones_mean', 'vio_mas_de_5_veces', 'vio_color_mas_vendido',
       'returns_por_persona_mean', 'solicito_brand_listing',
       'productos_vistos_mean', '128GB',
       '16GB', '256GB', '32GB', '4GB', '512MB', '64GB', '8GB', 'periodo_y',
       'ad campaign hit_mean', 'brand listing_mean', 'checkout_mean',
       'conversion_mean', 'generic listing_mean', 'lead_mean',
       'search engine hit_mean', 'searched products_mean',
       'staticpage_mean', 'viewed product_mean', 'visited site_mean',
       'tiempo_ultima_conversion','tiempo_ultimo_checkout','tiempo_ultima_visita','tiempo_ultimo_lead','vio_producto_popular','Samsung', 'tiempo_entre_conversion_checkout', 'checkout',
       'conversion', 'proporcion','cant_conversiones_ultimo_periodo','dias_desde_ultimo_evento']

# Creando los algoritmos con los parametros obtenidos con grid-search

In [4]:
xgbRegressor = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1.0, gamma=1, learning_rate=0.075,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=95, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=0.8, seed=15, silent=True,
       subsample=0.8)

xgbClassifier = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1.0, gamma=1, learning_rate=0.075,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=95, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=0.8, seed=15, silent=True,
       subsample=0.8)

In [5]:
randomForestClassifier = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=9, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=45, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [6]:
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

# Organizando los sets de entrenamiento/test

In [5]:
X = train_data[features]
Y = train_data['label']

X_train,X_test,Y_train,Y_test = train_test_split(X,Y)

In [6]:
kfold = KFold(n_splits=10)

In [7]:
mejor_score = 0
for train, test in kfold.split(X_train, Y_train):
    train_datas = train_data.iloc[train]
    test_datas = train_data.iloc[test]
    
    X_train_data = train_datas[features]
    Y_train_data = train_datas['label']
    
    X_test_data = test_datas[features]
    Y_test_data = test_datas['label']
    
    xgbRegressor.fit(X_train_data,Y_train_data)
    print(roc_auc_score(Y_test_data,xgbRegressor.predict(X_test_data)))
    if (roc_auc_score(Y_test_data, xgbRegressor.predict(X_test_data))>mejor_score):
        mejor_score = roc_auc_score(Y_test_data,xgbRegressor.predict(X_test_data))
        X_train_optimo = X_train_data
        Y_train_optimo = Y_train_data
        X_test_optimo = X_test_data
        Y_test_optimo = Y_test_data

0.8591262370263095
0.8510475977819956
0.8707886307011745
0.8644985465116279
0.8488033943554538
0.8534236100833703
0.8658371472158657
0.8454590347923681
0.8873936255619781
0.8955623795761078


# Voting

%85 en kaggle

In [14]:
voting = VotingClassifier(estimators = [('xgb',xgbClassifier),('randomforest',randomForestClassifier),('knn',knn)],voting='soft')

In [15]:
voting.fit(X_train_optimo,Y_train_optimo)

VotingClassifier(estimators=[('xgb', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1.0, gamma=1, learning_rate=0.075,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=95, n_jobs=1, nthread=None,
       objective='binary:logisti...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [16]:
predic = voting.predict_proba(test_kaggle[features])

In [17]:
predic = pd.DataFrame(predic)

In [19]:
test_kaggle['prediction'] = predic[1]

In [12]:
submit_kaggle = pd.DataFrame({'person':test_kaggle['person'],'label':test_kaggle['prediction']})

submit_kaggle['label'] = submit_kaggle['label'].transform(lambda x: 0.0 if x < 0.0 else x)
submit_kaggle['label'] = submit_kaggle['label'].transform(lambda x: 1.0 if x > 1.0 else x)

submit_kaggle.reset_index(inplace = True)
submit_kaggle.drop(columns='index',inplace=True)
submit_kaggle.set_index(keys='person',inplace=True)

submit_kaggle.to_csv('submit_kaggle.csv')

# Bagging

% 87.200 ----> MEJOR RESULTADO KAGGLE QUE XGBOOST SOLO 

In [8]:
bagging = BaggingRegressor(base_estimator=xgbRegressor,n_estimators=95)

In [9]:
bagging.fit(X_train_optimo,Y_train_optimo)

BaggingRegressor(base_estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1.0, gamma=1, learning_rate=0.075,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=95, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=0.8, seed=15, silent=True,
       subsample=0.8),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=95, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [10]:
roc_auc_score(Y_test_optimo,bagging.predict(X_test_optimo))

0.8905647880539499

In [11]:
test_kaggle['prediction'] = bagging.predict(test_kaggle[features])

# AdaBoost

%86,62 SCORE DE KAGGLE

In [9]:
adaboost = AdaBoostRegressor(base_estimator=xgbRegressor,n_estimators=95,learning_rate=0.075)

In [10]:
adaboost.fit(X_train_optimo,Y_train_optimo)

AdaBoostRegressor(base_estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1.0, gamma=1, learning_rate=0.075,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=95, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=0.8, seed=15, silent=True,
       subsample=0.8),
         learning_rate=0.075, loss='linear', n_estimators=95,
         random_state=None)

In [11]:
roc_auc_score(Y_test_optimo,adaboost.predict(X_test_optimo))

0.881864161849711

In [12]:
test_kaggle['prediction'] = adaboost.predict(test_kaggle[features])