In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns
%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import sklearn.cross_validation
from sklearn.cross_validation import cross_val_score



In [2]:
def setPred(col):
    if(col > 0):
        return 1
    else:
        return 0
    
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior  
    """ 
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean 
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

# preparar/tweks datos 

In [3]:
features = pd.read_csv("features.csv",low_memory = False)
prediccion = pd.read_csv("csv/labels_training_set.csv")

In [4]:
#features = features.loc[features["conversion"]<=5]

In [5]:
#mergeo features y la info q nos tiran, lleno con 0 los q no se
train = pd.merge(prediccion,features,on="person",how= "left").dropna()

In [6]:
prediccion = train["label"]

In [7]:
test = features.loc[~features["person"].isin(train["person"])]

In [8]:
listaDeCategoricos = ["condition_no_convercion","model_no_convercion","color_no_convercion","search_engine"\
                      ,"nombre mas usado dia","ultimo_dia_de_la_semana_activo","ultimo_evento_registrado_cmode",\
                  'sistema_operativo','condition_last','condition_first','condition_med','new_ret_last','new_ret_first']
for word in listaDeCategoricos:
    train[word],test[word] =  target_encode(trn_series = train[word],  \
                                            tst_series = test[word],\
                                            target=prediccion, \
                                            min_samples_leaf=10,\
                                            smoothing=0.1,\
                                            noise_level=0.00)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
train.columns

Index([u'person', u'label', u'ad campaign hit', u'brand listing', u'checkout',
       u'conversion', u'generic listing', u'lead', u'search engine hit',
       u'searched products', u'staticpage', u'viewed product', u'visited site',
       u'model_no_convercion', u'storage_no_convercion',
       u'condition_no_convercion', u'color_no_convercion', u'search_engine',
       u'numero mas usado dia', u'nombre mas usado dia',
       u'ultimo_dia_de_la_semana_activo', u'ultimo_evento_registrado_cmode',
       u'dias_hasta', u'viewed product_last', u'searched products_last',
       u'ad campaign hit_last', u'staticpage_last', u'checkout_last',
       u'search engine hit_last', u'conversion_last', u'generic listing_last',
       u'brand listing_last', u'visited site_last', u'lead_last',
       u'cantidad_de_dias_utilizado', u'media_dia_de_la_semana_no_convercion',
       u'delta_ultimo_primer_dia_actividad',
       u'delta_ultimo_anteulit_tiempo_actividad',
       u'delta_primer_seg_tiempo_activ

In [10]:
exclude = ["ultimo_dia_de_la_semana_activo","cantidad_de_dias_utilizado", \
           "media_dia_de_la_semana_no_convercion",  "storage_no_convercion",\
            "generic listing", 'Estado Excelente','Estado Bien','Estado Muy Bien','Estado Nuevo',\
           "visited site","dias_hasta","staticpage",\
          "dTimeEveMean0","dHTimeEveMean0","dTimeEveMax","search engine hit_last","search engine hit",\
           "searched products_last",'new_ret_first', 'condition_first', 'new_ret_last']
train = train.drop(exclude,axis = 1)
test = test.drop(exclude,axis = 1)
#

In [11]:
prediccion.shape

(19126L,)

In [12]:
train = train.drop(["person","label"],axis = 1)

In [13]:
for word in features.columns:
    features[word] = pd.to_numeric(features[word],errors = "coerce")

In [14]:

X_train, X_test, y_train, y_test = train_test_split(train, prediccion, test_size=0.30, random_state=41)

# Random Forest
# historial cambios tweaks | score

columnas: 
'ad campaign hit', 'brand listing', 'checkout', 'generic listing',
       'lead', 'search engine hit', 'searched products', 'staticpage',
       'viewed product', 'visited site', 'level_0', 'storage_no_convercion',
       'numero mas usado dia', 'condition_no_convercion2',
       'color_no_convercion2', 'search_engine2', 'nombre mas usado dia2']
       
features contraproductivos:
    cantidad de dias utilizado, hora/mes/dia ultimo evento, ultimo dia de la semana activo

test_size = 0.3
cv = 20
n_estimator 50 | .78 (Kaggle .76)
n_estimator 100 | .79 

cambio smootingbasado en: conversiones previas -> conversiones futuras

test_size = 0.3
cv = 20
n_estimator 100 | .80 (Kaggle .78)

agrego ultimo evento

test_size = 0.3
cv = 20
n_estimator 100 | .793


agrego: ultimo dias hasta el 1/6,suceso de cada evento, rellenados con -1

test_size = 0.3
cv = 20
n_estimator 100 | .798

agrego: ultimo dia de la semana activo

test_size = 0.3
cv = 20
n_estimator 100 | .798

cambio relleno de cantidad de veces de cada evento -1 -> mediana
agrego  'delta_ultimo_primer_dia_actividad',
       'delta_ultimo_anteulit_tiempo_actividad',
       'delta_primer_seg_tiempo_actividad',
       'dTimeEve

test_size = 0.3
cv = 20
n_estimator 100 | .832



https://www.youtube.com/watch?v=BSUMBBFjxrY

In [47]:
# probar clasificacion
randomforesttree = RandomForestRegressor(random_state = None, n_jobs=-1)
randomforesttree.get_params()

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [48]:
#
param_grid= {'max_features': ["auto"], 'n_estimators': [100], 'criterion': ['mse']}
grid_drop = GridSearchCV(randomforesttree, param_grid, cv=20, scoring='roc_auc',n_jobs = -1,pre_dispatch = 4)
grid_drop.fit(X_train, y_train)
print(grid_drop.best_params_, grid_drop.best_score_)

{'max_features': 'auto', 'criterion': 'mse', 'n_estimators': 100} 0.8323142053840852


In [49]:
result = grid_drop.best_estimator_.predict(X_test)

In [50]:
roc_auc_score(y_test, result)

0.8313608030146638

In [51]:
feature_importances = pd.DataFrame(grid_drop.best_estimator_.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)

In [52]:
feature_importances

Unnamed: 0,importance
dHTimeEveMax,0.079809
checkout,0.078849
dHTimeEveMean1,0.059754
viewed product,0.059311
numero mas usado dia,0.0569
checkout_last,0.05507
dTimeEveMean1,0.04852
brand listing,0.045782
ultimo_evento_registrado_cmode,0.042356
ad campaign hit,0.042268


# Gradient Boost
# historial cambios tweaks | score 

columnas: 
'ad campaign hit', 'brand listing', 'checkout', 'generic listing',
       'lead', 'search engine hit', 'searched products', 'staticpage',
       'viewed product', 'visited site', 'level_0', 'storage_no_convercion',
       'numero mas usado dia', 'condition_no_convercion2',
       'color_no_convercion2', 'search_engine2', 'nombre mas usado dia2']


cambio smootingbasado en: conversiones previas -> conversiones futuras

agrego ultimo evento

agrego dias hasta el 1/6

test_size = 0.3
cv = 20
n_estimator 100 | .82 (Kaggle .815)


agrego: ultimo dia de semana

test_size = 0.3
cv = 20
n_estimator 100 | .820

agrego: ultimo suceso de cada evento, rellenados con -1

test_size = 0.3
cv = 20
n_estimator 100 | .824

agrego: cantidad de dias utilizado

test_size = 0.3
cv = 20
n_estimator 100 | .824



In [52]:
gradientboost = GradientBoostingRegressor(random_state = None)
gradientboost.get_params()

{'alpha': 0.9,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'presort': 'auto',
 'random_state': None,
 'subsample': 1.0,
 'verbose': 0,
 'warm_start': False}

busco mejores hiper-parametros mintras cambio el cv

In [72]:
for i in range(1,15):
    cv = i*10
    param_grid= {'max_features': ["auto"], 'n_estimators': [10,20,25,30,40,50,100], 'loss': ['ls']}
    grid_drop = GridSearchCV(gradientboost, param_grid, cv=cv, scoring='roc_auc')
    grid_drop.fit(X_train, y_train)
    print(grid_drop.best_params_, grid_drop.best_score_)
    result2 = grid_drop.best_estimator_.predict(X_test)
    print("score de la iteracion: ",i," ",roc_auc_score(y_test, result2))

({'max_features': 'auto', 'loss': 'ls', 'n_estimators': 40}, 0.8565313413746177)
('score de la iteracion: ', 1, ' ', 0.86340479944725568)
({'max_features': 'auto', 'loss': 'ls', 'n_estimators': 40}, 0.8577753181925692)
('score de la iteracion: ', 2, ' ', 0.86341319617085399)
({'max_features': 'auto', 'loss': 'ls', 'n_estimators': 50}, 0.856673121967796)
('score de la iteracion: ', 3, ' ', 0.85961038002971246)
({'max_features': 'auto', 'loss': 'ls', 'n_estimators': 40}, 0.8589067299612458)
('score de la iteracion: ', 4, ' ', 0.8634761715978424)
({'max_features': 'auto', 'loss': 'ls', 'n_estimators': 40}, 0.8570184085694112)
('score de la iteracion: ', 5, ' ', 0.86329024414673405)
({'max_features': 'auto', 'loss': 'ls', 'n_estimators': 40}, 0.8556817266832846)
('score de la iteracion: ', 6, ' ', 0.86336101653134956)
({'max_features': 'auto', 'loss': 'ls', 'n_estimators': 20}, 0.8583536846956211)
('score de la iteracion: ', 7, ' ', 0.86121775283284463)
({'max_features': 'auto', 'loss': 'l

busqueda con los hiper-parametros encontrados

In [74]:
param_grid= {'max_features': ["auto"], 'n_estimators': [40], 'loss': ['ls']}
grid_drop = GridSearchCV(gradientboost, param_grid, cv=80, scoring='roc_auc')
grid_drop.fit(X_train, y_train)
print(grid_drop.best_params_, grid_drop.best_score_)

({'max_features': 'auto', 'loss': 'ls', 'n_estimators': 40}, 0.856007904770962)


In [75]:
result2 = grid_drop.best_estimator_.predict(X_test)

In [76]:
roc_auc_score(y_test, result2)

0.8633580177014929

In [56]:
feature_importances2 = pd.DataFrame(grid_drop.best_estimator_.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
feature_importances2

Unnamed: 0,importance
checkout,0.242669
checkout_last,0.165587
ultimo_evento_registrado_cmode,0.064841
brand listing,0.055406
viewed product_last,0.050763
viewed product,0.04617
conversion_last,0.043289
dHTimeEveMax,0.036533
dHTimeEveMean1,0.034571
conversion,0.031594


# Bagging

In [40]:
rng = sklearn.utils.check_random_state(0)
ensemble = BaggingRegressor(base_estimator=RandomForestRegressor(),
                                n_estimators = 800,
                                max_samples=0.9,
                                bootstrap=True,
                                random_state=rng).fit(X_train, y_train)

In [41]:
result2 = ensemble.predict(X_test)

In [42]:
roc_auc_score(y_test, result2)

0.85234541481913761

# Summit



In [77]:
to_pred = pd.read_csv("csv/trocafone_kaggle_test.csv")
features = pd.read_csv("features.csv")

In [78]:
to_pred_completo = pd.merge(to_pred,test,on="person",how = "left").dropna()

In [79]:
to_pred_completo["label"] = grid_drop.best_estimator_.predict(to_pred_completo.drop(["person"],axis = 1))

In [80]:
to_pred = pd.merge(to_pred_completo[["person","label"]],to_pred,on="person",how="right")

In [81]:
#to_pred["label"] = to_pred["label"].apply(richGetRicher)
to_pred["label"].loc[to_pred["label"] < 0] = 0
to_pred["label"].loc[to_pred["label"] > 1] = 1

In [82]:
# mode < median < mean
to_pred = to_pred.fillna(to_pred["label"].mean())

In [83]:
to_pred.to_csv("sumit0.2", index=False)

In [84]:
def richGetRicher(x):
    if(x > .2):
        return x *10
    else: return x

In [85]:
to_pred["label"].isna().sum()

0