In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import sklearn.cross_validation
from sklearn.cross_validation import cross_val_score

In [2]:
def setPred(col):
    """Binarise a value: 1 when `col` is strictly greater than zero, otherwise 0."""
    return 1 if col > 0 else 0
    
def add_noise(series, noise_level):
    """Multiply each element of `series` by its own random factor.

    The factor for every element is ``1 + noise_level * z`` where ``z`` is drawn
    from ``np.random.randn`` (one draw per element), so ``noise_level=0``
    returns the values unchanged.
    """
    gaussian_draws = np.random.randn(len(series))
    scale_factors = 1 + noise_level * gaussian_draws
    return series * scale_factors

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Replace a categorical feature by a smoothed per-category mean of the target.

    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf

    For every category seen in `trn_series` the encoded value is
    ``prior * (1 - w) + category_mean * w`` with
    ``w = 1 / (1 + exp(-(count - min_samples_leaf) / smoothing))`` and
    ``prior = target.mean()``. Values with no matching category (for example
    categories that only occur in `tst_series`) are encoded as the prior.

    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series (same name as trn_series)
    target : target data as a pd.Series (same length as trn_series)
    min_samples_leaf (int) : category count at which the category mean and the
        prior are weighted equally (w = 0.5)
    smoothing (int) : scale of the sigmoid; smaller values make the switch from
        prior to category mean sharper
    noise_level : passed to add_noise for both outputs; 0 adds no noise

    Returns a tuple (encoded_train, encoded_test) of pd.Series named
    ``<feature name>_mean`` that carry the index of the corresponding input.
    """ 
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    key = trn_series.name

    # Per-category mean and count of the target.
    stats = (pd.concat([trn_series, target], axis=1)
             .groupby(by=key)[target.name]
             .agg(["mean", "count"]))

    # Sigmoid weight of the category mean: the bigger the count, the less the prior matters.
    # Kept in its own variable so the `smoothing` argument is not overwritten.
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    prior = target.mean()

    # Build the lookup table directly under a fixed column name. The previous version stored
    # the blend under `target.name` next to the helper columns "mean"/"count", so a target
    # named "mean" or "count" was dropped together with them and the merge failed.
    lookup = (prior * (1 - weight) + stats["mean"] * weight).rename('average').reset_index()

    def _encode(series):
        # Left merge keeps one output row per input row, in input order.
        encoded = pd.merge(
            series.to_frame(key),
            lookup,
            on=key,
            how='left')['average'].rename(key + '_mean').fillna(prior)
        # pd.merge does not keep the index so restore it
        encoded.index = series.index
        return encoded

    return add_noise(_encode(trn_series), noise_level), add_noise(_encode(tst_series), noise_level)

# preparar/tweaks datos

In [418]:
# Load the feature table and the training labels; both are joined on "person" in a later cell.
# low_memory=False makes pandas read the file in one pass instead of chunk by chunk.
features = pd.read_csv("features.csv",low_memory = False)
prediccion = pd.read_csv("csv/labels_training_set.csv")

In [255]:
# (disabled) optional filter: keep only rows whose "conversion" value is at most 5
#features = features.loc[features["conversion"]<=5]

In [419]:
# Attach the features to every labelled person (left join on "person"),
# then drop every row that still contains a missing value.
train = prediccion.merge(features, on="person", how="left").dropna()

In [420]:
# From here on `prediccion` is the label column of `train` (a Series aligned with train's rows),
# no longer the raw labels DataFrame loaded from CSV.
prediccion = train["label"]

In [421]:
# Every person in `features` that is not part of the labelled training frame forms the prediction set.
# .copy() gives `test` its own data, so later column assignments (test[col] = ...) act on a real
# DataFrame instead of a slice and no longer raise pandas' SettingWithCopyWarning.
test = features.loc[~features["person"].isin(train["person"])].copy()

In [422]:
# Categorical columns that are replaced, in both frames, by their smoothed target mean.
listaDeCategoricos = [
    "condition_no_convercion",
    "model_no_convercion",
    "color_no_convercion",
    "search_engine",
    "nombre mas usado dia",
    "ultimo_dia_de_la_semana_activo",
    "ultimo_evento_registrado_cmode",
]
# NOTE(review): the encoding is fitted on all of `train` before the later train_test_split,
# so the hold-out rows contribute to their own encoding -- validation scores may be optimistic.
for word in listaDeCategoricos:
    train[word], test[word] = target_encode(
        trn_series=train[word],
        tst_series=test[word],
        target=prediccion,
        min_samples_leaf=10,
        smoothing=0.1,
        noise_level=0.00,
    )


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [401]:
# Inspect the columns available after the merge and the target encoding.
train.columns

Index(['person', 'label', 'ad campaign hit', 'brand listing', 'checkout',
       'conversion', 'generic listing', 'lead', 'search engine hit',
       'searched products', 'staticpage', 'viewed product', 'visited site',
       'model_no_convercion', 'storage_no_convercion',
       'condition_no_convercion', 'color_no_convercion', 'search_engine',
       'numero mas usado dia', 'nombre mas usado dia',
       'ultimo_dia_de_la_semana_activo', 'ultimo_evento_registrado_cmode',
       'dias_hasta', 'viewed product_last', 'searched products_last',
       'ad campaign hit_last', 'staticpage_last', 'checkout_last',
       'search engine hit_last', 'conversion_last', 'generic listing_last',
       'brand listing_last', 'visited site_last', 'lead_last',
       'cantidad_de_dias_utilizado', 'media_dia_de_la_semana_no_convercion',
       'delta_ultimo_primer_dia_actividad',
       'delta_ultimo_anteulit_tiempo_actividad',
       'delta_primer_seg_tiempo_actividad', 'dTimeEveMax', 'dTimeEveMean0',


In [423]:
# Columns removed from both frames before modelling.
exclude = [
    "ultimo_dia_de_la_semana_activo",
    "cantidad_de_dias_utilizado",
    "media_dia_de_la_semana_no_convercion",
    "searched products_last",
    "storage_no_convercion",
    "search engine hit",
    "Estado Nuevo",
    "generic listing",
    "Estado Excelente",
    "search engine hit_last",
    "visited site",
    "dias_hasta",
    "staticpage",
    "dTimeEveMean0",
    "dHTimeEveMean0",
    "dTimeEveMax",
]
train = train.drop(columns=exclude)
test = test.drop(columns=exclude)

In [424]:
# Sanity check: number of labelled rows that survived the dropna().
prediccion.shape

(19126,)

In [425]:
# Keep only model inputs: the identifier and the target itself must not be features.
# (The label was already saved in `prediccion`.) Not idempotent: a second run raises KeyError.
train = train.drop(columns=["person", "label"])

In [426]:
# Coerce every column of `features` to a numeric dtype; values that cannot be parsed become NaN.
# NOTE(review): `train` and `test` were derived from `features` in earlier cells and `features`
# is reloaded from CSV before the submission step, so this does not appear to affect the model
# inputs; it also coerces the "person" column -- confirm whether train/test were the intended target.
features = features.apply(pd.to_numeric, errors="coerce")

In [427]:

# Hold out 30% of the labelled rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    prediccion,
    test_size=0.30,
    random_state=41,
)

# Random Forest
# historial cambios tweaks | score

columnas: 
'ad campaign hit', 'brand listing', 'checkout', 'generic listing',
       'lead', 'search engine hit', 'searched products', 'staticpage',
       'viewed product', 'visited site', 'level_0', 'storage_no_convercion',
       'numero mas usado dia', 'condition_no_convercion2',
       'color_no_convercion2', 'search_engine2', 'nombre mas usado dia2']
       
features contraproductivos:
    cantidad de dias utilizado, hora/mes/dia ultimo evento, ultimo dia de la semana activo

test_size = 0.3
cv = 20
n_estimator 50 | .78 (Kaggle .76)
n_estimator 100 | .79 

cambio smoothing basado en: conversiones previas -> conversiones futuras

test_size = 0.3
cv = 20
n_estimator 100 | .80 (Kaggle .78)

agrego ultimo evento

test_size = 0.3
cv = 20
n_estimator 100 | .793


agrego: ultimo dias hasta el 1/6,suceso de cada evento, rellenados con -1

test_size = 0.3
cv = 20
n_estimator 100 | .798

agrego: ultimo dia de la semana activo

test_size = 0.3
cv = 20
n_estimator 100 | .798

cambio relleno de cantidad de veces de cada evento -1 -> mediana
agrego  'delta_ultimo_primer_dia_actividad',
       'delta_ultimo_anteulit_tiempo_actividad',
       'delta_primer_seg_tiempo_actividad',
       'dTimeEve

test_size = 0.3
cv = 20
n_estimator 100 | .832



https://www.youtube.com/watch?v=BSUMBBFjxrY

In [265]:
# TODO: try a classifier instead (original note: "probar clasificacion").
# A regressor is fitted on the label; its continuous output is scored with ROC AUC in the cells below.
# random_state=None means results differ between runs.
randomforesttree = RandomForestRegressor(random_state = None, n_jobs=-1)
# Show the full parameter set, including library defaults.
randomforesttree.get_params()

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [266]:
# Single-point "grid": GridSearchCV is used here only to get a 20-fold cross-validated ROC AUC.
# NOTE: the name `grid_drop` is reused by the gradient-boosting search further down, so later
# cells see whichever search was fitted last.
param_grid = {
    'max_features': ["auto"],
    'n_estimators': [100],
    'criterion': ['mse'],
}
grid_drop = GridSearchCV(
    randomforesttree,
    param_grid,
    cv=20,
    scoring='roc_auc',
    n_jobs=-1,
    pre_dispatch=4,
)
grid_drop.fit(X_train, y_train)
print(grid_drop.best_params_, grid_drop.best_score_)

{'max_features': 'auto', 'n_estimators': 100, 'criterion': 'mse'} 0.835923548664443


In [267]:
# Score the hold-out split with the refitted best estimator of the most recently fitted search.
result = grid_drop.best_estimator_.predict(X_test)

In [268]:
# ROC AUC of the predictions on the 30% hold-out split.
roc_auc_score(y_test, result)

0.8364520528068147

In [269]:
# One row per training column, ordered from most to least important for the fitted estimator.
feature_importances = pd.DataFrame(
    {'importance': grid_drop.best_estimator_.feature_importances_},
    index=X_train.columns,
).sort_values(by='importance', ascending=False)

In [270]:
# Display the importance table built in the previous cell.
feature_importances

Unnamed: 0,importance
checkout,0.080363
numero mas usado dia,0.071972
Estado Bien,0.058443
viewed product,0.055205
ultimo_evento_registrado_cmode,0.051781
checkout_last,0.049783
brand listing,0.048309
color_no_convercion,0.044726
Estado Muy Bien,0.044279
dTimeEveMax,0.044197


# Gradient Boost
# historial cambios tweaks | score 

columnas: 
'ad campaign hit', 'brand listing', 'checkout', 'generic listing',
       'lead', 'search engine hit', 'searched products', 'staticpage',
       'viewed product', 'visited site', 'level_0', 'storage_no_convercion',
       'numero mas usado dia', 'condition_no_convercion2',
       'color_no_convercion2', 'search_engine2', 'nombre mas usado dia2']


cambio smoothing basado en: conversiones previas -> conversiones futuras

agrego ultimo evento

agrego dias hasta el 1/6

test_size = 0.3
cv = 20
n_estimator 100 | .82 (Kaggle .815)


agrego: ultimo dia de semana

test_size = 0.3
cv = 20
n_estimator 100 | .820

agrego: ultimo suceso de cada evento, rellenados con -1

test_size = 0.3
cv = 20
n_estimator 100 | .824

agrego: cantidad de dias utilizado

test_size = 0.3
cv = 20
n_estimator 100 | .824



In [428]:
# Gradient-boosting regressor fitted on the label; its continuous output is scored with ROC AUC below.
# random_state=None means results differ between runs.
gradientboost = GradientBoostingRegressor(random_state = None)
# Show the full parameter set, including library defaults.
gradientboost.get_params()

{'alpha': 0.9,
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'presort': 'auto',
 'random_state': None,
 'subsample': 1.0,
 'verbose': 0,
 'warm_start': False}

In [429]:
# Single-point "grid" again: a 20-fold cross-validated ROC AUC for the gradient-boosting model.
# This rebinds `param_grid` and `grid_drop`, replacing the random-forest search objects.
param_grid = {
    'max_features': ["auto"],
    'n_estimators': [30],
    'loss': ['ls'],
}
grid_drop = GridSearchCV(
    gradientboost,
    param_grid,
    cv=20,
    scoring='roc_auc',
)
grid_drop.fit(X_train, y_train)
print(grid_drop.best_params_, grid_drop.best_score_)


{'max_features': 'auto', 'loss': 'ls', 'n_estimators': 30} 0.865684344679409


In [430]:
# Hold-out predictions from the best estimator of the most recently fitted search.
result2 = grid_drop.best_estimator_.predict(X_test)

In [431]:
# ROC AUC of those predictions on the 30% hold-out split.
roc_auc_score(y_test, result2)

0.865663620826743

In [317]:
# Importance of every training column for the currently fitted best estimator, most important first.
feature_importances2 = pd.DataFrame(
    {'importance': grid_drop.best_estimator_.feature_importances_},
    index=X_train.columns,
).sort_values(by='importance', ascending=False)
feature_importances2

Unnamed: 0,importance
checkout,0.254717
checkout_last,0.138533
brand listing,0.059267
conversion,0.049978
dHTimeEveMax,0.042501
ad campaign hit_last,0.041523
visited site_last,0.040924
conversion_last,0.040748
Estado Bien,0.036329
color_no_convercion,0.035725


# Submit



In [435]:
# Persons for which a prediction must be submitted.
to_pred = pd.read_csv("csv/trocafone_kaggle_test.csv")
# NOTE(review): `features` is reloaded here, but the following cells merge against `test`
# (built earlier), not against this fresh copy -- confirm whether the reload is needed.
features = pd.read_csv("features.csv")

In [436]:
# Attach the prepared features to each person to predict; persons with any missing value are
# dropped here and receive a fallback label in a later cell.
to_pred_completo = to_pred.merge(test, on="person", how="left").dropna()

In [437]:
# Predict with the best estimator of whichever grid search was fitted last (`grid_drop` is reused
# for both models). Every column except "person" is passed to the model.
# NOTE(review): assumes these columns match X_train's columns and order -- confirm.
to_pred_completo["label"] = grid_drop.best_estimator_.predict(to_pred_completo.drop(["person"],axis = 1))

In [438]:
# Right join back onto the full submission list: persons that were dropped above get a NaN label.
to_pred = to_pred_completo[["person", "label"]].merge(to_pred, on="person", how="right")

In [439]:
#to_pred["label"] = to_pred["label"].apply(richGetRicher)
# Clamp the regressor output into [0, 1]. A single assignment of the clipped column replaces the
# chained `to_pred["label"].loc[mask] = value` form, which raised SettingWithCopyWarning and is
# not guaranteed to write through to `to_pred`. NaN labels are left as NaN.
to_pred["label"] = to_pred["label"].clip(lower=0, upper=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [440]:
# mode < median < mean
# (presumably the observed ranking of fill strategies -- TODO confirm)
# Persons without a prediction get the mean predicted label. Only the "label" column is filled;
# the previous DataFrame-wide fillna would also have written the label mean into any other
# column containing NaN.
to_pred["label"] = to_pred["label"].fillna(to_pred["label"].mean())

In [441]:
# Write the submission file without the index column (note: the file name has no .csv extension).
to_pred.to_csv("sumit0.0", index=False)

In [310]:
def richGetRicher(x):
    """Return `x` multiplied by 10 when it is strictly above 0.2, otherwise `x` unchanged."""
    return x * 10 if x > .2 else x

In [137]:
# Sanity check: number of labels that are still missing.
to_pred["label"].isna().sum()

0

In [60]:
# NOTE(review): `i` is not defined in any cell visible in this notebook; this looks like leftover
# kernel state and will fail on a fresh Restart & Run All -- confirm and remove.
i

0    0.007391
dtype: float64