In [1]:
import pandas as pd
import sklearn as skl
import xgboost as xgb
import category_encoders as ce
from datetime import timedelta
from sklearn.metrics import explained_variance_score

cols = ['date','device_id']
auctions_csv = pd.read_csv('./data/auctions.csv',usecols=cols)

In [3]:
auctions = auctions_csv.rename({'device_id':'ref_hash'}, axis = 1, inplace=False)

In [4]:
def agregar_st(hash_ref):
    return (str(hash_ref) + '_st' )

auctions['ref_hash'] = auctions['ref_hash'].apply(agregar_st)
auctions.head()

Unnamed: 0,date,ref_hash
0,2019-04-23 18:58:00.842116,2564673204772915246_st
1,2019-04-23 18:58:01.530771,4441121667607578179_st
2,2019-04-23 18:58:01.767562,7721769811471055264_st
3,2019-04-23 18:58:02.363468,6416039086842158968_st
4,2019-04-23 18:58:02.397559,1258642015983312729_st


In [5]:
target = pd.read_csv('./data/target_competencia_ids.csv', usecols = ['ref_hash'])

target.head()

Unnamed: 0,ref_hash
0,1000169251625791246_sc
1,1000169251625791246_st
2,1000395625957344683_sc
3,1000395625957344683_st
4,1003027494996471685_sc


In [7]:
auctions = auctions.merge(target, on='ref_hash', how='inner')
auctions.head()

Unnamed: 0,date,ref_hash
0,2019-04-23 18:58:28.872668,6781020342679131794_st
1,2019-04-23 19:11:06.104243,6781020342679131794_st
2,2019-04-23 19:19:04.310737,6781020342679131794_st
3,2019-04-23 19:20:17.592881,6781020342679131794_st
4,2019-04-23 19:21:51.217194,6781020342679131794_st


In [15]:
len(auctions['ref_hash'].unique())

4037

In [18]:
subastas = auctions
subastas.rename({'date':'t_sum'},axis=1,inplace=True)
subastas['n_subastas'] = 1
subastas['t_primera_subasta'] = subastas['t_sum']

subastas.head()

Unnamed: 0,t_sum,ref_hash,n_subastas,t_primera_subasta
0,2019-04-23 18:58:28.872668,6781020342679131794_st,1,2019-04-23 18:58:28.872668
1,2019-04-23 19:11:06.104243,6781020342679131794_st,1,2019-04-23 19:11:06.104243
2,2019-04-23 19:19:04.310737,6781020342679131794_st,1,2019-04-23 19:19:04.310737
3,2019-04-23 19:20:17.592881,6781020342679131794_st,1,2019-04-23 19:20:17.592881
4,2019-04-23 19:21:51.217194,6781020342679131794_st,1,2019-04-23 19:21:51.217194


In [24]:
def generar_ventanas(df, str_columna_fecha):
    FECHA_INICIO = pd.to_datetime(pd.to_datetime(df[str_columna_fecha].min()).strftime('%Y-%m-%d'))
    FECHA_FIN = pd.to_datetime(pd.to_datetime(df[str_columna_fecha].max()).strftime('%Y-%m-%d'))

    FECHA_UN_DIA = FECHA_INICIO + timedelta(days=1)
    FECHA_DOS_DIAS = FECHA_INICIO + timedelta(days=2)
    FECHA_TRES_DIAS = FECHA_INICIO + timedelta(days=3)
    FECHA_CUATRO_DIAS = FECHA_INICIO + timedelta(days=4)
    FECHA_CINCO_DIAS = FECHA_INICIO + timedelta(days=5)
    FECHA_SEIS_DIAS = FECHA_INICIO + timedelta(days=6)
    FECHA_SIETE_DIAS = FECHA_INICIO + timedelta(days=7)

    df[str_columna_fecha] = pd.to_datetime(df[str_columna_fecha])
    
    ventana_uno = df.loc[(df[str_columna_fecha] < FECHA_TRES_DIAS)]
    ventana_uno = ventana_uno.loc[(df[str_columna_fecha] > FECHA_INICIO)]

    ventana_dos = df.loc[(df[str_columna_fecha] < FECHA_CUATRO_DIAS)]
    ventana_dos = ventana_dos.loc[(df[str_columna_fecha] > FECHA_UN_DIA)]

    ventana_tres = df.loc[(df[str_columna_fecha] < FECHA_CINCO_DIAS)]
    ventana_tres = ventana_tres.loc[(df[str_columna_fecha] > FECHA_DOS_DIAS)]
    
    ventana_cuatro = df.loc[(df[str_columna_fecha] < FECHA_SEIS_DIAS)]
    ventana_cuatro = ventana_tres.loc[(df[str_columna_fecha] > FECHA_TRES_DIAS)]
    
    ventana_cinco = df.loc[(df[str_columna_fecha] < FECHA_SIETE_DIAS)]
    ventana_cinco = ventana_tres.loc[(df[str_columna_fecha] > FECHA_CUATRO_DIAS)]
    
    return ventana_uno, ventana_dos, ventana_tres, ventana_cuatro, ventana_cinco

subastas.sort_values(by='t_sum', axis=0, inplace=True)
subastas_ventana_uno, subastas_ventana_dos, subastas_ventana_tres, subastas_ventana_cuatro, subastas_ventana_cinco = generar_ventanas(subastas,'t_sum')

In [32]:
def t_sum(t_inst):
    suma = (pd.to_datetime(t_inst.iloc[0]) - pd.to_datetime(pd.to_datetime(t_inst.iloc[0]).strftime('%Y-%m-%d'))).total_seconds()
    for i in range(1, len(t_inst)):
        suma += (pd.to_datetime(t_inst.iloc[i]) - pd.to_datetime(t_inst.iloc[i-1])).total_seconds()
    return suma

def t_primera_subasta(t_inst):
    return (pd.to_datetime(t_inst.iloc[0]) - pd.to_datetime(pd.to_datetime(t_inst.iloc[0]).strftime('%Y-%m-%d'))).total_seconds()

sv1 = subastas_ventana_uno.groupby(['ref_hash']).agg({'t_sum':t_sum, 't_primera_subasta':t_primera_subasta,'n_subastas':'sum'})
sv2 = subastas_ventana_dos.groupby(['ref_hash']).agg({'t_sum':t_sum, 't_primera_subasta':t_primera_subasta,'n_subastas':'sum'})
sv3 = subastas_ventana_tres.groupby(['ref_hash']).agg({'t_sum':t_sum, 't_primera_subasta':t_primera_subasta, 'n_subastas':'sum'})
sv4 = subastas_ventana_cuatro.groupby(['ref_hash']).agg({'t_sum':t_sum, 't_primera_subasta':t_primera_subasta,'n_subastas':'sum'})
sv5 = subastas_ventana_cinco.groupby(['ref_hash']).agg({'t_sum':t_sum, 't_primera_subasta':t_primera_subasta, 'n_subastas':'sum'})

sv1.head()

Unnamed: 0_level_0,n_subastas,t_sum,t_primera_subasta
ref_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1000169251625791246_st,10,156901.93517,10509.519448
1000395625957344683_st,18,108005.261223,65601.9109
1003027494996471685_st,1,75911.48112,75911.48112
1007573308966476713_st,2,79555.025066,18137.251334
1010070503877148763_st,3,82077.156191,9396.648854


In [35]:
ventanaUnoData = sv1.drop(['t_primera_subasta'], axis=1).values
ventanaUnoTargets = sv1['t_primera_subasta'].values

ventanaDosData = sv2.drop(['t_primera_subasta'],axis=1).values
ventanaDosTargets = sv2['t_primera_subasta'].values

ventanaTresData = sv3.drop(['t_primera_subasta'], axis=1).values
ventanaTresTargets = sv3['t_primera_subasta'].values

ventanaCuatroData = sv4.drop(['t_primera_subasta'], axis=1).values
ventanaCuatroTargets = sv4['t_primera_subasta'].values

ventanaCincoData = sv5.drop(['t_primera_subasta'], axis=1).values
ventanaCincoTargets = sv5['t_primera_subasta'].values

xgbVentanaUno = xgb.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7, objective='reg:squarederror')

xgbVentanaDos = xgb.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7, objective='reg:squarederror')

xgbVentanaTres = xgb.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7, objective='reg:squarederror')

xgbVentanaCuatro = xgb.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7, objective='reg:squarederror')

xgbVentanaCinco = xgb.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7, objective='reg:squarederror')

In [36]:
xgbVentanaUno.fit(ventanaUnoData, ventanaUnoTargets)
xgbVentanaDos.fit(ventanaDosData, ventanaDosTargets)
xgbVentanaTres.fit(ventanaTresData, ventanaTresTargets)
xgbVentanaCuatro.fit(ventanaCuatroData, ventanaCuatroTargets)
xgbVentanaCinco.fit(ventanaCincoData, ventanaCincoTargets)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.08, max_delta_step=0,
             max_depth=7, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=0.75, verbosity=1)

In [46]:
prediccionV1_V4 = xgbVentanaUno.predict(ventanaCuatroData)
explained_variance_score(prediccionV1_V4, ventanaCuatroTargets)

-1.3327288476241623

In [47]:
# Me quiero morir