# Survival analysis para la empresa JAMPP

El objetivo es analizar los datos de la empresa JAMPP para saber cuando un usuario volverá a conectarse a una app, dado sus datos de conexiones previas. La segunda predicción que intentaremos hacer es saber cuando volverá a "convertir", es decir comprar la app de la cual tiene una publicidad en su movil a un momento dado.



In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lifelines
import category_encoders as ce
from sklearn.linear_model import LinearRegression
from sklearn.metrics import explained_variance_score
import xgboost
from func import generateSessions
from func import agregarTiempoPromedio

In [2]:
ev_types = {'event_id':'int64',
            'ref_type':'category',
            'ref_hash':'int64',
            'application_id':'int64',
            'attributed':'bool',
            'device_countrycode':'category',
            'device_os_version':'float64',
            'device_brand':'float64',
            'device_model':'float64',
            'device_city':'float64',
            'session_user_agent':'float64',
            'trans_id':'category',
            'user_agent':'float64',
            'event_uuid':'object',
            'carrier':'float64',
            'kind':'float64',
            'device_os':'category',
            'wifi':'category',
            'connection_type':'category',
            'ip_address':'int64',
            'device_language':'float64'}
events = pd.read_csv('./archivos/events.csv', dtype = ev_types)
installs = pd.read_csv('./archivos/installs.csv', dtype = {'wifi':'category'})

  interactivity=interactivity, compiler=compiler, result=result)


### Estudio Datasets de installs y events

### Para aliviar los sets, sacamos las columnas que no me sirvan.
- agrego columna para discriminar una instalacion
- casteo datetimes
- casteo wifi info

In [50]:
ecur = events[['ref_hash', 'event_id', 'date', 'application_id', 'wifi', 'device_model', 'event_uuid']]
icur = installs[['ref_hash', 'created', 'application_id', 'wifi', 'device_model', 'event_uuid']]
icur.rename(columns={'created':'date'},inplace=True)
icur['date'] = pd.to_datetime(icur['date'], format = '%Y-%m-%d %H:%M:%S')
ecur['date'] = pd.to_datetime(ecur['date'], format = '%Y-%m-%d %H:%M:%S')
ecur['ins'] = 0
icur['ins'] = 1
icur['event_id'] = -1
icur.loc[(icur.wifi == 'true'), 'binwifi'] = 1
icur.loc[(icur.wifi == 'false'), 'binwifi'] = 0
ecur.loc[(ecur.wifi == 'True'), 'binwifi'] = 1
ecur.loc[(ecur.wifi == 'False'), 'binwifi'] = 0
ecur.drop(columns= 'wifi',inplace=True)
icur.drop(columns= 'wifi',inplace=True)
icur.binwifi.fillna(value=0.7,inplace=True)
ecur.binwifi.fillna(value=0.3,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexin

# Concateno installs y events

In [52]:
insev = pd.concat([icur,ecur])

### Elimino instalaciones duplicadas

In [53]:
insev.loc[(insev.duplicated(subset='event_uuid')), 'ins'] = 1
insev.drop_duplicates(subset='event_uuid', inplace=True)

### Creo ventanas de 3 dias

In [54]:
insev1 = insev.loc[(insev["date"] < "2019-04-21")]
insev1 = insev1.loc[(insev1["date"] > "2019-04-18")]

In [55]:
insev2 = insev.loc[(insev["date"] < "2019-04-22")]
insev2 = insev2.loc[(insev2["date"] > "2019-04-19")]

In [56]:
insev3 = insev.loc[(insev["date"] < "2019-04-23")]
insev3 = insev3.loc[(insev3["date"] > "2019-04-20")]

In [57]:
insev4 = insev.loc[(insev["date"] < "2019-04-24")]
insev4 = insev4.loc[(insev4["date"] > "2019-04-21")]

In [58]:
insev5 = insev.loc[(insev["date"] < "2019-04-25")]
insev5 = insev5.loc[(insev5["date"] > "2019-04-22")]

### En todas las ventanas elegidas, vamos a agregar una columna de fecha inicial para poder basar los tiempos en base a eso. Despues podremos restar el tiempo inicial al tiempo absoluto para hacer modelos en segundos pasados.

In [59]:
fecha_inicial_1 = pd.to_datetime("2019-04-18 00:00:00")
fecha_inicial_2 = pd.to_datetime("2019-04-19 00:00:00")
fecha_inicial_3 = pd.to_datetime("2019-04-20 00:00:00")
fecha_inicial_4 = pd.to_datetime("2019-04-21 00:00:00")
fecha_inicial_5 = pd.to_datetime("2019-04-22 00:00:00")
insev1['delta_t'] = (insev1.date - fecha_inicial_1)
insev1["delta_t"] = insev1["delta_t"].dt.total_seconds()
insev2['delta_t'] = (insev2.date - fecha_inicial_2)
insev2["delta_t"] = insev2["delta_t"].dt.total_seconds()
insev3['delta_t'] = (insev3.date - fecha_inicial_3)
insev3["delta_t"] = insev3["delta_t"].dt.total_seconds()
insev4['delta_t'] = (insev4.date - fecha_inicial_4)
insev4["delta_t"] = insev4["delta_t"].dt.total_seconds()
insev5['delta_t'] = (insev5.date - fecha_inicial_5)
insev5["delta_t"] = insev5["delta_t"].dt.total_seconds()

### Genero sesiones por usuario

In [60]:
insev1.reset_index(inplace=True)
sess1 = generateSessions(insev1)
insev1['session'] = sess1

In [61]:
insev2.reset_index(inplace=True)
sess2 = generateSessions(insev2)
insev2['session'] = sess2

In [62]:
insev3.reset_index(inplace=True)
sess3 = generateSessions(insev3)
insev3['session'] = sess3

In [63]:
insev4.reset_index(inplace=True)
sess4 = generateSessions(insev4)
insev4['session'] = sess4

In [64]:
insev5.reset_index(inplace=True)
sess5 = generateSessions(insev5)
insev5['session'] = sess5

Agregamos una columna _sum_ para poder contar la cantidad de veces que aparece un device en una ventana de tres días.

In [65]:
insev1["sum"] = 1
insev2["sum"] = 1
insev3["sum"] = 1
insev4["sum"] = 1
insev5["sum"] = 1

### AGREGO FEATURES IMPORTANTES QUE LUEGO AGRUPARE POR USUARIO

In [66]:
#AGREGO COLUMNAS DE DATOS EN INSTALACIONES
insev1.loc[(insev1.ins==1), 'delta_t_ins'] = insev1.delta_t
insev1.loc[(insev1.ins==1), 'app'] = insev1.application_id
insev1.loc[(insev1.ins==1), 'ins_sess'] = insev1.session
insev1.loc[(insev1.ins==1), 'wifi_on_ins'] = insev1.binwifi

insev2.loc[(insev2.ins==1), 'delta_t_ins'] = insev2.delta_t
insev2.loc[(insev2.ins==1), 'app'] = insev2.application_id
insev2.loc[(insev2.ins==1), 'ins_sess'] = insev2.session
insev2.loc[(insev2.ins==1), 'wifi_on_ins'] = insev2.binwifi

insev3.loc[(insev3.ins==1), 'delta_t_ins'] = insev3.delta_t
insev3.loc[(insev3.ins==1), 'app'] = insev3.application_id
insev3.loc[(insev3.ins==1), 'ins_sess'] = insev3.session
insev3.loc[(insev3.ins==1), 'wifi_on_ins'] = insev3.binwifi

insev4.loc[(insev4.ins==1), 'delta_t_ins'] = insev4.delta_t
insev4.loc[(insev4.ins==1), 'app'] = insev4.application_id
insev4.loc[(insev4.ins==1), 'ins_sess'] = insev4.session
insev4.loc[(insev4.ins==1), 'wifi_on_ins'] = insev4.binwifi

insev5.loc[(insev5.ins==1), 'delta_t_ins'] = insev5.delta_t
insev5.loc[(insev5.ins==1), 'app'] = insev5.application_id
insev5.loc[(insev5.ins==1), 'ins_sess'] = insev5.session
insev5.loc[(insev5.ins==1), 'wifi_on_ins'] = insev5.binwifi


In [67]:
#ORDENO POR USUARIO Y DATE
insev1.sort_values(by=['ref_hash','delta_t'], inplace=True)
insev2.sort_values(by=['ref_hash','delta_t'], inplace=True)
insev3.sort_values(by=['ref_hash','delta_t'], inplace=True)
insev4.sort_values(by=['ref_hash','delta_t'], inplace=True)
insev5.sort_values(by=['ref_hash','delta_t'], inplace=True)

In [80]:
insev1.drop(columns=['index','date','event_uuid'], inplace=True)
insev2.drop(columns=['index','date','event_uuid'], inplace=True)
insev3.drop(columns=['index','date','event_uuid'], inplace=True)
insev4.drop(columns=['index','date','event_uuid'], inplace=True)
insev5.drop(columns=['index','date','event_uuid'], inplace=True)

### Genero copia de filas con installs mayor a cero que usare para calcular el t promedio entre installs

In [70]:
t_inst_1 = insev1[insev1.ins == 1]
t_inst_2 = insev1[insev2.ins == 1]
t_inst_3 = insev1[insev3.ins == 1]
t_inst_4 = insev1[insev4.ins == 1]
t_inst_5 = insev1[insev5.ins == 1]

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


In [71]:
t_inst_1 = t_inst_1[['delta_t', 'ref_hash']]
t_inst_2 = t_inst_2[['delta_t', 'ref_hash']]
t_inst_3 = t_inst_3[['delta_t', 'ref_hash']]
t_inst_4 = t_inst_4[['delta_t', 'ref_hash']]
t_inst_5 = t_inst_5[['delta_t', 'ref_hash']]

In [73]:
t_inst_1 = agregarTiempoPromedio(t_inst_1)
t_inst_2 = agregarTiempoPromedio(t_inst_2)
t_inst_3 = agregarTiempoPromedio(t_inst_3)
t_inst_4 = agregarTiempoPromedio(t_inst_4)
t_inst_5 = agregarTiempoPromedio(t_inst_5)

In [74]:
t_inst_1 = t_inst_1[t_inst_1.time_diff != 0]
t_inst_2 = t_inst_2[t_inst_2.time_diff != 0]
t_inst_3 = t_inst_3[t_inst_3.time_diff != 0]
t_inst_4 = t_inst_4[t_inst_4.time_diff != 0]
t_inst_5 = t_inst_5[t_inst_5.time_diff != 0]

In [75]:
t_inst_1 = t_inst_1[['ref_hash','time_diff']].groupby('ref_hash').mean()
t_inst_2 = t_inst_2[['ref_hash','time_diff']].groupby('ref_hash').mean()
t_inst_3 = t_inst_3[['ref_hash','time_diff']].groupby('ref_hash').mean()
t_inst_4 = t_inst_4[['ref_hash','time_diff']].groupby('ref_hash').mean()
t_inst_5 = t_inst_5[['ref_hash','time_diff']].groupby('ref_hash').mean()

In [76]:
t_inst_1 = t_inst_1.stack().reset_index()
t_inst_1 = t_inst_1.drop(columns = ['level_1'])
t_inst_2 = t_inst_2.stack().reset_index()
t_inst_2 = t_inst_2.drop(columns = ['level_1'])
t_inst_3 = t_inst_3.stack().reset_index()
t_inst_3 = t_inst_3.drop(columns = ['level_1'])
t_inst_4 = t_inst_4.stack().reset_index()
t_inst_4 = t_inst_4.drop(columns = ['level_1'])
t_inst_5 = t_inst_5.stack().reset_index()
t_inst_5 = t_inst_5.drop(columns = ['level_1'])

In [77]:
t_inst_1.columns = ['ref_hash', 'tiempo_prom_entre_installs']
t_inst_2.columns = ['ref_hash', 'tiempo_prom_entre_installs']
t_inst_3.columns = ['ref_hash', 'tiempo_prom_entre_installs']
t_inst_4.columns = ['ref_hash', 'tiempo_prom_entre_installs']
t_inst_5.columns = ['ref_hash', 'tiempo_prom_entre_installs']

### Mergeo lo obtenido

In [78]:
insev1 = insev1.merge(t_inst_1, how = 'left')
insev2 = insev2.merge(t_inst_2, how = 'left')
insev3 = insev3.merge(t_inst_3, how = 'left')
insev4 = insev4.merge(t_inst_4, how = 'left')
insev5 = insev5.merge(t_inst_5, how = 'left')

insev1.head()

Unnamed: 0,index,application_id,binwifi,date,device_model,event_id,event_uuid,ins,ref_hash,delta_t,session,sum,delta_t_ins,app,ins_sess,wifi_on_ins,tiempo_prom_entre_installs
0,398921,121,0.0,2019-04-18 21:11:50.326,1.658417e+18,-1,2f8be0cc-297e-4c9c-a097-1096aa5824b5,1,41863526108385,76310.326,0.0,1,76310.326,121.0,0.0,0.0,326.205
1,3315474,121,0.0,2019-04-18 21:13:05.063,1.658417e+18,106,553003e4-be49-40d2-9e40-dfc22da88057,0,41863526108385,76385.063,0.0,1,,,,,326.205
2,112200,65,0.0,2019-04-18 21:17:16.531,1.658417e+18,-1,f07fd6db-2f55-4539-b9ab-76f18ff4f51d,1,41863526108385,76636.531,0.0,1,76636.531,65.0,0.0,0.0,326.205
3,3146626,65,0.0,2019-04-18 21:17:21.831,1.658417e+18,15,b9ca7213-fdc4-4638-9319-111edb95539e,0,41863526108385,76641.831,0.0,1,,,,,326.205
4,3713263,65,0.0,2019-04-18 21:17:58.316,1.658417e+18,13,9d6c84b4-5ff5-4879-879c-61f32454369d,0,41863526108385,76678.316,0.0,1,,,,,326.205


##### A los registros sin installs le doy un promedio entre installs de 3 dias

In [79]:
insev1.loc[insev1.ins == 0, 'tiempo_prom_entre_intalls'] = 259200
insev2.loc[insev2.ins == 0, 'tiempo_prom_entre_intalls'] = 259200
insev3.loc[insev3.ins == 0, 'tiempo_prom_entre_intalls'] = 259200
insev4.loc[insev4.ins == 0, 'tiempo_prom_entre_intalls'] = 259200
insev5.loc[insev5.ins == 0, 'tiempo_prom_entre_intalls'] = 259200

insev5.head()

Unnamed: 0,index,application_id,binwifi,date,device_model,event_id,event_uuid,ins,ref_hash,delta_t,session,sum,delta_t_ins,app,ins_sess,wifi_on_ins,tiempo_prom_entre_installs,tiempo_prom_entre_intalls
0,2190853,77,0.0,2019-04-23 15:01:20.504,2.019322e+18,7,cd2ed243-9e02-4620-9c6d-cea63b97e37b,0,40621409780134,140480.504,0.0,1,,,,,,259200.0
1,2190668,77,0.0,2019-04-23 15:01:35.855,2.019322e+18,364,cf8e678c-4321-45d1-a1ab-31ec717907e6,0,40621409780134,140495.855,0.0,1,,,,,,259200.0
2,2190580,77,0.0,2019-04-23 15:02:00.589,2.019322e+18,363,29588999-39c0-40e4-a2ae-c8c138535e68,0,40621409780134,140520.589,0.0,1,,,,,,259200.0
3,7727097,226,1.0,2019-04-23 17:32:16.586,7.403565e+18,15,81e366b6-1440-4c42-8ba4-e363198bd3b8,0,69039685746313,149536.586,0.0,1,,,,,,259200.0
4,7690417,226,1.0,2019-04-23 17:32:16.784,7.403565e+18,287,0694da9b-0427-46de-87e5-d4261bbcaa41,0,69039685746313,149536.784,0.0,1,,,,,,259200.0


### Agrupo y emprolijo los features

In [111]:
g_insev1 = insev1.groupby('ref_hash')\
               .agg({'session':'max','sum':'sum','ins':'sum','delta_t_ins':['min','mean'],'tiempo_prom_entre_installs':'min','wifi_on_ins':'mean'})
g_insev2 = insev2.groupby('ref_hash')\
               .agg({'session':'max','sum':'sum','ins':'sum','delta_t_ins':['min','mean'],'tiempo_prom_entre_installs':'max','wifi_on_ins':'mean'})
g_insev3 = insev3.groupby('ref_hash')\
               .agg({'session':'max','sum':'sum','ins':'sum','delta_t_ins':['min','mean'],'tiempo_prom_entre_installs':'max','wifi_on_ins':'mean'})
g_insev4 = insev4.groupby('ref_hash')\
               .agg({'session':'max','sum':'sum','ins':'sum','delta_t_ins':['min','mean'],'tiempo_prom_entre_installs':'max','wifi_on_ins':'mean'})
g_insev5 = insev5.groupby('ref_hash')\
               .agg({'session':'max','sum':'sum','ins':'sum','delta_t_ins':['min','mean'],'tiempo_prom_entre_installs':'max','wifi_on_ins':'mean'})

In [112]:
g_insev1.columns = ['cant_sess','cant_ev','cant_ins','t_1er_ins','t_prom_ins','t_prom_entre_ins','prop_wifi_ins']
g_insev2.columns = ['cant_sess','cant_ev','cant_ins','t_1er_ins','t_prom_ins','t_prom_entre_ins','prop_wifi_ins']
g_insev3.columns = ['cant_sess','cant_ev','cant_ins','t_1er_ins','t_prom_ins','t_prom_entre_ins','prop_wifi_ins']
g_insev4.columns = ['cant_sess','cant_ev','cant_ins','t_1er_ins','t_prom_ins','t_prom_entre_ins','prop_wifi_ins']
g_insev5.columns = ['cant_sess','cant_ev','cant_ins','t_1er_ins','t_prom_ins','t_prom_entre_ins','prop_wifi_ins']

In [113]:
g_insev1.t_1er_ins.fillna(value=259200,inplace=True)
g_insev1.t_prom_ins.fillna(value=259200,inplace=True)
g_insev1.t_prom_entre_ins.fillna(value=259200,inplace=True)
g_insev1.prop_wifi_ins.fillna(value=0,inplace=True)

g_insev2.t_1er_ins.fillna(value=259200,inplace=True)
g_insev2.t_prom_ins.fillna(value=259200,inplace=True)
g_insev2.t_prom_entre_ins.fillna(value=259200,inplace=True)
g_insev2.prop_wifi_ins.fillna(value=0,inplace=True)

g_insev3.t_1er_ins.fillna(value=259200,inplace=True)
g_insev3.t_prom_ins.fillna(value=259200,inplace=True)
g_insev3.t_prom_entre_ins.fillna(value=259200,inplace=True)
g_insev3.prop_wifi_ins.fillna(value=0,inplace=True)

g_insev4.t_1er_ins.fillna(value=259200,inplace=True)
g_insev4.t_prom_ins.fillna(value=259200,inplace=True)
g_insev4.t_prom_entre_ins.fillna(value=259200,inplace=True)
g_insev4.prop_wifi_ins.fillna(value=0,inplace=True)

g_insev5.t_1er_ins.fillna(value=259200,inplace=True)
g_insev5.t_prom_ins.fillna(value=259200,inplace=True)
g_insev5.t_prom_entre_ins.fillna(value=259200,inplace=True)
g_insev5.prop_wifi_ins.fillna(value=0,inplace=True)


In [114]:
g_insev1.cant_sess = g_insev1['cant_sess'].astype(np.int64)
g_insev1.t_1er_ins = g_insev1['t_1er_ins'].astype(np.int64)
g_insev1.t_prom_ins = g_insev1['t_prom_ins'].astype(np.int64)
g_insev1.t_prom_entre_ins = g_insev1['t_prom_entre_ins'].astype(np.int64)
g_insev1.prop_wifi_ins = (g_insev1['prop_wifi_ins']*100).astype(np.int64)
g_insev1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165213 entries, 41863526108385 to 9223314893254762361
Data columns (total 7 columns):
cant_sess           165213 non-null int64
cant_ev             165213 non-null int64
cant_ins            165213 non-null int64
t_1er_ins           165213 non-null int64
t_prom_ins          165213 non-null int64
t_prom_entre_ins    165213 non-null int64
prop_wifi_ins       165213 non-null int64
dtypes: int64(7)
memory usage: 10.1 MB


In [115]:
g_insev1.head()

Unnamed: 0_level_0,cant_sess,cant_ev,cant_ins,t_1er_ins,t_prom_ins,t_prom_entre_ins,prop_wifi_ins
ref_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
41863526108385,0,88,2,76310,76473,326,0
161514654074162,0,8,0,259200,259200,259200,0
186034136943920,0,13,1,60042,60042,60042,70
360710529886978,0,3,0,259200,259200,259200,0
365882020742330,0,57,0,259200,259200,259200,0


### Creo arrays para regresion lineal

In [116]:
array_insev1 = insev1.values
array_insev2 = insev2.values
array_insev3 = insev3.values
array_insev4 = insev4.values
array_insev5 = insev5.values

In [117]:
X_insev1 = array_insev1[:, :-1]
y_insev1 = array_insev1[:, -1]
X_insev2 = array_insev2[:, :-1]
y_insev2 = array_insev2[:, -1]
X_insev3 = array_insev3[:, :-1]
y_insev3 = array_insev3[:, -1]
X_insev4 = array_insev4[:, :-1]
y_insev4 = array_insev4[:, -1]
X_insev5 = array_insev5[:, :-1]
y_insev5 = array_insev5[:, -1]


In [124]:
y_insev1

array([    nan, 259200.,     nan, ..., 259200., 259200., 259200.])

### Hago el fit

In [119]:
reg_insev1 = LinearRegression().fit(X_insev1, y_insev1)
reg_insev2 = LinearRegression().fit(X_insev2, y_insev2)
reg_insev3 = LinearRegression().fit(X_insev3, y_insev3)
reg_insev4 = LinearRegression().fit(X_insev4, y_insev4)
reg_insev5 = LinearRegression().fit(X_insev5, y_insev5)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

### Obtengo Score

In [73]:
reg_insev1.score(X_insev1, y_insev1)

In [76]:
reg_insev1.coef_

In [98]:
reg_insev2.coef_

array([-5.85696566e+00, -1.41389940e+01, -3.73757482e+03,  9.33396781e-01])

### En esa regresión, se da mucha importancia al feature 3, es decir la cantidad de installs.

In [80]:
#prediccion_insev1_en2 = reg_insev1.predict(X_insev2)

In [102]:
scoresSobreVentanaSiguiente = [reg_insev1.score(X_insev2, y_insev2),\
                               reg_insev2.score(X_insev3, y_insev3),\
                               reg_insev3.score(X_insev4, y_insev4),\
                               reg_insev4.score(X_insev5, y_insev5)]

In [103]:
scoresSobreVentanaSiguiente

[0.9020125359249261, 0.9063672310274357, 0.919034337499054, 0.9191289490852438]

## Con XGBoost

In [104]:
xgb_insev1 = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7, objective='reg:squarederror')
xgb_insev2 = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7, objective='reg:squarederror')
xgb_insev3 = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7, objective='reg:squarederror')
xgb_insev4 = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7, objective='reg:squarederror')
xgb_insev5 = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7, objective='reg:squarederror')

In [105]:
xgb_insev1.fit(X_insev1, y_insev1)
xgb_insev2.fit(X_insev2, y_insev2)
xgb_insev3.fit(X_insev3, y_insev3)
xgb_insev4.fit(X_insev4, y_insev4)
xgb_insev5.fit(X_insev5, y_insev5)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.08, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:squarederror',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=0.75, verbosity=1)

In [106]:
predict1_2 = xgb_insev1.predict(X_insev2)
predict2_3 = xgb_insev2.predict(X_insev3)
predict3_4 = xgb_insev3.predict(X_insev4)
predict4_5 = xgb_insev4.predict(X_insev5)

In [107]:
xgbScoresSobreVentanaSiguiente = [explained_variance_score(predict1_2,y_insev2),\
                                  explained_variance_score(predict2_3,y_insev3),\
                                  explained_variance_score(predict3_4,y_insev4),\
                                  explained_variance_score(predict4_5,y_insev5)]


In [94]:
#print(explained_variance_score(predictVentanaUnoSelf,y_insev1))

0.9674227884767749


In [108]:
xgbScoresSobreVentanaSiguiente

[0.950279878779768, 0.9527291356692622, 0.9530554877670591, 0.967660875441266]

### TODO LO SIGUIENTE NO LO TOQUE, esta como lo dejo yorick

### Ahora recompilamos nuestros resultados con los device_id correspondientes.

In [164]:
dfPredictVentanaUnoSobreVentanaDos = pd.DataFrame(predict1_2, columns = ['prediccion_1'])
dfPredictVentanaDosSobreVentanaTres = pd.DataFrame(predict2_3, columns = ['prediccion_2'])
dfPredictVentanaTresSobreVentanaCuatro = pd.DataFrame(predict3_4, columns = ['prediccion_3'])
dfPredictVentanaCuatroSobreVentanaCinco = pd.DataFrame(predict4_5, columns = ['prediccion_4'])

In [165]:
dfPredictVentanaDosSobreVentanaTres.head()

Unnamed: 0,prediccion_2
0,259110.15625
1,259110.15625
2,158814.703125
3,259110.15625
4,259110.15625


In [174]:
dfPrediccionVentanaDosRegresionA = insev2[['ref_hash']]
dfPrediccionVentanaTresRegresionA = insev3[['ref_hash']]
dfPrediccionVentanaCuatroRegresionA = insev4[['ref_hash']]
dfPrediccionVentanaCincoRegresionA = insev5[['ref_hash']]

In [175]:
dfPrediccionVentanaDosRegresionA = dfPrediccionVentanaDosRegresionA.reset_index(drop = True)
dfPrediccionVentanaTresRegresionA = dfPrediccionVentanaTresRegresionA.reset_index(drop = True)
dfPrediccionVentanaCuatroRegresionA = dfPrediccionVentanaCuatroRegresionA.reset_index(drop = True)
dfPrediccionVentanaCincoRegresionA = dfPrediccionVentanaCincoRegresionA.reset_index(drop = True)

In [176]:
dfPrediccionVentanaTresRegresionA.head()

Unnamed: 0,ref_hash
0,40621409780134
1,41863526108385
2,161514654074162
3,168103949904656
4,186034136943920


In [177]:
dfPrediccionVentanaDosRegresionA = pd.concat([dfPrediccionVentanaDosRegresionA,\
                                             dfPredictVentanaUnoSobreVentanaDos], axis = 1)
dfPrediccionVentanaTresRegresionA = pd.concat([dfPrediccionVentanaTresRegresionA,\
                                             dfPredictVentanaDosSobreVentanaTres], axis = 1)
dfPrediccionVentanaCuatroRegresionA = pd.concat([dfPrediccionVentanaCuatroRegresionA,\
                                             dfPredictVentanaTresSobreVentanaCuatro], axis = 1)
dfPrediccionVentanaCincoRegresionA = pd.concat([dfPrediccionVentanaCincoRegresionA,\
                                             dfPredictVentanaCuatroSobreVentanaCinco], axis = 1)

In [178]:
dfPrediccionVentanaCincoRegresionA.head(5)

Unnamed: 0,ref_hash,prediccion_4
0,40621409780134,259113.359375
1,69039685746313,259113.359375
2,90072729247980,259113.359375
3,161514654074162,100249.804688
4,168103949904656,259113.359375


### Ahora mergeamos los cuatro dataframes para tener el promedio asociado a cada modelo en una columna distinta. Despues de so, haremos un promedio de las predicciones y lo tomaremos como Survival Time para un install.

In [291]:
dfPrediccionSc = pd.merge(dfPrediccionVentanaDosRegresionA, dfPrediccionVentanaTresRegresionA,\
                        on = 'ref_hash', how = 'outer')

In [292]:
dfPrediccionSc = pd.merge(dfPrediccionSc, dfPrediccionVentanaCuatroRegresionA,\
                        on = 'ref_hash', how = 'outer')
dfPrediccionSc = pd.merge(dfPrediccionSc, dfPrediccionVentanaCincoRegresionA,\
                        on = 'ref_hash', how = 'outer')

In [235]:
len(dfPrediccionSc)

273948

In [293]:
dfPrediccionSc.head(5)

Unnamed: 0,ref_hash,prediccion_1,prediccion_2,prediccion_3,prediccion_4
0,40621409780134,259119.390625,259110.15625,259114.703125,259113.359375
1,41863526108385,259119.390625,259110.15625,,
2,161514654074162,259126.875,158814.703125,193308.0625,100249.804688
3,360710529886978,259119.390625,259110.15625,259114.703125,259113.359375
4,365882020742330,259119.390625,259110.15625,259114.703125,259113.359375


In [294]:
dfPrediccionSc = dfPrediccionSc.fillna(value = 259200)

In [295]:
dfPrediccionSc.head()

Unnamed: 0,ref_hash,prediccion_1,prediccion_2,prediccion_3,prediccion_4
0,40621409780134,259119.390625,259110.15625,259114.703125,259113.359375
1,41863526108385,259119.390625,259110.15625,259200.0,259200.0
2,161514654074162,259126.875,158814.703125,193308.0625,100249.804688
3,360710529886978,259119.390625,259110.15625,259114.703125,259113.359375
4,365882020742330,259119.390625,259110.15625,259114.703125,259113.359375


In [296]:
dfPrediccionSc['prediccion_promedia'] = 0.25*(dfPrediccionSc.prediccion_1 + dfPrediccionSc.prediccion_2 +\
dfPrediccionSc.prediccion_3 + dfPrediccionSc.prediccion_4)

In [297]:
dfPrediccionSc.head()

Unnamed: 0,ref_hash,prediccion_1,prediccion_2,prediccion_3,prediccion_4,prediccion_promedia
0,40621409780134,259119.390625,259110.15625,259114.703125,259113.359375,259114.40625
1,41863526108385,259119.390625,259110.15625,259200.0,259200.0,259157.390625
2,161514654074162,259126.875,158814.703125,193308.0625,100249.804688,177874.859375
3,360710529886978,259119.390625,259110.15625,259114.703125,259113.359375,259114.40625
4,365882020742330,259119.390625,259110.15625,259114.703125,259113.359375,259114.40625


In [298]:
dfPrediccionSc = dfPrediccionSc[['ref_hash', 'prediccion_promedia']]

### Ahora vamos a completar con todos los ref_hash que nunca aparecieron en nuestro estudio pero que son parte del estudio.

In [299]:
dfPrediccionSc['ref_hash'] = dfPrediccionSc.ref_hash.astype(str) + '_sc'

In [300]:
dfPrediccionSc.head(3)

Unnamed: 0,ref_hash,prediccion_promedia
0,40621409780134_sc,259114.40625
1,41863526108385_sc,259157.390625
2,161514654074162_sc,177874.859375


In [255]:
target = pd.read_csv('./target_competencia_ids.csv')

In [191]:
target.dtypes

ref_hash    object
obj          int64
dtype: object

In [256]:
target.head(3)

Unnamed: 0,ref_hash,obj
0,1000169251625791246_sc,0
1,1000169251625791246_st,0
2,1000395625957344683_sc,0


In [316]:
targetUpdated = pd.merge(target, dfPrediccionSc, on = 'ref_hash', how = 'left')

In [317]:
targetUpdated.head(15)

Unnamed: 0,ref_hash,obj,prediccion_promedia
0,1000169251625791246_sc,0,259114.40625
1,1000169251625791246_st,0,
2,1000395625957344683_sc,0,161908.0
3,1000395625957344683_st,0,
4,1003027494996471685_sc,0,182100.3125
5,1003027494996471685_st,0,
6,1006670001679961544_sc,0,195578.828125
7,1006670001679961544_st,0,
8,1007573308966476713_sc,0,259114.40625
9,1007573308966476713_st,0,


### Si nunca apareció en nuestro analisis, le ponemos un valor de tres dias.

In [318]:
targetUpdated =  targetUpdated.fillna(value = 259200)

In [319]:
targetUpdated = targetUpdated.drop(columns = ['obj'])
targetUpdated.columns = ['ref_hash', 'obj']

### Consideremos que los valores muy cerca de 259200 segundos son una aproximación para tres días asi las cambiamos.

In [320]:
targetUpdated.loc[targetUpdated.obj > 255000, 'obj'] = 259200

### Dejando los valores de St(d) a 0 por ahora

In [321]:
targetUpdated.loc[targetUpdated.index % 2 != 0, 'obj'] = 0

In [322]:
targetUpdated.dtypes

ref_hash     object
obj         float64
dtype: object

In [323]:
targetUpdated['obj'] = targetUpdated.obj.round().astype(int)

In [324]:
targetUpdated.head(20)

Unnamed: 0,ref_hash,obj
0,1000169251625791246_sc,259200
1,1000169251625791246_st,0
2,1000395625957344683_sc,161908
3,1000395625957344683_st,0
4,1003027494996471685_sc,182100
5,1003027494996471685_st,0
6,1006670001679961544_sc,195579
7,1006670001679961544_st,0
8,1007573308966476713_sc,259200
9,1007573308966476713_st,0


In [325]:
targetUpdated.to_csv('targetConSc.csv', encoding='utf-8', index=False)