In [107]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [108]:
import pandas as pd
import numpy as np

installs = pd.read_csv(r'Data/installs.csv.gzip',compression = 'gzip')

In [109]:
types = {'date':'object','event_id':'int64','ref_type':'category','ref_hash':'int64','application_id':'int64',\
         'attributed':'bool','device_countrycode':'category','device_os_version':'float64','device_brand':'float64',\
         'device_model':'float64','device_city':'float64','session_user_agent':'float64','trans_id':'category',\
         'user_agent':'float64','event_uuid':'object','carrier':'float64','kind':'float64','device_os':'category',\
         'wifi':'bool','connection_type':'category','ip_address':'int64','device_language':'float64'}

cols = ['date','event_id','ref_hash','application_id','attributed','device_brand','device_model'
       ,'event_uuid','carrier','kind','device_os','wifi']

events = pd.read_csv(r'Data/events.csv.gzip',compression = 'gzip', dtype = types, usecols = cols)

In [131]:

#PRE: 
#df['timestamp'] = timestamp de una fecha
#date fecha a comparar
#refhash para filtrar los events
#deltaTime para determinar el margen de aceptacion para exactmatch [Real+]

#POS:
#devuelve 2 dataframes, para checkear si estan vacios usar df.empty

def find_lowest_neighbour_in_time(df,date,refhash,useDeltaTime=False,deltaTime=0):
    df = df[df['ref_hash'] == refhash]
    exactmatch=df[(df['date']==date) | 
                  ((useDeltaTime) & 
                   ((date.timestamp()- df['timestamp'])< deltaTime) &
                  ((date.timestamp()- df['timestamp'])> 0))] # quiero solo los eventos previos a date
    lowerneighbour_ind = df[df['timestamp']<date.timestamp()].nlargest(1,'timestamp')
    return lowerneighbour_ind,exactmatch

In [111]:
events['date'] = pd.to_datetime(events['date'])

installs['created'] = pd.to_datetime(installs['created'])
installs['hour'] = installs['created'].apply(lambda x: x.hour)
installs['hour'] = installs['hour'].astype('int8')

installs['day'] = installs['created'].apply(lambda x: x.day)
installs['day'] = installs['day'].astype('int8')

installs['install_value'] = 1

In [112]:
installs['origin_event_uuid'] = None
installs['exact_event_uuid'] = None

In [122]:
events['timestamp'] = events['date'].apply(lambda x: x.timestamp())
events['timestamp'] = events['timestamp'].astype('float32')
events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2494423 entries, 0 to 2494422
Data columns (total 13 columns):
date              datetime64[ns]
event_id          int64
ref_hash          int64
application_id    int64
attributed        bool
device_brand      float64
device_model      float64
event_uuid        object
carrier           float64
kind              float64
device_os         category
wifi              object
timestamp         float32
dtypes: bool(1), category(1), datetime64[ns](1), float32(1), float64(4), int64(3), object(2)
memory usage: 204.6+ MB


In [114]:
installsByTime = installs.groupby('created').agg({'install_value':sum,'application_id':lambda x: x,'ref_hash':lambda x: x})
installsDevice = installs.groupby('ref_hash').agg({'install_value':sum,'application_id':lambda x: x,'created':lambda x: x})
installsByTime.head()

Unnamed: 0_level_0,install_value,application_id,ref_hash
created,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-03-05 00:00:38.219,1,5,5924583283197158397
2019-03-05 00:05:35.234,1,16,1740562126672148647
2019-03-05 00:07:25.431,1,16,760677655438420566
2019-03-05 00:07:38.572,1,8,5006759053482448963
2019-03-05 00:10:36.708,1,7,3318538943125451633


In [None]:
#Agrego los eventos de origen y exactos a events

In [151]:
for date in installsByTime.index.tolist():
    neighbour,exact = find_lowest_neighbour_in_time(events,date,installsByTime.loc[date,'ref_hash'],True,0.1)
    if neighbour.empty == False:
        install = installs[installs['created'] == date]
        value = neighbour.loc[neighbour.index[0],'event_uuid']
        installs.loc[install.index[0],'origin_event_uuid'] = value
    if exact.empty == False:
        install = installs[installs['created'] == date]
        value = exact.loc[exact.index[0],'event_uuid']
        installs.loc[install.index[0],'exact_event_uuid'] = value
        

In [152]:
instsource = installs.dropna(subset = ['origin_event_uuid'])
instexact = installs.dropna(subset = ['exact_event_uuid'])

In [153]:
#lista de apps con evento exacto
instexact['application_id'].unique()

array([ 0,  2,  5,  8,  9, 12, 15, 17, 21, 10, 24, 20,  7, 30, 16,  6, 33,
       29, 34])

In [119]:
#mi teoria es que las siguientes apps dejan marcada la instalacion como un evento
#para desprobarla debo buscar casos donde:
# 1) existan instalaciones no implicitas de las apps de la lista que dejen constancia del evento
# 2) existan instalaciones de las apps en la lista que no dejen constancia del evento
# 3) existan apps fuera de la lista que tengan constancia del evento

#por la forma en la que forme la lista (usando dropna), la tercera esta descartada.

In [154]:
instexact['implicit'].unique()

array([ True])

In [None]:
#Con esto demuestro que no hay instalaciones no implicitas que dejen constancia y que

In [155]:
installs_that_break_hypothesis = installs.loc[(installs['application_id'].isin(instexact['application_id'].unique())) & 
                  (installs['exact_event_uuid'] == None)]
installs_that_break_hypothesis

Unnamed: 0,created,application_id,ref_type,ref_hash,click_hash,attributed,implicit,device_countrycode,device_brand,device_model,...,kind,wifi,trans_id,ip_address,device_language,hour,day,install_value,origin_event_uuid,exact_event_uuid


In [None]:
#Con esto demuestro que no hay instalaciones de las apps que no dejen una constancia exacta

In [None]:
#Ahora, pruebo lo mismo dejando un pequeño margen de tiempo

array([ 0,  2,  5,  8,  9, 12, 15, 17, 21, 10, 24, 20,  7, 30, 16,  6, 33,
       29, 34])