In [2]:
# Visualisation
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from pyvis.network import Network

# Data analysis / Data processing
import pandas as pd
# Widen pandas' display limits so large frames are printed without truncation:
# up to 500 rows and 500 columns, with output lines up to 1000 characters wide.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Render floats with thousands separators and two decimals (affects display only,
# not the stored values).
pd.options.display.float_format = "{:,.2f}".format
from datetime import time, timedelta, datetime
import numpy as np
import networkx as nx
from collections import defaultdict
import ast

# Maths & Stats
import math 
import scipy.stats as st
from scipy import stats
from scipy.stats import norm
import statsmodels.stats.weightstats as ws
from statsmodels.stats.proportion import test_proportions_2indep
import AB_library

# System library
import os
import ipywidgets
import warnings
# NOTE: this silences every warning for the rest of the session, including
# deprecation warnings raised by pandas and the other libraries imported here.
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
# Registers tqdm's progress_* methods (e.g. progress_apply) on pandas objects.
tqdm.pandas()
from IPython.display import display, HTML
# Inject CSS that sets `.container` elements to 80% width
# (presumably to widen the notebook page; the effect depends on the front end).
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_format='retina'
# from itables import init_notebook_mode
# init_notebook_mode(all_interactive=True)
import openpyxl

# Data connection
from google.cloud import bigquery
# Module-level BigQuery client bound to the 'analytics-dev-333113' project.
# cycle_sql() runs its queries through this shared client, whereas read_bq()
# builds its own client on every call.
bigquery_client = bigquery.Client(project='analytics-dev-333113')


# Useful functions
def cycle_sql(start, end, query, weeks=False):
    """
    Run `query` once per date between `start` and `end` and stack the results.

    The query text must contain a `{date}` placeholder. It is filled with each
    cycle date (formatted 'YYYY-MM-DD') through `str.format`, so any other
    literal curly braces in the query have to be doubled (`{{` / `}}`).

    Parameters
    ----------
    start, end : str
        Bounds of the date range in 'YYYY-MM-DD' format. `end` is included
        when it falls exactly on a step from `start`.
    query : str
        SQL text with a `{date}` placeholder.
    weeks : bool, default False
        When falsy, the query runs for every day in the range. When truthy,
        it runs every 7 days counting from `start`.

    Returns
    -------
    pandas.DataFrame
        The per-date results concatenated with the most recent date first;
        each chunk keeps its own index. An empty DataFrame is returned when
        `end` is earlier than `start`.
    """
    date_start = datetime.strptime(start, '%Y-%m-%d')
    date_end = datetime.strptime(end, '%Y-%m-%d')
    span_days = (date_end - date_start).days

    if weeks:
        # Step a whole week at a time; a trailing partial week is not queried.
        daterange = [(date_start + timedelta(weeks=x)).strftime('%Y-%m-%d') for x in range(span_days // 7 + 1)]
    else:
        daterange = [(date_start + timedelta(days=x)).strftime('%Y-%m-%d') for x in range(span_days + 1)]

    # Collect the chunks and concatenate once at the end: calling pd.concat
    # inside the loop re-copies every previously loaded row on each iteration.
    frames = []

    for counter, date in enumerate(daterange, start=1):
        print(f"{counter}) Uploading - {date}:", datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
        script = query.format(date=date)
        df_cycle = bigquery_client.query(script).to_dataframe()
        if df_cycle.empty:
            print('Dataframe is empty')
        frames.append(df_cycle)

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    # Reversed so the latest date comes first, matching the historical row
    # order of this helper (each new chunk used to be prepended).
    return pd.concat(frames[::-1])

def read_bq(query, project='analytics-dev-333113'):
    """
    Execute `query` on BigQuery and return the result as a DataFrame.

    A fresh `bigquery.Client` bound to `project` is created for every call.
    """
    return bigquery.Client(project=project).query(query).to_dataframe()

def display_side_by_side(*args):
    """
    Render several DataFrames next to each other in one output cell.

    Each argument must provide `to_html()` (e.g. a pandas DataFrame). The
    generated tables are switched to inline display so they flow horizontally
    instead of stacking vertically.
    """
    html_str = ''.join(df.to_html() for df in args)
    # Only opening <table> tags get the inline style; replacing the bare word
    # "table" would also rewrite closing tags and any cell text containing it.
    inline_html = html_str.replace('<table', '<table style="display:inline"')
    # Uses display/HTML, which this notebook imports from IPython.display
    # (display_html is not imported anywhere in the notebook).
    display(HTML(inline_html))

In [3]:
# Pull weekly counts of the id_doc_check ("CPF") screens per city from BigQuery.
# - cpf_raw keeps DISTINCT rows of the three `city.client.id_doc_check*.show`
#   events from 2024-07-01 onwards, joined to the macroregion mapping on city_id
#   and restricted to country_id = 11 (Brazil, per the inline SQL comment).
# - Every CASE has no ELSE branch, so non-matching events yield NULL and are
#   skipped by SUM in the final SELECT.
# - Only the `show` and `confirmed` counters are returned; the duplicated counter,
#   the timestamps and the payload fields are computed but not selected.
#   NOTE(review): they still take part in the SELECT DISTINCT de-duplication, so
#   removing them could change the counts -- verify before simplifying.
# - DATE_TRUNC(..., WEEK) buckets by weeks that start on Sunday (BigQuery default),
#   so the first bucket (filter starts on 2024-07-01) and the most recent bucket
#   can cover partial weeks -- assuming event_dt_part is a DATE; TODO confirm.
df = read_bq("""
WITH cpf_raw AS (SELECT DISTINCT e.app_version,
                                 e.user,
                                 e.city_id,
                                 e.user_id,
                                 e.name                                                             event_name,
                                 CAST(JSON_EXTRACT_SCALAR(payload, '$.registration') AS boolean) AS registration_value,
                                 JSON_EXTRACT_SCALAR(payload, '$.check_type')                    AS check_type_value,
                                 e.event_dt_part,
                                 e.is_new_order,
                                 os_name,
                                 city_name,
                                 country_id,
                                 country_name,
                                 CASE
                                     WHEN e.name = 'city.client.id_doc_check.show' THEN 1
                                     END
                                                                                                    count_show_cpf,
                                 CASE
                                     WHEN e.name = 'city.client.id_doc_check.show' THEN TIMESTAMP_MILLIS(e.client_time)
                                     END
                                                                                                    time_cpf_show,
                                 CASE
                                     WHEN e.name = 'city.client.id_doc_check_confirmed.show' THEN 1
                                     END
                                                                                                    count_confirmed_cpf,
                                 CASE
                                     WHEN e.name = 'city.client.id_doc_check_confirmed.show'
                                         THEN TIMESTAMP_MILLIS(e.client_time)
                                     END
                                                                                                    time_confirmed_cpf,
                                 CASE
                                     WHEN e.name = 'city.client.id_doc_check_duplicated.show' THEN 1
                                     END
                                                                                                    count_duplicated_cpf
                 FROM `indriver-e6e40.emart.swrve_event` e
                          JOIN
                      `indriver-bi.heap.vw_macroregion_mapping` AS m
                      ON
                          e.city_id = m.city_id
                 WHERE e.event_dt_part >= ('2024-07-01') -- start of updated cpf events
                   AND e.name IN ('city.client.id_doc_check.show',
                                  'city.client.id_doc_check_confirmed.show',
                                  'city.client.id_doc_check_duplicated.show')
                   AND m.country_id = 11 --sorting of Brazil,cpf checks only
)

SELECT city_name,
       country_name,
       DATE_TRUNC(event_dt_part, WEEK) AS weekly,
       SUM(count_show_cpf)             AS count_show_cpf,
       SUM(count_confirmed_cpf)        AS count_confirmed_cpf
FROM cpf_raw
-- WHERE city_id = 4142
GROUP BY 1, 2, 3
""")

# Preview the first rows: one row per (city, country, week) with both counters.
df.head()

Unnamed: 0,city_name,country_name,weekly,count_show_cpf,count_confirmed_cpf
0,Uberaba,Brazil,2024-08-18,1014,714
1,João Pessoa,Brazil,2024-08-18,1167,337
2,Recife,Brazil,2024-08-18,9228,3674
3,Fortaleza,Brazil,2024-08-18,1696,1282
4,Dourados,Brazil,2024-08-18,642,467


In [11]:
# Collapse the per-city rows into one total per week; groupby sorts by `weekly`,
# so the lines are drawn in chronological order.
cpf_weekly_totals = df.groupby('weekly', as_index=False)[['count_confirmed_cpf', 'count_show_cpf']].sum()

# Wide-form line chart: one trace per counter column. Plotly names the melted
# axes "value" / "variable", so they are relabelled to keep the figure readable
# on its own.
px.line(
    cpf_weekly_totals,
    x='weekly',
    y=['count_confirmed_cpf', 'count_show_cpf'],
    title='Weekly CPF check screens: shown vs confirmed (all cities in df)',
    labels={'weekly': 'Week starting', 'value': 'Events', 'variable': 'Counter'},
)