In [1]:
# Visualisation
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from pyvis.network import Network

# Data analysis / Data processing
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.float_format = "{:,.2f}".format
from datetime import time, timedelta, datetime
import numpy as np
import networkx as nx
from collections import defaultdict
import ast
from ydata_profiling import ProfileReport

# Maths & Stats
import math 
import scipy.stats as st
from scipy import stats
from scipy.stats import norm
import statsmodels.stats.weightstats as ws
from statsmodels.stats.proportion import test_proportions_2indep
import AB_library

# System library
import os
import ipywidgets
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
tqdm.pandas()
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_format='retina'
# from itables import init_notebook_mode
# init_notebook_mode(all_interactive=True)
import openpyxl

# Data connection
from google.cloud import bigquery
bigquery_client = bigquery.Client(project='analytics-dev-333113')


# Useful functions
def cycle_sql(start, end, query, weeks=False):
    """
    You have to use {date} in your script to add cycle date into this backets
    """
    date_start = datetime.strptime(start, '%Y-%m-%d')
    date_end = datetime.strptime(end, '%Y-%m-%d')

    if weeks == False:
        daterange = [(date_start + timedelta(days=x)).strftime('%Y-%m-%d') for x in range(((date_end-date_start).days)+1)]
    else:
        daterange = [(date_start + timedelta(weeks=x)).strftime('%Y-%m-%d') for x in range(((date_end-date_start).days//7)+1)] # weeks dividing days by 7

    total_df = pd.DataFrame()

    counter = 0

    for date in daterange:
        counter+=1
        print(f"{counter}) Uploading - {date}:", datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
        script = query.format(date = date)
        df_cycle = bigquery_client.query(script).to_dataframe()
        if df_cycle.empty == True:
            print('Dataframe is empty')
        total_df = pd.concat([df_cycle, total_df])
    return total_df

def read_bq(query, project='analytics-dev-333113'):
    client = bigquery.Client(project=project)
    query_job = client.query(query)
    result_df = query_job.to_dataframe()
    return result_df

def display_side_by_side(*args):
    html_str = ''
    for df in args:
        html_str += df.to_html()
    display_html(
        html_str.replace('table','table style="display:inline"'), 
        raw=True
    )

In [2]:
df = read_bq("""
WITH click_on_button AS (SELECT user_id,
                                name,
                                t1.city_id,
                                t2.city_name,
                                t1.country_id,
                                t2.country_name,
                                event_dt_part,
                                client_time_ts
                         FROM indriver-e6e40.ods_event_tracker.event t1
                                  JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                                       ON
                                           t1.city_id = t2.city_id
                         WHERE name = 'city.client.create_order.click'
                           AND event_dt_part BETWEEN '2025-01-01' AND '2025-04-01'
                         QUALIFY ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY client_time_ts ASC) = 1),
     rides AS (SELECT user_id,
                      country_id,
                      metric_date_utc,
               FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
               WHERE 1 = 1
                 AND user_type = 'pass'
                 AND orders_count > 0
                 AND metric_date_utc >= '2024-01-01')
SELECT *,
       SUM(users) OVER (PARTITION BY monthly)                                                     AS total_by_date,
       SUM(users) OVER (PARTITION BY segment, monthly)                                            AS by_segment,
       (SUM(users) OVER (PARTITION BY segment, monthly) / SUM(users) OVER (PARTITION BY monthly)) AS share_of_impact
FROM (SELECT city_name,
             CASE
                 WHEN city_id IN
                      (6587, 4230, 5495, 4272, 4396, 4155, 4825, 5291, 4234, 4404, 5548, 4143, 4198, 4225, 4227, 4255,
                       4197, 4243, 5483, 4518, 4377, 4532, 4521, 4537, 4758, 4163, 4534, 4519, 4375) THEN 'test_cities'
                 ELSE 'other_cities'
                 END                             segment,
             country_name,
             DATE_TRUNC(event_dt_part, MONTH) AS monthly,
             COUNT(DISTINCT t1.user_id)       AS users
      FROM click_on_button t1
               LEFT JOIN rides t2
                         ON t1.user_id = t2.user_id
                             AND t2.metric_date_utc < t1.event_dt_part
      WHERE t2.metric_date_utc IS NULL
      GROUP BY 1, 2, 3, 4)
""")

df.head()

Unnamed: 0,city_name,segment,country_name,monthly,users,total_by_date,by_segment,share_of_impact
0,Barranquilla,test_cities,Colombia,2025-03-01,34126,4320948,491339,0.11
1,Manta,test_cities,Ecuador,2025-03-01,3539,4320948,491339,0.11
2,Rio de Janeiro,test_cities,Brazil,2025-03-01,17488,4320948,491339,0.11
3,Mendoza,test_cities,Argentina,2025-03-01,136,4320948,491339,0.11
4,Chihuahua,test_cities,Mexico,2025-03-01,6116,4320948,491339,0.11


In [6]:
# df.groupby(['segment', 'monthly'], as_index=False)[['users', 'total_by_date', 'by_segment', 'share_of_impact']].agg('max')

df.pivot_table(columns='segment', index=['monthly'], values=['users', 'total_by_date', 'by_segment', 'share_of_impact'], aggfunc='max')

Unnamed: 0_level_0,by_segment,by_segment,share_of_impact,share_of_impact,total_by_date,total_by_date,users,users
segment,other_cities,test_cities,other_cities,test_cities,other_cities,test_cities,other_cities,test_cities
monthly,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2025-01-01,4074837,447771,0.9,0.1,4522608,4522608,265705,75078
2025-02-01,3833586,431664,0.9,0.1,4265250,4265250,257343,76044
2025-03-01,3829609,491339,0.89,0.11,4320948,4320948,235922,87237
2025-04-01,179473,13106,0.93,0.07,192579,192579,15738,2253
