# Packages

In [1]:
# Visualisation
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from pyvis.network import Network

# Data analysis / Data processing
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', None)
pd.options.display.float_format = "{:,.4f}".format
from datetime import time, timedelta, datetime
import numpy as np
import networkx as nx
from collections import defaultdict
import ast

# Maths & Stats
import math 
import scipy.stats as st
from scipy import stats
from scipy.stats import norm
import statsmodels.stats.weightstats as ws
from statsmodels.stats.proportion import test_proportions_2indep
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
# from ambrosia.designer import Designer
# from ambrosia.tester import Tester
import expab
from sklearn.linear_model import Ridge
import random
from sklearn.metrics import mean_absolute_error, mean_squared_error


# System library
import os
import ipywidgets
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
tqdm.pandas()
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_format='retina'
# from itables import init_notebook_mode
# init_notebook_mode(all_interactive=True)
import openpyxl

# Data connection
from google.cloud import bigquery
bigquery_client = bigquery.Client(project='analytics-dev-333113')


# Useful functions
def read_bq(query, project='analytics-dev-333113'):
    """Run a SQL query on BigQuery and return the result as a DataFrame.

    Args:
        query: SQL text to execute.
        project: Google Cloud project ID the BigQuery client is created for.

    Returns:
        The DataFrame produced by the query job's ``to_dataframe()``.
    """
    # A fresh client bound to the requested project is created on every call.
    bq = bigquery.Client(project=project)
    return bq.query(query).to_dataframe()

def display_side_by_side(*args):
    """Render several DataFrames next to each other in the notebook output.

    Each frame is converted with ``to_html()`` and its opening ``<table`` tag
    gets an inline ``display`` style so the tables flow horizontally.

    Args:
        *args: Objects exposing ``to_html()`` (DataFrames), shown left to right.
    """
    # Only the opening tag is patched: replacing the bare word "table" would
    # also rewrite the closing </table> tag and any cell text containing it.
    inline_tables = [
        df.to_html().replace('<table', '<table style="display:inline"', 1)
        for df in args
    ]
    # `display` and `HTML` are the names this file imports from IPython.display;
    # `display_html` was never imported, so the previous call raised NameError.
    display(HTML(''.join(inline_tables)))

def cycle_sql(start, end, query, weeks=False):
    """
    Run ``query`` once per date between ``start`` and ``end`` and stack the results.

    The query text must contain a ``{date}`` placeholder; it is filled with
    ``str.format`` using each date rendered as ``YYYY-MM-DD``.

    Args:
        start: First date as a ``'YYYY-MM-DD'`` string.
        end: Last date as a ``'YYYY-MM-DD'`` string (included when stepping daily).
        query: SQL template containing ``{date}``.
        weeks: When true, step in 7-day increments from ``start`` instead of daily.

    Returns:
        One DataFrame with all per-date results concatenated, latest date
        first. An empty DataFrame when the range contains no dates.
    """
    date_start = datetime.strptime(start, '%Y-%m-%d')
    date_end = datetime.strptime(end, '%Y-%m-%d')
    span_days = (date_end - date_start).days

    if weeks:
        step = timedelta(weeks=1)
        n_steps = span_days // 7 + 1  # whole weeks that fit into the range
    else:
        step = timedelta(days=1)
        n_steps = span_days + 1

    daterange = [(date_start + step * i).strftime('%Y-%m-%d') for i in range(n_steps)]

    # Collect the per-date frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies the accumulated data every iteration.
    frames = []
    for counter, date in enumerate(daterange, start=1):
        print(f"{counter}) Uploading - {date}:", datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
        df_cycle = bigquery_client.query(query.format(date=date)).to_dataframe()
        if df_cycle.empty:
            print('Dataframe is empty')
        frames.append(df_cycle)

    if not frames:
        # pd.concat rejects an empty list; keep returning an empty frame instead.
        return pd.DataFrame()

    # Reversed so the newest date comes first, the order callers already receive.
    return pd.concat(frames[::-1])

def writing_excel(name:str, dataset1=None, dataset2=None, dataset3=None, dataset4=None):
    """Write up to four DataFrames into ``{name}.xlsx``, one sheet per dataset.

    The sheet for the N-th dataset argument is named ``"N-{name}"`` and the
    frame's index is written along with the data. Every dataset that is not
    ``None`` is written, even when an earlier argument was left out.

    Args:
        name: Output file name without the ``.xlsx`` extension; also used as
            the sheet-name suffix.
        dataset1: Optional DataFrame for sheet ``1-{name}``.
        dataset2: Optional DataFrame for sheet ``2-{name}``.
        dataset3: Optional DataFrame for sheet ``3-{name}``.
        dataset4: Optional DataFrame for sheet ``4-{name}``.

    Raises:
        ValueError: If no dataset is given. The check runs before the file is
            opened, so no empty workbook is created.
    """
    sheets = [
        (position, dataset)
        for position, dataset in enumerate((dataset1, dataset2, dataset3, dataset4), start=1)
        if dataset is not None
    ]
    if not sheets:
        raise ValueError("writing_excel needs at least one dataset to write")

    with pd.ExcelWriter(f"{name}.xlsx") as writer:
        for position, dataset in sheets:
            dataset.to_excel(writer, sheet_name=f"{position}-{name}")

    # Reported only after the writer has been closed, i.e. once the file is saved.
    print('DataFrame is written to Excel File successfully.')

# Analysis

In [None]:
# Daily passenger metrics (user_type = 'pass', country_id = 11) for 2025-05-01..2025-06-01
# (`curr`) joined on month_day to the same calendar days of 2024 (`prev`), plus the
# day-over-day change in rides for each year computed with LAG over month_day.
# NOTE(review): the WHERE clauses filter on `metric_date` while SELECT / GROUP BY use
# `metric_date_utc` - confirm both columns are intended.
# NOTE(review): the final SELECT has no ORDER BY, so the returned row order is arbitrary.
df = read_bq("""
WITH curr AS (SELECT metric_date_utc,
                     FORMAT_DATE('%m-%d', metric_date_utc) AS month_day,
                     COUNT(DISTINCT user_id)               AS users,
                     SUM(tenders_count)                    AS tenders,
                     SUM(orders_count)                     AS orders,
                     SUM(rides_count)                      AS rides,
                     SUM(gmv_unclean_usd)                  AS gmv
              FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
              WHERE user_type = 'pass'
                AND metric_date BETWEEN '2025-05-01' AND '2025-06-01'
                AND country_id = 11
              GROUP BY 1, 2),
     prev AS (SELECT metric_date_utc,
                     FORMAT_DATE('%m-%d', metric_date_utc) AS month_day,
                     COUNT(DISTINCT user_id)               AS users,
                     SUM(tenders_count)                    AS tenders,
                     SUM(orders_count)                     AS orders,
                     SUM(rides_count)                      AS rides,
                     SUM(gmv_unclean_usd)                  AS gmv
              FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
              WHERE user_type = 'pass'
                AND metric_date BETWEEN '2024-05-01' AND '2024-06-01'
                AND country_id = 11
              GROUP BY 1, 2)
SELECT *,
       LAG(t1.rides) OVER (ORDER BY t1.month_day)            AS prev_value_25,
       t1.rides - LAG(t1.rides) OVER (ORDER BY t1.month_day) AS diff_25,
       LAG(t2.rides) OVER (ORDER BY t1.month_day)            AS prev_value_24,
       t2.rides - LAG(t2.rides) OVER (ORDER BY t1.month_day) AS diff_24
FROM curr t1
         JOIN prev t2 ON t1.month_day = t2.month_day

""")

# Rides per active user for each year. The `_1`-suffixed columns are the `prev` (2024)
# side of the join, whose names collide with the `curr` columns under SELECT *.
df['avg_by_user_25'] = df['rides']/df['users']
df['avg_by_user_24'] = df['rides_1']/df['users_1']
# Convert to datetime so the `.dt` accessor and date comparisons work in later cells.
df['metric_date_utc'] = pd.to_datetime(df['metric_date_utc'])

df.head()

Unnamed: 0,metric_date_utc,month_day,users,tenders,orders,rides,gmv,metric_date_utc_1,month_day_1,users_1,tenders_1,orders_1,rides_1,gmv_1,prev_value_25,diff_25,prev_value_24,diff_24
0,2025-05-22,05-22,312556,1407698,639226,313479,928370.8567,2024-05-22,05-22,263987,1283644,508419,269147,873546.165,303660,9819,258574,10573
1,2025-05-07,05-07,360572,1462038,782910,347764,1053109.0809,2024-05-07,05-07,332410,1298501,713172,300389,1023121.191,323184,24580,298650,1739
2,2025-06-01,06-01,367086,1229967,837064,303322,1069864.6523,2024-06-01,06-01,319705,1410746,648960,303606,1041606.5989,354041,-50719,278529,25077
3,2025-05-19,05-19,338232,1326135,717157,309870,978494.4474,2024-05-19,05-19,327047,1279058,698040,276976,1092195.9773,298250,11620,326066,-49090
4,2025-05-26,05-26,328501,1332975,691046,307362,954606.0724,2024-05-26,05-26,310897,1254379,661036,263305,1001023.3056,298002,9360,304316,-41011


In [22]:
# Week bucket for every row: the timestamp at the start of its weekly ("W") period.
df["Weekly"] = (
    df["metric_date_utc"]
    .dt.to_period("W")
    .dt.to_timestamp()
)

In [26]:
# Weekly averages of the daily metrics, 2025 columns next to their 2024 (`_1` / `_24`)
# counterparts, restricted to rows dated up to 2025-06-01.
(
    df[df['metric_date_utc'] <= '2025-06-01']
    .groupby(['Weekly'], as_index=False)[
        [
            'rides', 'rides_1',
            'gmv', 'gmv_1',
            'users', 'users_1',
            'diff_25', 'diff_24',
            'avg_by_user_25', 'avg_by_user_24',
        ]
    ]
    .mean()
)

Unnamed: 0,Weekly,rides,rides_1,gmv,gmv_1,users,users_1,diff_25,diff_24,avg_by_user_25,avg_by_user_24
0,2025-04-28,301206.0,290972.25,989437.0131,1013801.8872,324524.25,320280.5,23036.3333,41213.3333,0.9265,0.9052
1,2025-05-05,361322.2857,315440.2857,1189408.041,1112151.0335,407697.7143,362987.2857,6823.1429,2699.2857,0.8964,0.8701
2,2025-05-12,315694.5714,290776.1429,1013996.2133,1037190.6035,341328.0,337800.2857,-7557.4286,-4170.8571,0.9282,0.8853
3,2025-05-19,314345.1429,279668.1429,980142.0559,965280.0295,327244.7143,294416.0,-35.4286,-3107.1429,0.9624,0.9537
4,2025-05-26,313967.0,268182.1429,980627.1065,919609.2565,332343.8571,291510.7143,760.0,-101.4286,0.951,0.9217


In [32]:
# Scratch calculation: 3387 expressed as a percentage of 315694.
# NOTE(review): 315694 looks like the (truncated) weekly mean of `rides` for the week of
# 2025-05-12 in the table above; the origin of 3387 is not shown here - confirm.
3387/315694*100

1.072874365683225

In [30]:
# Daily rides: 2025 (`rides`) against the matching 2024 days (`rides_1`).
fig = px.line(
    (
        df[df['metric_date_utc'] <= '2025-06-01']
        .groupby(['metric_date_utc'], as_index=False)[
            [
                'rides', 'rides_1',
                'users', 'users_1',
                'diff_25', 'diff_24',
                'avg_by_user_25', 'avg_by_user_24',
            ]
        ]
        .sum()
    ),
    x="metric_date_utc",
    y=['rides', 'rides_1'],
)
fig.show()

In [None]:
# Per order-creation date (from 2025-05-01 up to the current date): distinct passengers
# that have at least one order with no driver accept (MIN(driveraccept_timestamp) IS NULL).
# The inner query aggregates per (passenger, date, order); `tenders` counts the non-NULL
# tender_sk rows of that order.
# NOTE(review): `with_tenders` is computed with `tenders = 0` (orders that have NO tenders),
# while `wo_tenders` carries no tender condition at all - the aliases look mislabelled
# relative to their conditions; confirm the intended definitions before reading the chart.
df_1 = read_bq("""
WITH cte AS (SELECT order_uuid,
                    user_id    AS pass_id,
                    driver_id,
                    city_id    AS order_city_id,
                    country_id AS order_country_id,
                    status_order,
                    tender_sk,
                    order_timestamp,
                    at_pickup_dttm,
                    departed_pickup_dttm,
                    at_destination_dttm,
                    departed_destination_dttm,
                    driveraccept_timestamp,
                    driverarrived_timestamp,
                    driverstarttheride_timestamp,
                    driverdone_timestamp,
                    clientdone_timestamp,
                    clientcancel_timestamp,
                    drivercancel_timestamp,
                    user_reg_date,
                    driver_reg_date,
                    stage,
                    created_date_order_part,
                    duration_in_seconds
             FROM indriver-e6e40.imart.incity_detail_new_order
             WHERE created_date_order_part BETWEEN '2025-05-01'
                       AND CURRENT_DATE())
SELECT created_date_order_part,
       COUNT(DISTINCT IF(min_accept IS NULL AND tenders = 0, pass_id, NULL)) AS with_tenders,
       COUNT(DISTINCT IF(min_accept IS NULL, pass_id, NULL))                 AS wo_tenders
FROM (SELECT pass_id,
             created_date_order_part,
             order_uuid,
             MIN(driveraccept_timestamp) AS min_accept,
             COUNT(tender_sk)            AS tenders
      FROM cte
      GROUP BY 1, 2, 3)
GROUP BY 1
""")

# Convert to datetime so the date filter in the plotting cell compares dates.
df_1['created_date_order_part'] = pd.to_datetime(df_1['created_date_order_part'])

df_1.head()

Unnamed: 0,created_date_order_part,with_tenders,wo_tenders
0,2025-05-26,1292171,2302594
1,2025-05-12,1530616,2567501
2,2025-05-21,882791,1846573
3,2025-05-18,1403606,2489601
4,2025-06-03,1370517,2422759


In [19]:
# Daily passenger counts from df_1 (`with_tenders` vs `wo_tenders`), up to 2025-06-01.
fig = px.line(
    (
        df_1[df_1['created_date_order_part'] <= '2025-06-01']
        .groupby(['created_date_order_part'], as_index=False)[['with_tenders', 'wo_tenders']]
        .sum()
    ),
    x="created_date_order_part",
    y=['with_tenders', 'wo_tenders'],
)
fig.show()