# Packages

In [2]:
# Visualisation
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from pyvis.network import Network

# Data analysis / Data processing
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', None)
pd.options.display.float_format = "{:,.2f}".format
from datetime import time, timedelta, datetime
import numpy as np
import networkx as nx
from collections import defaultdict
import ast

# Maths & Stats
import math 
import scipy.stats as st
from scipy import stats
from scipy.stats import norm
import statsmodels.stats.weightstats as ws
from statsmodels.stats.proportion import test_proportions_2indep
import AB_library
# from ambrosia.designer import Designer
# from ambrosia.tester import Tester
import expab
from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportion_effectsize
from math import ceil

# System library
import os
import ipywidgets
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
tqdm.pandas()
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_format='retina'
# from itables import init_notebook_mode
# init_notebook_mode(all_interactive=True)
import openpyxl

# Data connection
from google.cloud import bigquery
bigquery_client = bigquery.Client(project='analytics-dev-333113')


# Useful functions
def read_bq(query, project='analytics-dev-333113'):
    """Run ``query`` on BigQuery under ``project`` and return the rows as a DataFrame.

    A new ``bigquery.Client`` is created on every call.
    """
    return bigquery.Client(project=project).query(query).to_dataframe()

def display_side_by_side(*args):
    """Render several DataFrames next to each other in the notebook output.

    Args:
        *args: objects exposing ``to_html()`` (e.g. pandas DataFrames); they are
            rendered left to right in the order given.
    """
    # Imported locally: the file-level import only brings in `display` and `HTML`,
    # so the bare name `display_html` used here would otherwise raise NameError.
    from IPython.display import display_html

    html_str = ''.join(df.to_html() for df in args)
    display_html(
        # Only touch opening tags; replacing the bare word 'table' would also
        # rewrite closing tags and any cell text that contains "table".
        html_str.replace('<table', '<table style="display:inline"'),
        raw=True
    )

def cycle_sql(start, end, query, weeks=False):
    """
    Run ``query`` once per date between ``start`` and ``end`` and stack the results.

    The query text must contain a ``{date}`` placeholder; it is filled with each
    date of the cycle (formatted ``YYYY-MM-DD``) through ``str.format``.

    Args:
        start: first date of the cycle, ``'%Y-%m-%d'`` string (inclusive).
        end: last date of the cycle, ``'%Y-%m-%d'`` string (inclusive).
        query: SQL text with a ``{date}`` placeholder.
        weeks: if falsy, step one day at a time; otherwise step seven days at a
            time starting from ``start``.

    Returns:
        A DataFrame with the rows of every run; results of later dates come
        first. An empty DataFrame is returned when ``end`` precedes ``start``.
    """
    date_start = datetime.strptime(start, '%Y-%m-%d')
    date_end = datetime.strptime(end, '%Y-%m-%d')
    span_days = (date_end - date_start).days

    if weeks:
        step, steps = timedelta(weeks=1), span_days // 7 + 1  # whole weeks that fit in the span
    else:
        step, steps = timedelta(days=1), span_days + 1

    daterange = [(date_start + step * x).strftime('%Y-%m-%d') for x in range(steps)]

    # Collect the per-date frames and concatenate once at the end instead of
    # re-copying the growing result on every iteration.
    frames = []
    for counter, date in enumerate(daterange, start=1):
        print(f"{counter}) Uploading - {date}:", datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
        df_cycle = bigquery_client.query(query.format(date=date)).to_dataframe()
        if df_cycle.empty:
            print('Dataframe is empty')
        frames.append(df_cycle)

    if not frames:
        # pd.concat refuses an empty list; an empty cycle yields an empty frame.
        return pd.DataFrame()

    # Newest date first, matching the order callers have always received.
    return pd.concat(frames[::-1])

def writing_excel(name:str, dataset1=None, dataset2=None, dataset3=None, dataset4=None):
    """Write up to four DataFrames into ``{name}.xlsx``, one sheet per dataset.

    Every dataset that is not ``None`` goes to a sheet named
    ``"<position>-<name>"`` where position is 1..4 according to the parameter it
    was passed in. The DataFrame index is written too (``to_excel`` default).

    Args:
        name: file name without extension; also used as the sheet-name suffix.
        dataset1, dataset2, dataset3, dataset4: objects exposing ``to_excel``
            (e.g. pandas DataFrames) or ``None`` to skip that slot.

    Raises:
        ValueError: if every dataset is ``None``; raised before any file is created.
    """
    sheets = {
        f"{position}-{name}": dataset
        for position, dataset in enumerate((dataset1, dataset2, dataset3, dataset4), start=1)
        if dataset is not None
    }
    if not sheets:
        # Fail early with a clear message instead of opening a workbook with no sheets.
        raise ValueError("writing_excel needs at least one dataset to write")

    with pd.ExcelWriter(f"{name}.xlsx") as writer:
        for sheet_name, dataset in sheets.items():
            dataset.to_excel(writer, sheet_name=sheet_name)

    # Printed after the writer is closed, i.e. after the file has been saved.
    print('DataFrame is written to Excel File successfully.') 

# Data preparation

In [None]:
# BigQuery SQL for the vehicle ('auto') export: joins garage vehicles with their
# WD1 / WD2 car-photo documents and keeps rows with driver_status = 'approved',
# a non-null photo and type_id = 1.
# The region is hard-coded in the last filter (region_nm = 'saea1-latam-co').
# The inline SQL comments are in Russian; they are part of the query text sent
# to BigQuery, so they are left exactly as written.
# NOTE(review): this string is not referenced in the visible part of the notebook.
garage = """
WITH wd1_doc AS (SELECT *
                 FROM (SELECT dd.user_id,
                              dd.request_id,
                              title,
                              value
                       FROM `dwh-storage-327422.ods_watchdocs.document_data` AS dd
                                LEFT JOIN indriver-e6e40.ods_watchdocs.document_object AS do
                                          ON dd.document_object_id = do.id
                       WHERE 1 = 1
                         AND TRIM(do.title) IN (
                           'MSG_PHOTO_OF_YOUR_CAR' --  тут можно будет докинуть еще вариации, если они есть
                           )
                         AND dd.value != ''
                         AND dd.value IS NOT NULL
                       QUALIFY
                           ROW_NUMBER() OVER (PARTITION BY dd.user_id, dd.request_id, do.title ORDER BY CASE WHEN CAST(do.is_required AS STRING) = 'true' THEN 1 ELSE 0 END DESC,
                               CASE WHEN dd.status = 10 THEN 1 ELSE 0 END DESC,
                               COALESCE(dd.update_date, dd.create_date, TIMESTAMP("1900-01-01 00:00:00+00")) DESC,
                               COALESCE(dd.create_date, TIMESTAMP("1900-01-01 00:00:00+00")) DESC, dd.id DESC) = 1) AS q
                     PIVOT (ANY_VALUE(value) FOR title IN ('MSG_PHOTO_OF_YOUR_CAR')) -- сюда тоже их докинуть, чтобы развернуть в горизонтальную структуру
),
     wd1_req AS ( -- заявки вд1
         SELECT user_id,
                id,
                CASE WHEN state IN (20, 40) THEN 'approved' ELSE 'declined' END AS state -- упростил статусную модель
         FROM dwh-storage-327422.ods_watchdocs.user_request),
     profile AS ( -- профили вд2
         SELECT *
         FROM (SELECT dpg.*,
                      CAST(CONCAT('0x', SUBSTR(dpg.user_id, 25, 12)) AS INT64)   AS driver_id,
                      LOWER(REPLACE(dpg.status, 'DOCUMENT_PROFILE_STATUS_', '')) AS new_status,
                      nos.source_id
               FROM `indriver-e6e40.ods_watchdocs.document_profile_act_global` AS dpg
                        INNER JOIN `indriver-e6e40.mdict.new_order_source_nm_to_source_id` AS nos
                                   ON dpg.source_nm = nos.source_nm
                                       AND nos.table_nm = 'indriver-e6e40.ods_watchdocs.document_profile_global'
               WHERE 1 = 1) AS q
         WHERE 1 = 1
           AND q.driver_id NOT IN (259949038) -- тестовый юзер с слов бэка
         QUALIFY DENSE_RANK() OVER (PARTITION BY driver_id, service_id ORDER BY IF(source_id = 11, 0, 1) DESC) = 1)
        ,
     doc_profile AS ( -- документы профилей вд2
         SELECT *
         FROM (SELECT dpdg.*,
                      REPLACE(dpdg.status, 'DOCUMENT_PROFILE_DOCUMENT_STATUS_', '') AS new_status,
                      nos.source_id
               FROM `indriver-e6e40.ods_watchdocs.document_profile_documents_act_global` AS dpdg
                        INNER JOIN indriver-e6e40.mdict.new_order_source_nm_to_source_id AS nos
                                   ON dpdg.source_nm = nos.source_nm
                                       AND nos.table_nm =
                                           'indriver-e6e40.ods_watchdocs.document_profile_documents_global') AS q
         WHERE 1 = 1),
     wd2_doc AS ( -- документы вд2 со значениями
         SELECT document_id,
                value,
                document_kind_nm,
                field_kind_nm,
                nos.source_id
         FROM dwh-storage-327422.ods_watchdocs.wd2_document_data_act_global ddg
                  JOIN indriver-e6e40.mdict.new_order_source_nm_to_source_id AS nos ON nos.source_nm = ddg.source_nm
             AND nos.table_nm = 'dwh-storage-327422.ods_watchdocs.wd2_document_data_global'
         WHERE 1 = 1
           AND document_kind_nm = 'Vehicle information'
           AND field_kind_nm = 'Vehicle picture'),
     wd2_all AS ( -- собираем доки и профили в 1 месте
         SELECT *
         FROM (SELECT dp.document_id,
                      p.new_status      AS driver_status, -- здесь для вд2 статусы не упрощал, вывел что есть, они хотя бы строкой идут, а не цифрами
                      p.source_id,
                      p.updated_at      AS profile_update,
                      p.source_nm,
                      CASE
                          WHEN p.service_name LIKE '%appcity%' THEN 'APPLICATION_VERTICAL_CITY'
                          WHEN p.service_name LIKE '%courier%' THEN 'APPLICATION_VERTICAL_COURIER'
                          ELSE NULL END AS service_name,  -- это для упрощения, чтобы джойнить по вертикали, которая пишется в гараж, пока вертикалей в вд2 только 2, потом будет больше
                      wdd.value
               FROM profile p
                        LEFT JOIN doc_profile dp ON p.id = dp.document_profile_id
                   AND p.source_id = dp.source_id
                        LEFT JOIN wd2_doc wdd ON wdd.document_id = dp.document_id
                   AND wdd.source_id = dp.source_id)
         QUALIFY ROW_NUMBER() OVER (PARTITION BY document_id, service_name, source_nm ORDER BY profile_update DESC) =
                 1 -- один документ может быть на куче профилей, пришлось схлопывать до последнего обновленного (это не прям супер точная выборка получается, но в целом думаю ок)
     )
SELECT *
FROM (SELECT 'auto'                                                      AS import_type,
             driver_id                                                   AS user_id,
             uuid                                                        AS external_id,
             JSON_OBJECT("source_id", COALESCE(CAST(document_id AS string), CAST(request_id AS string), source_id),
                         "file_storage_id",
                         photo_car, "color", color, "model", model_name) AS metadata,
             COALESCE(t1.country_id, t2.country_id)                      AS country_id,
             agglomeration_id,
             COALESCE(t1.city_id, t2.city_id)                            AS city_id
      FROM (SELECT DISTINCT g.*,
                            wd1.request_id,
                            wd2.document_id,
                            COALESCE(wd2.value, wd1.MSG_PHOTO_OF_YOUR_CAR, wd2_mx.value, wd2_co.value,
                                     wd2_lt.value)         AS photo_car,
                            COALESCE(wd2.driver_status, wd1r.state, wd2_mx.driver_status, wd2_co.driver_status,
                                     wd2_lt.driver_status) AS driver_status
            FROM `indriver-e6e40.ods_garage.garage_vehicles_global` g
                     JOIN indriver-e6e40.mdict.new_order_source_nm_to_source_id AS nos
                          ON nos.source_nm = g.source_nm -- это нужно для того чтобы джойнить данные в нужных кластерах
                              AND nos.table_nm = 'indriver-e6e40.ods_garage.garage_vehicles_global'
                     LEFT JOIN wd1_doc wd1 ON g.source_id = CAST(wd1.request_id AS string)
                     LEFT JOIN wd1_req wd1r
                               ON CAST(wd1r.id AS string) = g.source_id -- это source_id который с гаража как документ
                     LEFT JOIN wd2_all wd2 ON wd2.document_id = g.source_id
                AND wd2.source_id = nos.source_id -- а этот для матчинга кластера (сори за путаницу)
                AND wd2.service_name = g.application_vertical

-- из-за прошлых миграций придется делать вот такой костыль ниже для 3 кластеров
                     LEFT JOIN wd2_all wd2_mx ON wd2_mx.document_id = g.source_id
                AND wd2_mx.source_nm =
                    'projects/709531919733/subscriptions/usea1-latam-mx.wd2-application.document-profile.v1.prod.bq_push'
                AND g.source_nm = 'projects/709531919733/subscriptions/euce1-cis.garage.vehicles.v1.prod.bq'
                AND wd2_mx.service_name = g.application_vertical
                     LEFT JOIN wd2_all wd2_co ON wd2_co.document_id = g.source_id
                AND wd2_co.source_nm =
                    'projects/709531919733/subscriptions/saea1-latam-co.wd2-application.document-profile.v1.prod.bq_push'
                AND g.source_nm = 'projects/709531919733/subscriptions/euce1-cis.garage.vehicles.v1.prod.bq'
                AND wd2_co.service_name = g.application_vertical
                     LEFT JOIN wd2_all wd2_lt ON wd2_lt.document_id = g.source_id
                AND wd2_lt.source_nm =
                    'projects/709531919733/subscriptions/saea1-latam.wd2-application.document-profile.v1.prod.bq_push'
                AND g.source_nm = 'projects/709531919733/subscriptions/euce1-cis.garage.vehicles.v1.prod.bq'
                AND wd2_lt.service_name = g.application_vertical) t1
               LEFT JOIN dwh-storage-327422.personal_data.tbl_user_act t2 ON t1.driver_id = t2.id
      WHERE driver_status = 'approved'
        AND photo_car IS NOT NULL
        AND type_id = 1
        AND t1.country_id IN (SELECT DISTINCT (cr.country_id)
                              FROM `dwh-storage-327422.emart_cdp.dim_cities` cr
                              WHERE cr.region_nm = 'saea1-latam-co')) -- choosing the region
"""


# WD1 contractor-photo export query (BigQuery SQL template).
# Fill the `{region}` placeholder before running: query_wd1.format(region='cis').
# Per driver it takes the latest accepted request, keeps its 'PHOTO' document
# whose value is not an https:// URL, and excludes users already present in
# ods_media_checker.entity with entity_type = 'contractor'.
# Fixed here: a dangling `AND` and the missing `)` that closes the `grouped` CTE
# made the previous text invalid SQL.
query_wd1 = """
WITH main_req AS
         (SELECT COALESCE(l.user_id, f.user_id)         AS driver_id,
                 COALESCE(l.user_request_id, f.id)      AS request_id,
                 COALESCE(l.create_date, f.update_date) AS cr_up_date,
                 f.country_id,
                 f.city_id
          FROM indriver-e6e40.ods_watchdocs.user_request_log l
                   FULL JOIN dwh-storage-327422.ods_watchdocs.user_request f ON l.user_request_id = f.id
          WHERE (l.action = 'accept' OR f.state IN (20, 40)) -- if there is a row with accepted record and 20,40 - request is approved
          QUALIFY
              ROW_NUMBER() OVER (PARTITION BY COALESCE(l.user_id, f.user_id) ORDER BY COALESCE(l.create_date, f.update_date) DESC) =
              1),
     grouped AS (SELECT main_req.*,
                        dd.document_object_id,
                        do.title,
                        dd.value,
                        COALESCE(dd.update_date, dd.create_date) AS doc_update_date
                 FROM main_req
                          JOIN dwh-storage-327422.watchdocs.document_data dd ON main_req.request_id = dd.request_id
                          JOIN indriver-e6e40.watchdocs.document_object do ON dd.document_object_id = do.id
                 WHERE do.title = 'PHOTO'
                   AND NOT dd.value LIKE 'https://%'
                   AND main_req.country_id IN (SELECT DISTINCT(cr.country_id)
                                               FROM `dwh-storage-327422.emart_cdp.dim_cities` cr
                                               WHERE cr.region_nm LIKE '%{region}%')),
     res AS (SELECT 'contractor'                                                   AS import_type,
                    driver_id                                                      AS user_id,
                    driver_id                                                      AS external_id,
                    JSON_OBJECT('source_id', request_id, 'file_storage_id', VALUE) AS metadata,
                    country_id                                                     AS country_id,
                    0                                                              AS agglomeration_id,
                    city_id                                                        AS city_id
             FROM grouped),
     mc AS (SELECT user_id
            FROM `indriver-e6e40.ods_media_checker.entity` AS e
            WHERE source_nm LIKE '%{region}%'
              AND e.entity_type = 'contractor')
SELECT *
FROM res
WHERE res.user_id NOT IN (SELECT user_id FROM mc)
"""

# WD2 contractor export query (BigQuery SQL template).
# Fill the `{region}` placeholder before running: query_wd2.format(region=...).
# Per user_uuid it keeps the most recently updated row with the given
# field_kind_uuid, keeps only rows updated at or after '2025-06-17 05:50:17',
# and excludes users already present in ods_media_checker.entity with
# entity_type = 'contractor'.
# NOTE(review): this string is not referenced in the visible part of the notebook.
query_wd2 = """
WITH raw AS (SELECT dd.document_id                                                                  AS source_id,
                    CAST(`indriver-e6e40.de_functions.convert_uuid_to_id`(dd.user_uuid) AS INTEGER) AS user_id,
                    dd.value                                                                        AS file_storage_id,
                    d.country_id                                                                    AS country_id,
                    d.state_id                                                                      AS agglomeration_id,
                    d.city_id                                                                       AS city_id,
                    dd.updated_at                                                                   AS updated_at
             FROM `dwh-storage-327422.ods_watchdocs.wd2_document_data_global` dd
                      INNER JOIN `indriver-e6e40.ods_watchdocs.document_global` d ON dd.document_id = d.id
             WHERE dd.field_kind_uuid = '5099bd41-2d67-11ef-a465-da0741c8f290'
               AND dd.source_nm like '%{region}%'
             QUALIFY ROW_NUMBER() OVER (PARTITION BY dd.user_uuid ORDER BY dd.updated_at DESC) = 1),
     res AS (SELECT 'contractor'                                                                AS import_type,
                    r.user_id                                                                   AS user_id,
                    r.user_id                                                                   AS external_id,
                    JSON_OBJECT("source_id", r.source_id, "file_storage_id", r.file_storage_id) AS metadata,
                    r.country_id                                                                AS country_id,
                    r.agglomeration_id                                                          AS agglomeration_id,
                    r.city_id                                                                   AS city_id
             FROM raw r
             WHERE r.updated_at >= '2025-06-17 05:50:17'
             ORDER BY r.user_id),
     mc AS (SELECT user_id
            FROM `indriver-e6e40.ods_media_checker.entity` AS e
            WHERE source_nm like '%{region}%'
              AND e.entity_type = 'contractor')
SELECT res.*
FROM res
WHERE res.user_id NOT IN (SELECT user_id FROM mc)

"""

In [6]:
import pandas as pd
import numpy as np
import os

def read_bq(query, project='analytics-dev-333113'):
    """Execute a SQL string in BigQuery and hand back the result as a pandas DataFrame.

    Each call builds its own ``bigquery.Client`` for the given ``project``.
    """
    job = bigquery.Client(project=project).query(query)
    return job.to_dataframe()

# Region keys; each one is substituted into the LIKE '%{region}%' filters of the
# query below and used as the output sub-folder / file prefix.
sources = [
    'cis', 'africa', 
    'sea', 'sa', 'sa_in', 
    'eu', 'mena', 'mena_eg', 
    'latam', 'latam_br', 
    'latam_pe', 'latam_mx', 'latam_co', 'eu_central1'
    ]

# Upper bound on the number of rows written to a single CSV file.
MAX_ROWS_PER_FILE = 9000


for region in sources:

    print(f"Uploading the {region}")

    # NOTE(review): the filters are substring matches (LIKE '%...%'), so a short
    # key such as 'sa' or 'latam' also matches any longer region name that
    # contains it ('sa_in', 'latam_co', ...), and '_' is itself a single-character
    # wildcard in LIKE. Confirm the overlap between regions is intended.
    df_code = read_bq(f"""
WITH main_req AS
         (SELECT COALESCE(l.user_id, f.user_id)         AS driver_id,
                 COALESCE(l.user_request_id, f.id)      AS request_id,
                 COALESCE(l.create_date, f.update_date) AS cr_up_date,
                 f.country_id,
                 f.city_id
          FROM indriver-e6e40.ods_watchdocs.user_request_log l
                   FULL JOIN dwh-storage-327422.ods_watchdocs.user_request f ON l.user_request_id = f.id
          WHERE (l.action = 'accept' OR f.state IN (20, 40)) -- if there is a row with accepted record and 20,40 - request is approved
          QUALIFY
              ROW_NUMBER() OVER (PARTITION BY COALESCE(l.user_id, f.user_id) ORDER BY COALESCE(l.create_date, f.update_date) DESC) =
              1),
     grouped AS (SELECT main_req.*,
                        dd.document_object_id,
                        do.title,
                        dd.value,
                        COALESCE(dd.update_date, dd.create_date) AS doc_update_date
                 FROM main_req
                          JOIN dwh-storage-327422.watchdocs.document_data dd ON main_req.request_id = dd.request_id
                          JOIN indriver-e6e40.watchdocs.document_object do ON dd.document_object_id = do.id
                 WHERE do.title = 'PHOTO'
                   AND NOT dd.value LIKE 'https://%'
                   AND main_req.country_id IN (SELECT DISTINCT(cr.country_id)
                                               FROM `dwh-storage-327422.emart_cdp.dim_cities` cr
                                               WHERE cr.region_nm LIKE '%{region}%')),
     res AS (SELECT 'contractor'                                                   AS import_type,
                    driver_id                                                      AS user_id,
                    driver_id                                                      AS external_id,
                    JSON_OBJECT('source_id', request_id, 'file_storage_id', VALUE) AS metadata,
                    country_id                                                     AS country_id,
                    0                                                              AS agglomeration_id,
                    city_id                                                        AS city_id
             FROM grouped),
     mc AS (SELECT user_id
            FROM `indriver-e6e40.ods_media_checker.entity` AS e
            WHERE source_nm LIKE '%{region}%'
              AND e.entity_type = 'contractor')
SELECT *
FROM res
WHERE res.user_id NOT IN (SELECT user_id FROM mc)
    """)

    print(df_code.shape)

    # metadata is cast to plain strings before de-duplication
    # (presumably because the JSON values are not hashable - TODO confirm).
    df_code['metadata'] = df_code['metadata'].astype('str')
    df_code = df_code.drop_duplicates()
    folder_path = f'/Users/renatyunison/Library/CloudStorage/GoogleDrive-renat.yunisov@indriver.com/My Drive/WD1_migration_to_mediachecker_lost_users/{region}'
    # folder_path = f'/Users/renatyunison/Library/CloudStorage/GoogleDrive-renat.yunisov@indriver.com/My Drive/WD2_migration_to_mediachecker_lost_users/{region}'

    if df_code.empty:
        # Explicit guard for "nothing to export". It replaces a try/except ValueError
        # around the whole export, which also hid unrelated ValueErrors.
        print(f"No rows for {region}, nothing to save")
        continue

    os.makedirs(folder_path, exist_ok=True)

    # Split row positions (not the DataFrame itself) into near-equal parts:
    # same chunk sizes as before, without np.array_split's DataFrame code path.
    n_chunks = math.ceil(len(df_code) / MAX_ROWS_PER_FILE)
    for i, positions in enumerate(np.array_split(np.arange(len(df_code)), n_chunks)):
        file_path = os.path.join(folder_path, f'{region}-{i}.csv')
        df_code.iloc[positions].to_csv(file_path, index=False)

    print(f"Files were saved to: {folder_path}")


Uploading the cis
(16096, 7)
Files were saved to: /Users/renatyunison/Library/CloudStorage/GoogleDrive-renat.yunisov@indriver.com/My Drive/WD1_migration_to_mediachecker_lost_users/cis
Uploading the africa
(11165, 7)
Files were saved to: /Users/renatyunison/Library/CloudStorage/GoogleDrive-renat.yunisov@indriver.com/My Drive/WD1_migration_to_mediachecker_lost_users/africa
Uploading the sea
(428830, 7)
Files were saved to: /Users/renatyunison/Library/CloudStorage/GoogleDrive-renat.yunisov@indriver.com/My Drive/WD1_migration_to_mediachecker_lost_users/sea
Uploading the sa
(258328, 7)
Files were saved to: /Users/renatyunison/Library/CloudStorage/GoogleDrive-renat.yunisov@indriver.com/My Drive/WD1_migration_to_mediachecker_lost_users/sa
Uploading the sa_in
(29867, 7)
Files were saved to: /Users/renatyunison/Library/CloudStorage/GoogleDrive-renat.yunisov@indriver.com/My Drive/WD1_migration_to_mediachecker_lost_users/sa_in
Uploading the eu
(15266, 7)
Files were saved to: /Users/renatyunison/L