In [1]:
# Visualisation
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from pyvis.network import Network

# Data analysis / Data processing
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', None)
pd.options.display.float_format = "{:,.2f}".format
from datetime import time, timedelta, datetime
import numpy as np
import networkx as nx
from collections import defaultdict
import ast

# Maths & Stats
import math 
import scipy.stats as st
from scipy import stats
from scipy.stats import norm
import statsmodels.stats.weightstats as ws
from statsmodels.stats.proportion import test_proportions_2indep
import AB_library

# System library
import os
import ipywidgets
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
tqdm.pandas()
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_format='retina'
# from itables import init_notebook_mode
# init_notebook_mode(all_interactive=True)
import openpyxl

# Data connection
from google.cloud import bigquery
bigquery_client = bigquery.Client(project='analytics-dev-333113')


# Useful functions
def read_bq(query, project='analytics-dev-333113'):
    """Run a SQL query on BigQuery and return the result as a DataFrame.

    Parameters
    ----------
    query : str
        SQL text passed to ``bigquery.Client.query``.
    project : str
        Project the BigQuery client is created for.

    Returns
    -------
    The object returned by ``QueryJob.to_dataframe()``.
    """
    # A fresh client is built on every call, bound to the requested project.
    bq = bigquery.Client(project=project)
    return bq.query(query).to_dataframe()

def display_side_by_side(*args):
    """Render several DataFrames next to each other in one output area.

    Parameters
    ----------
    *args
        Objects exposing ``to_html()`` (DataFrames in this notebook). Their
        HTML tables are concatenated in the order given.

    Returns
    -------
    None. The combined HTML is sent to the notebook output.
    """
    html_str = ''.join(df.to_html() for df in args)
    # Style only the opening <table ...> tags. Replacing every occurrence of
    # the bare word 'table' would also corrupt closing </table> tags and any
    # cell value that happens to contain the text "table".
    inline_html = html_str.replace('<table', '<table style="display:inline"')
    # `display` and `HTML` are the names this notebook imports from
    # IPython.display; `display_html` was never imported.
    display(HTML(inline_html))

def writing_excel(name:str, dataset1=None, dataset2=None, dataset3=None, dataset4=None):
    """Write up to four datasets to ``{name}.xlsx``, one sheet per dataset.

    Every dataset that is not None is written to a sheet called
    ``"<slot>-<name>"``, where ``<slot>`` is the position of the argument
    (``dataset1`` -> ``"1-<name>"`` ... ``dataset4`` -> ``"4-<name>"``).
    The index is written too, since ``to_excel`` is called with its default
    ``index`` setting.

    Parameters
    ----------
    name : str
        Output file name without the ``.xlsx`` extension; also used as the
        suffix of every sheet name.
    dataset1, dataset2, dataset3, dataset4 : optional
        Objects exposing ``to_excel(writer, sheet_name=...)`` (DataFrames in
        this notebook). Slots left as None are skipped, wherever they occur.

    Raises
    ------
    ValueError
        If all four datasets are None, so there is nothing to write. The
        check runs before the file is created.
    """
    numbered = enumerate((dataset1, dataset2, dataset3, dataset4), start=1)
    to_write = [(slot, data) for slot, data in numbered if data is not None]

    if not to_write:
        # Fail before opening the writer: a workbook without sheets cannot be
        # saved, and reporting success would be misleading.
        raise ValueError("writing_excel needs at least one dataset that is not None")

    with pd.ExcelWriter(f"{name}.xlsx") as writer:
        for slot, data in to_write:
            data.to_excel(writer, sheet_name=f"{slot}-{name}")

    # Printed after the `with` block so it is shown only once the workbook
    # has actually been saved.
    print('DataFrame is written to Excel File successfully.')

In [2]:
# Query summary:
#   * facechecker_status - for each user_id, the most recently modified
#     'facechecker' event among types TYPE_AUTO / TYPE_AVATAR (one row per
#     user, across both types).
#   * wd_status - for each (user_id, transport_name), the latest watchdocs
#     request row together with the state that preceded it. `statuses` is
#     'Approved' when that latest state is 'Approved', or when it is 'New'
#     right after 'Approved' for wd_version 'wd1'.
#   * Final SELECT - joins both on user_id and keeps rows where the documents
#     count as approved while the facechecker status is STATUS_DECLINED;
#     phc_type encodes the type as 1 (TYPE_AUTO) or 2 (TYPE_AVATAR).
PHC_CONSISTENCY_QUERY = """
    WITH facechecker_status AS (SELECT user_id,
                                    type,
                                    status,
                                    modified
                                FROM dwh-storage-327422.photocontrol.events
                                WHERE 1 = 1
                                AND SOURCE = 'facechecker'
                                AND type IN ('TYPE_AUTO', 'TYPE_AVATAR')
                                QUALIFY ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY modified DESC) = 1),
        wd_status AS (SELECT user_id,
                            state,
                            transport_name,
                            previous_state,
                            CASE
                                WHEN state = 'New' AND previous_state = 'Approved' AND wd_version = 'wd1' THEN 'Approved'
                                WHEN state = 'Approved' THEN 'Approved'
                                ELSE NULL
                                END statuses,
                            event_date_utc_dttm,
                            create_date_utc_dttm
                    FROM (SELECT user_id,
                                    state,
                                    transport_name,
                                    wd_version,
                                    LAG(state)
                                        OVER (PARTITION BY user_id, transport_name ORDER BY event_date_utc_dttm) AS previous_state,
                                    event_date_utc_dttm,
                                    create_date_utc_dttm,
                            FROM indriver-bi.watchdocs.tbl_watchdocs_unified_requests_detail
                            QUALIFY
                                ROW_NUMBER() OVER (PARTITION BY user_id, transport_name ORDER BY event_date_utc_dttm DESC) =
                                1))
    SELECT *,
        CASE
            WHEN type = 'TYPE_AVATAR' THEN 2
            WHEN type = 'TYPE_AUTO' THEN 1
            ELSE NULL
            END phc_type
    FROM wd_status t1
            JOIN facechecker_status t2
                ON t1.user_id = t2.user_id
    WHERE statuses = 'Approved'
    AND status = 'STATUS_DECLINED'
"""

df = read_bq(PHC_CONSISTENCY_QUERY)

# Preview the first rows of the result.
df.head()

Unnamed: 0,user_id,state,transport_name,previous_state,statuses,event_date_utc_dttm,create_date_utc_dttm,user_id_1,type,status,modified,phc_type
0,5259256,Approved,Intercity3,Submitted,Approved,2024-02-01 13:36:36+00:00,2024-02-01 13:13:50+00:00,5259256,TYPE_AUTO,STATUS_DECLINED,2024-09-19 17:47:32.386000+00:00,1
1,7328143,New,Car courier,Approved,Approved,2024-03-31 12:02:37+00:00,2024-03-31 12:02:37+00:00,7328143,TYPE_AUTO,STATUS_DECLINED,2024-03-23 15:34:59+00:00,1
2,9482635,New,Intercity3,Approved,Approved,2024-05-01 23:27:33+00:00,2024-05-01 23:27:33+00:00,9482635,TYPE_AUTO,STATUS_DECLINED,2024-06-02 08:23:34+00:00,1
3,10374300,New,Car,Approved,Approved,2025-01-06 07:52:48.190000+00:00,2025-01-06 07:52:48+00:00,10374300,TYPE_AUTO,STATUS_DECLINED,2024-09-26 15:09:43.318000+00:00,1
4,22939971,New,Car courier,Approved,Approved,2024-05-08 11:28:07+00:00,2024-05-08 11:28:07+00:00,22939971,TYPE_AUTO,STATUS_DECLINED,2025-01-01 14:22:48.283000+00:00,1


In [19]:
# Lay the export columns out as side-by-side chunks so the sheet grows
# sideways instead of becoming one very long column block.
CHUNK_ROWS = 10_000  # upper bound on the number of rows in a single chunk

export_cols = df[['user_id', 'type', 'phc_type']]

# Always ask for at least one section: zero sections (empty query result)
# makes np.array_split raise.
n_chunks = max(1, math.ceil(len(export_cols) / CHUNK_ROWS))

# Split the row positions rather than the DataFrame itself, then slice with
# iloc. Section sizes are the same near-equal sizes np.array_split produces,
# without passing a DataFrame through numpy.
splitted = [
    export_cols.iloc[positions]
    for positions in np.array_split(np.arange(len(export_cols)), n_chunks)
]

# One concat call for all chunks; each chunk's index is reset so the chunks
# line up row-by-row from 0.
total_df = pd.concat([chunk.reset_index(drop=True) for chunk in splitted], axis=1)


writing_excel('PHC_consistency', total_df)

DataFrame is written to Excel File successfully.
