# Packages

In [1]:
# Visualisation
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from pyvis.network import Network

# Data analysis / Data processing
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', None)
pd.options.display.float_format = "{:,.4f}".format
from datetime import time, timedelta, datetime
import numpy as np
import networkx as nx
from collections import defaultdict
import ast

# Maths & Stats
import math 
import scipy.stats as st
from scipy import stats
from scipy.stats import norm
import statsmodels.stats.weightstats as ws
from statsmodels.stats.proportion import test_proportions_2indep
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
# from ambrosia.designer import Designer
# from ambrosia.tester import Tester
import expab
from sklearn.linear_model import Ridge
import random
from sklearn.metrics import mean_absolute_error, mean_squared_error


# System library
import os
import ipywidgets
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
tqdm.pandas()
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_format='retina'
# from itables import init_notebook_mode
# init_notebook_mode(all_interactive=True)
import openpyxl

# Data connection
from google.cloud import bigquery
bigquery_client = bigquery.Client(project='analytics-dev-333113')


# Useful functions
def read_bq(query, project='analytics-dev-333113'):
    """Execute ``query`` on BigQuery and return the result as a DataFrame.

    Args:
        query: SQL text handed to ``bigquery.Client.query``.
        project: project id used to construct the client.

    Returns:
        Whatever ``to_dataframe()`` yields for the finished query job.

    A new ``bigquery.Client`` is constructed on every call.
    """
    return bigquery.Client(project=project).query(query).to_dataframe()

def display_side_by_side(*args):
    """Render several DataFrames in one notebook output with inline tables.

    Each argument is converted with ``to_html()``; every opening ``<table``
    tag is then given ``style="display:inline"`` before the combined markup
    is published as raw HTML.

    Args:
        *args: objects exposing ``to_html()`` (pandas DataFrames).
    """
    # Imported here because the notebook's import cell only brings in
    # ``display`` and ``HTML``; without this the call below raises NameError.
    from IPython.display import display_html

    html_str = ''.join(df.to_html() for df in args)
    # Patch only the opening tags: replacing every 'table' substring would
    # also rewrite the closing tags and any cell text containing that word.
    display_html(
        html_str.replace('<table', '<table style="display:inline"'),
        raw=True
    )

def cycle_sql(start, end, query, weeks=False):
    """
    Run ``query`` once per date between ``start`` and ``end`` and stack the results.

    The query text must contain a ``{date}`` placeholder; it is filled in with
    each cycle date (formatted ``YYYY-MM-DD``) via ``str.format``, so any other
    literal braces in the SQL have to be doubled (``{{`` / ``}}``).

    Args:
        start: first date, as a ``'%Y-%m-%d'`` string.
        end: last date (inclusive), as a ``'%Y-%m-%d'`` string.
        query: SQL template containing ``{date}``.
        weeks: when truthy, step in 7-day increments from ``start`` instead
            of daily.

    Returns:
        A DataFrame with the per-date results concatenated, the most recent
        date first. Original row indexes are kept. An empty DataFrame is
        returned when the range contains no dates.
    """
    date_start = datetime.strptime(start, '%Y-%m-%d')
    date_end = datetime.strptime(end, '%Y-%m-%d')
    span_days = (date_end - date_start).days

    if weeks:
        # One step per whole week that fits into the range.
        steps = [timedelta(weeks=x) for x in range(span_days // 7 + 1)]
    else:
        steps = [timedelta(days=x) for x in range(span_days + 1)]
    daterange = [(date_start + step).strftime('%Y-%m-%d') for step in steps]

    # Collect the frames and concatenate once at the end; concatenating
    # inside the loop re-copies everything gathered so far on each iteration.
    frames = []

    for counter, date in enumerate(daterange, start=1):
        print(f"{counter}) Uploading - {date}:", datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
        script = query.format(date = date)
        df_cycle = bigquery_client.query(script).to_dataframe()
        if df_cycle.empty:
            print('Dataframe is empty')
        frames.append(df_cycle)

    if not frames:
        return pd.DataFrame()
    # Reversed so that the latest date comes first, matching the order the
    # previous prepend-style accumulation produced.
    return pd.concat(frames[::-1])

def writing_excel(name:str, dataset1=None, dataset2=None, dataset3=None, dataset4=None):
    """Write up to four DataFrames into ``{name}.xlsx``, one sheet per dataset.

    Sheets are named ``"<position>-<name>"`` where position is the dataset's
    argument slot (1-4). Every dataset that is not ``None`` is written, and
    the index is written too because ``index=False`` is not passed.

    Args:
        name: file stem and sheet-name suffix. Excel limits sheet names to
            31 characters, so keep ``name`` at 29 characters or fewer.
        dataset1, dataset2, dataset3, dataset4: objects exposing
            ``to_excel(writer, sheet_name=...)`` (pandas DataFrames), or
            ``None`` to skip that slot.
    """
    datasets = (dataset1, dataset2, dataset3, dataset4)

    with pd.ExcelWriter(f"{name}.xlsx") as writer:
        for position, dataset in enumerate(datasets, start=1):
            if dataset is None:
                continue
            dataset.to_excel(writer, sheet_name=f"{position}-{name}")

    # Printed after the ``with`` block so the message appears only once the
    # writer has been closed and the workbook saved.
    print('DataFrame is written to Excel File successfully.')

# Pulling the data

In [9]:
# Pull the comparison dataset: one row per (user, avatar) pair where the user has
# both a liveness photo and an avatar-check record.
#   * photos  - for each user_id, the most recent user_liveness row
#               (QUALIFY ROW_NUMBER() ... ORDER BY created_at DESC = 1) created on or
#               after 2025-03-01 with a non-null photo_uuid, limited to users whose
#               country_id is in the listed ids. night_time_flg is 1 when the hour of
#               created_at, converted with the city's timezone, is 21-23 or 0-6.
#   * avatars - ds_avatar_check rows with result_code 'RESULT_AVATAR_CHECK_FACE_OK',
#               joined to taskcv rows whose export_dt is on or after 2025-03-01.
# NOTE(review): the LEFT JOIN is followed by `t3.url_first IS NOT NULL`, so it behaves
# like an inner join. avatars is not deduplicated per user, so a user presumably can
# appear in several output rows - confirm whether that is intended.
df = read_bq("""
WITH photos AS (SELECT t1.user_id,
                       t1.photo_uuid,
                       IF(EXTRACT(HOUR FROM DATETIME(t1.created_at, tz.timezone)) IN
                          (21, 22, 23, 0, 1, 2, 3, 4, 5, 6), 1,
                          0) AS night_time_flg,
                       t3.macroregion_name
                FROM indriver-e6e40.ods_facechecker.user_liveness t1
                         JOIN dwh-storage-327422.personal_data.tbl_user_act t2
                              ON t1.user_id = t2.id
                         JOIN indriver-e6e40.ods_monolith.tbl_city tz ON t2.city_id = tz.id
                         JOIN indriver-e6e40.heap.vw_macroregion_mapping t3
                              ON
                                  t2.city_id = t3.city_id
                WHERE created_at >= '2025-03-01'
                  AND t2.country_id IN (12, 23, 54, 25, 13, 43, 24, 75, 72, 77, 11, 22, 10)
                  AND photo_uuid IS NOT NULL
                QUALIFY ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY created_at DESC) = 1),
     avatars AS (SELECT av.user_id,
                        tcv.url_first
                 FROM indriver-e6e40.inappropriate_content.ds_avatar_check av
                          JOIN dwh-storage-327422.personal_data.taskcv tcv ON tcv.task_id = CAST(av.task_id AS string)
                 WHERE tcv.export_dt >= '2025-03-01'
                   AND result_code = 'RESULT_AVATAR_CHECK_FACE_OK')
SELECT t1.user_id,
       t1.macroregion_name,
       t1.night_time_flg,
       t1.photo_uuid AS liveness_picture,
       t3.url_first  AS avatar_picture
FROM photos t1
         LEFT JOIN avatars t3 ON t1.user_id = t3.user_id
WHERE (t1.photo_uuid IS NOT NULL AND t3.url_first IS NOT NULL)
""")

# Preview the first rows of the result.
df.head()

Unnamed: 0,user_id,macroregion_name,night_time_flg,liveness_picture,avatar_picture
0,10894660,Latin America,0,0196e4b8-a9ec-7c81-81d9-46571000bec2,https://file-storage-front.eu-east-1.indriverapp.com/api/v1/files/0196d63a-321a-7032-bd3e-ca5483f7275a?resize=800x600
1,13022962,Latin America,0,01974b1c-e9f2-735f-a2c7-3fceba2f5314,https://file-storage-front.eu-east-1.indriverapp.com/api/v1/files/01974b19-cc69-7384-8a67-9da0b86c1d19?resize=800x600
2,13536252,Latin America,0,019674bf-ffb2-7067-8508-91d4c9ef0135,https://file-storage-front.eu-east-1.indriverapp.com/api/v1/files/0195d413-06c0-76b7-9e3c-649290421ff5?resize=800x600
3,13570013,Latin America,0,019756f3-84b3-7aae-89ca-1988ae0f334d,https://file-storage-front.eu-east-1.indriverapp.com/api/v1/files/01956c85-6438-7134-8139-0d384957cb4a?resize=800x600
4,13702862,Latin America,0,019712df-b058-7acf-ab37-aad022570cf0,https://file-storage-front.eu-east-1.indriverapp.com/api/v1/files/01970a0e-56bf-7bd3-837b-5cda5778a9a9?resize=800x600


In [3]:
def writing_excel(name:str, dataset1=None, dataset2=None, dataset3=None, dataset4=None):
    """Write up to four DataFrames into ``{name}.xlsx``, one sheet per dataset.

    Sheets are named ``"<position>-<name>"`` where position is the dataset's
    argument slot (1-4). Every dataset that is not ``None`` is written, and
    the index is written too because ``index=False`` is not passed.

    Args:
        name: file stem and sheet-name suffix. Excel limits sheet names to
            31 characters, so keep ``name`` at 29 characters or fewer.
        dataset1, dataset2, dataset3, dataset4: objects exposing
            ``to_excel(writer, sheet_name=...)`` (pandas DataFrames), or
            ``None`` to skip that slot.
    """
    datasets = (dataset1, dataset2, dataset3, dataset4)

    with pd.ExcelWriter(f"{name}.xlsx") as writer:
        for position, dataset in enumerate(datasets, start=1):
            if dataset is None:
                continue
            dataset.to_excel(writer, sheet_name=f"{position}-{name}")

    # Printed after the ``with`` block so the message appears only once the
    # writer has been closed and the workbook saved.
    print('DataFrame is written to Excel File successfully.')

In [17]:
# Draw a fixed-size random sample for each (region group, night flag) cell.
# A fixed seed keeps the exported sample reproducible across notebook re-runs.
# DataFrame.sample raises ValueError if a cell holds fewer rows than requested.
sample_size = 2000
sample_seed = 42

in_latam = df['macroregion_name'].isin(['Latin America', 'Brazil'])
in_africa = df['macroregion_name'] == 'Africa'
day_time = df['night_time_flg'] == 0
night_time = df['night_time_flg'] == 1

latam_0 = df[in_latam & day_time].sample(sample_size, random_state=sample_seed)
latam_1 = df[in_latam & night_time].sample(sample_size, random_state=sample_seed)
africa_0 = df[in_africa & day_time].sample(sample_size, random_state=sample_seed)
africa_1 = df[in_africa & night_time].sample(sample_size, random_state=sample_seed)

In [18]:
# Export the four samples to Avatar_Liveness_ML_comparison.xlsx, one per dataset slot.
writing_excel(
    'Avatar_Liveness_ML_comparison',
    dataset1=latam_0,
    dataset2=latam_1,
    dataset3=africa_0,
    dataset4=africa_1,
)

DataFrame is written to Excel File successfully.
