# Packages

#### Libraries

In [2]:
# Visualisation
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from pyvis.network import Network

# Data analysis / Data processing
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.float_format = "{:,.2f}".format
from datetime import time, timedelta, datetime
import numpy as np
import networkx as nx
from collections import defaultdict
import ast

# Maths & Stats
import math 
import scipy.stats as st
from scipy import stats
from scipy.stats import norm
import statsmodels.stats.weightstats as ws
from statsmodels.stats.proportion import test_proportions_2indep
import AB_library

# System library
import os
import ipywidgets
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
tqdm.pandas()
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_format='retina'
# from itables import init_notebook_mode
# init_notebook_mode(all_interactive=True)
import openpyxl

# Data connection
from google.cloud import bigquery
bigquery_client = bigquery.Client(project='analytics-dev-333113')


# Useful functions
def cycle_sql(start, end, query, weeks=False):
    """
    You have to use {date} in your script to add cycle date into this backets
    """
    date_start = datetime.strptime(start, '%Y-%m-%d')
    date_end = datetime.strptime(end, '%Y-%m-%d')

    if weeks == False:
        daterange = [(date_start + timedelta(days=x)).strftime('%Y-%m-%d') for x in range(((date_end-date_start).days)+1)]
    else:
        daterange = [(date_start + timedelta(weeks=x)).strftime('%Y-%m-%d') for x in range(((date_end-date_start).days//7)+1)] # weeks dividing days by 7

    total_df = pd.DataFrame()

    counter = 0

    for date in daterange:
        counter+=1
        print(f"{counter}) Uploading - {date}:", datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
        script = query.format(date = date)
        df_cycle = bigquery_client.query(script).to_dataframe()
        if df_cycle.empty == True:
            print('Dataframe is empty')
        total_df = pd.concat([df_cycle, total_df])
    return total_df

def read_bq(query, project='analytics-dev-333113'):
    client = bigquery.Client(project=project)
    query_job = client.query(query)
    result_df = query_job.to_dataframe()
    return result_df

def display_side_by_side(*args):
    html_str = ''
    for df in args:
        html_str += df.to_html()
    display_html(
        html_str.replace('table','table style="display:inline"'), 
        raw=True
    )

#### Functions

In [4]:
def get_connections(df, max_level=3, col_name='connections'):

    G = nx.Graph()
    edges = list(zip(df["user_id_from"], df["user_id_to"]))
    G.add_edges_from(edges)

    connections_data = []

    for user in tqdm(G.nodes):
        user_connections = defaultdict(set)

        levels = {user: 0}
        queue = [(user, 0)]
        visited = set([user])

        while queue:
            current, level = queue.pop(0)

            if level >= max_level:
                continue

            for neighbor in G.neighbors(current):
                if neighbor not in visited:
                    visited.add(neighbor)
                    levels[neighbor] = level + 1
                    queue.append((neighbor, level + 1))
                    user_connections[level + 1].add(neighbor)

        sorted_connections = tuple(sorted(user_connections[lvl]) for lvl in sorted(user_connections))
        connections_data.append((user, sorted_connections))

    return pd.DataFrame(connections_data, columns=["user_id_from", col_name])

def level_depth(row):
    return len(row)

def number_connections(row):
    connections = 0
    for i in row:
        connections += len(i)

    return connections

def number_of_1st_connections(row):
        if len(row) > 0:
            return len(row[0])
        else:
            return 0 

def number_of_2nd_connections(row):
        if len(row) > 1:
            return len(row[1])
        else:
            return 0  

def number_of_3rd_connections(row):
        if len(row) > 2:
            return len(row[2])
        else:
            return 0 

# Checking


In [5]:
df = read_bq("""
SELECT name,
       event_dt_part,
       country_name,
       os_name,
       COUNT(DISTINCT user_id) AS users,
FROM (SELECT user_id,
             name,
             os_name,
             event_dt_part,
             TIMESTAMP_MILLIS(client_time)                            AS client_time,
             t1.city_id,
             t2.city_name,
             t2.country_id,
             t2.country_name,
             IF(LAG(JSON_EXTRACT_SCALAR(payload, '$.verification_flow'))
                    OVER (PARTITION BY user_id ORDER BY client_time) IS NULL,
                JSON_EXTRACT_SCALAR(payload, '$.verification_flow'),
                LAG(JSON_EXTRACT_SCALAR(payload, '$.verification_flow'))
                    OVER (PARTITION BY user_id ORDER BY client_time)) AS fulfilled_flow,
             JSON_EXTRACT_SCALAR(payload, '$.status')                 AS status
      FROM indriver-e6e40.ods_event_tracker.event t1
               JOIN indriver-e6e40.heap.vw_macroregion_mapping t2
                    ON
                        t1.city_id = t2.city_id
      WHERE 1 = 1
        AND os_name IN ('ios', 'android')
        AND name IN (
                     'client.verification_start.show',
                     'client.verification_flow_result_status.show',
                     'client.verification_start.click',
                     'client.sumsub_verification_provider_start.show',
                     'client.verification_flow_get_status.done'
          )
        AND event_dt_part BETWEEN '2025-03-10' AND '2025-04-08')
GROUP BY 1, 2, 3, 4
  """)

In [7]:
df.groupby(['name', 'event_dt_part', 'country_name', 'os_name'], as_index=False)['users'].sum()

Unnamed: 0,name,event_dt_part,country_name,os_name,users
0,client.sumsub_verification_provider_start.show,2025-03-10,Argentina,android,166
1,client.sumsub_verification_provider_start.show,2025-03-10,Argentina,ios,89
2,client.sumsub_verification_provider_start.show,2025-03-10,Bolivia,android,4
3,client.sumsub_verification_provider_start.show,2025-03-10,Bolivia,ios,3
4,client.sumsub_verification_provider_start.show,2025-03-10,Brazil,android,6270
...,...,...,...,...,...
5080,client.verification_start.show,2025-04-08,Peru,android,3352
5081,client.verification_start.show,2025-04-08,Peru,ios,709
5082,client.verification_start.show,2025-04-08,South Africa,android,4092
5083,client.verification_start.show,2025-04-08,South Africa,ios,917


In [11]:
df["event_dt_part"] = pd.to_datetime(df["event_dt_part"])

In [12]:
df["Week"] = df["event_dt_part"].dt.to_period("W").dt.to_timestamp()

In [14]:
for country in df['country_name'].unique():
    for os in df['os_name'].unique():
        fig = px.line(
            df.groupby(['name', 'event_dt_part', 'country_name', 'os_name'], as_index=False)['users'].sum().query(f"country_name == '{country}' and os_name == '{os}'"), 
            x="event_dt_part", 
            y="users", 
            color='name',
            title=f'OS = {os} and Country = {country}')
        fig.show()

SyntaxError: Python keyword not valid identifier in numexpr query (<unknown>, line 1)

In [None]:
fig = px.line(df, x="year", y="lifeExp", title='Life expectancy in Canada')
fig.show()