In [101]:
# Visualisation
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from pyvis.network import Network

# Data analysis / Data processing
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', None)
pd.options.display.float_format = "{:,.2f}".format
from datetime import time, timedelta, datetime
import numpy as np
import networkx as nx
from collections import defaultdict
import ast

# Maths & Stats
import math 
import scipy.stats as st
from scipy import stats
from scipy.stats import norm
import statsmodels.stats.weightstats as ws
from statsmodels.stats.proportion import test_proportions_2indep
import AB_library

# System library
import os
import ipywidgets
import warnings
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm
tqdm.pandas()
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
%config InlineBackend.figure_format='retina'
# from itables import init_notebook_mode
# init_notebook_mode(all_interactive=True)
import openpyxl

# Data connection
from google.cloud import bigquery
# Module-level BigQuery client bound to the analytics project.
# NOTE(review): nothing in the visible cells references `bigquery_client`
# (`read_bq` builds its own client on every call) — confirm it is still needed.
bigquery_client = bigquery.Client(project='analytics-dev-333113')


# Useful functions
def read_bq(query, project='analytics-dev-333113'):
    """Run a SQL query in BigQuery and return the result as a DataFrame.

    Parameters
    ----------
    query : str
        SQL text submitted to BigQuery.
    project : str, optional
        Project the BigQuery client is created for.

    Returns
    -------
    The object produced by ``to_dataframe()`` on the finished query job.
    """
    # A fresh client is created for every call, scoped to `project`.
    return bigquery.Client(project=project).query(query).to_dataframe()

def display_side_by_side(*args):
    """Render several DataFrames next to each other in the notebook output.

    Parameters
    ----------
    *args
        Objects exposing ``to_html()`` (e.g. pandas DataFrames). They are
        rendered left to right in the order given.

    Returns
    -------
    None. The combined HTML is sent to the notebook display.
    """
    # Imported here because the notebook's import cell only brings in
    # `display` and `HTML`; without this the call below raises NameError.
    from IPython.display import display_html

    html_str = ''.join(df.to_html() for df in args)

    # Only the opening <table ...> tags get the inline style. Replacing the
    # bare word 'table' would also rewrite closing tags and any cell text
    # that happens to contain "table".
    display_html(
        html_str.replace('<table', '<table style="display:inline"'),
        raw=True
    )

def writing_excel(name:str, dataset1=None, dataset2=None, dataset3=None, dataset4=None):
    """Write up to four DataFrames to ``{name}.xlsx``, one sheet per dataset.

    Parameters
    ----------
    name : str
        Base name of the workbook (``.xlsx`` is appended) and suffix of every
        sheet name.
    dataset1, dataset2, dataset3, dataset4 : DataFrame, optional
        Frames to export. The dataset in slot ``k`` is written to the sheet
        ``"{k}-{name}"`` with its index included. Slots left as ``None`` are
        skipped, including gaps (e.g. ``dataset1`` and ``dataset3`` only).

    Raises
    ------
    ValueError
        If every dataset is ``None``. A workbook needs at least one sheet, so
        this is reported before any file is created.
    """
    # Keep the original slot number so sheet names stay "1-...", "2-...", etc.
    sheets = [
        (position, dataset)
        for position, dataset in enumerate((dataset1, dataset2, dataset3, dataset4), start=1)
        if dataset is not None
    ]
    if not sheets:
        raise ValueError("writing_excel needs at least one dataset to write")

    with pd.ExcelWriter(f"{name}.xlsx") as writer:
        for position, dataset in sheets:
            dataset.to_excel(writer, sheet_name=f"{position}-{name}")

    # Printed after the writer is closed, i.e. once the file has been saved.
    print('DataFrame is written to Excel File successfully.')

In [22]:
# Users whose service-side ban flag disagrees with the ban table, joined to
# their daily ride/GMV metrics.
#   expel_ban - one row per user from tbl_ban (modules all/city/courier),
#               keeping the row with the latest end_dttm.
#   user_tbl  - user id, mode and the `banned` flag (NULL treated as 0).
#   compiled  - users LEFT JOINed to their ban row, plus derived status
#               flag/label and long/short-term classification.
#   gmv       - rides and GMV summed per user and metric date.
# The final SELECT keeps rows where the two flags differ and attaches metrics
# dated on or after the ban start date.
ban_mismatch_gmv_sql = """
    WITH expel_ban AS (SELECT user_id,
                            start_dttm,
                            end_dttm,
                            module,
                            type_cd,
                            autoban_id,
                            unban_applied_cd,
                            1 AS has_ban
                    FROM dwh-storage-327422.ods_expel.tbl_ban
                    WHERE 1 = 1
                        AND module IN ('all', 'city', 'courier')
                    QUALIFY ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY end_dttm DESC) = 1),
        user_tbl AS (SELECT id,
                            COALESCE(banned, 0) AS user_service_banned_flg,
                            mode
                    FROM dwh-storage-327422.personal_data.tbl_user_act),
        compiled AS (SELECT t1.id,
                            t1.user_service_banned_flg,
                            CASE
                                WHEN (DATE(end_dttm) < CURRENT_DATE()) OR (unban_applied_cd = 1) THEN 0
                                WHEN (DATE(end_dttm) > DATE_ADD(CURRENT_DATE(), INTERVAL + 3 DAY) AND unban_applied_cd = 0)
                                    THEN 1
                                WHEN end_dttm IS NULL THEN 0
                                ELSE NULL
                                END ban_expel_status_flg,
                            CASE
                                WHEN (DATE(end_dttm) < CURRENT_DATE()) OR (unban_applied_cd = 1) THEN 'ban_is_expired'
                                WHEN (DATE(end_dttm) > DATE_ADD(CURRENT_DATE(), INTERVAL + 3 DAY) AND unban_applied_cd = 0)
                                    THEN 'ban_is_active'
                                WHEN end_dttm IS NULL THEN 'was_no_ban'
                                ELSE NULL
                                END ban_expel_status,
                            CASE
                                WHEN DATE_DIFF(DATE(end_dttm), DATE(start_dttm), YEAR) >= 10 THEN 'long_term_ban'
                                WHEN DATE_DIFF(DATE(end_dttm), DATE(start_dttm), YEAR) < 10 THEN 'short_term_ban'
                                ELSE NULL
                                END period_of_ban,
                            t1.mode,
                            t2.start_dttm,
                            t2.end_dttm,
                            t2.module,
                            t2.type_cd,
                            t2.autoban_id,
                            t2.unban_applied_cd
                    FROM user_tbl t1
                            LEFT JOIN expel_ban t2
                                        ON t1.id = t2.user_id),
        gmv AS (SELECT user_id,
                        metric_date_utc,
                        SUM(rides_count)     AS rides_count,
                        SUM(gmv_clean_usd)   AS gmv_clean_usd,
                        SUM(gmv_unclean_usd) AS unclean_gmv_usd
                FROM indriver-bi.incity.tbl_incity_growth_metrics_detail
                GROUP BY 1, 2)
    SELECT *
    FROM compiled t1
            LEFT JOIN gmv t2
                    ON t1.id = t2.user_id
                        AND t2.metric_date_utc >= DATE(start_dttm)
    WHERE user_service_banned_flg != ban_expel_status_flg
"""

df = read_bq(ban_mismatch_gmv_sql)

In [24]:
# Total rides and clean GMV per combination of service flag, user mode and
# ban status. pandas drops groups whose key contains NaN by default, so rows
# with a null `period_of_ban` are not part of this summary.
group_keys = [
    'user_service_banned_flg',
    'mode',
    'ban_expel_status_flg',
    'ban_expel_status',
    'period_of_ban',
]
metric_columns = ['rides_count', 'gmv_clean_usd']

df.groupby(group_keys, as_index=False)[metric_columns].sum()

Unnamed: 0,user_service_banned_flg,mode,ban_expel_status_flg,ban_expel_status,period_of_ban,rides_count,gmv_clean_usd
0,0,client,1,ban_is_active,long_term_ban,8443,26450.17
1,0,client,1,ban_is_active,short_term_ban,65,205.33
2,0,driver,1,ban_is_active,long_term_ban,24023,63873.06
3,0,driver,1,ban_is_active,short_term_ban,74,177.03
4,1,client,0,ban_is_expired,long_term_ban,176,738.12
5,1,client,0,ban_is_expired,short_term_ban,1345,4213.7
6,1,driver,0,ban_is_expired,long_term_ban,853,1731.79
7,1,driver,0,ban_is_expired,short_term_ban,5668,15431.2


In [68]:
# Users whose service-side ban flag disagrees with the ban table (no metrics).
#   expel_ban - one row per user from tbl_ban (modules all/city/courier) for
#               bans that started at least two days ago, keeping the row with
#               the latest end_dttm.
#   user_tbl  - user id, mode and the `banned` flag (NULL treated as 0).
#   compiled  - users LEFT JOINed to their ban row, plus derived status
#               flag/label and long/short-term classification.
ban_mismatch_sql = """
    WITH expel_ban AS (SELECT user_id,
                            start_dttm,
                            end_dttm,
                            module,
                            type_cd,
                            autoban_id,
                            unban_applied_cd,
                            1 AS has_ban
                    FROM dwh-storage-327422.ods_expel.tbl_ban
                    WHERE 1 = 1
                        AND module IN ('all', 'city', 'courier')
                        AND DATE(start_dttm) <= DATE_ADD(CURRENT_DATE(), INTERVAL - 2 DAY)
                    QUALIFY ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY end_dttm DESC) = 1),
        user_tbl AS (SELECT id,
                            COALESCE(banned, 0) AS user_service_banned_flg,
                            mode
                    FROM dwh-storage-327422.personal_data.tbl_user_act),
        compiled AS (SELECT t1.id,
                            t1.user_service_banned_flg,
                            CASE
                                WHEN (DATE(end_dttm) < CURRENT_DATE()) OR (unban_applied_cd = 1) THEN 0
                                WHEN (DATE(end_dttm) > DATE_ADD(CURRENT_DATE(), INTERVAL + 3 DAY) AND unban_applied_cd = 0)
                                    THEN 1
                                WHEN end_dttm IS NULL THEN 0
                                ELSE NULL
                                END ban_expel_status_flg,
                            CASE
                                WHEN (DATE(end_dttm) < CURRENT_DATE()) OR (unban_applied_cd = 1) THEN 'ban_is_expired'
                                WHEN (DATE(end_dttm) > DATE_ADD(CURRENT_DATE(), INTERVAL + 3 DAY) AND unban_applied_cd = 0)
                                    THEN 'ban_is_active'
                                WHEN end_dttm IS NULL THEN 'was_no_ban'
                                ELSE NULL
                                END ban_expel_status,
                            CASE
                                WHEN DATE_DIFF(DATE(end_dttm), DATE(start_dttm), YEAR) >= 10 THEN 'long_term_ban'
                                WHEN DATE_DIFF(DATE(end_dttm), DATE(start_dttm), YEAR) < 10 THEN 'short_term_ban'
                                ELSE NULL
                                END period_of_ban,
                            t1.mode,
                            t2.start_dttm,
                            t2.end_dttm,
                            t2.module,
                            t2.type_cd,
                            t2.autoban_id,
                            t2.unban_applied_cd
                    FROM user_tbl t1
                            LEFT JOIN expel_ban t2
                                        ON t1.id = t2.user_id)
    SELECT *
    FROM compiled
    WHERE user_service_banned_flg != ban_expel_status_flg
"""

# Users without a matching ban row come back with a null `module`; label them
# explicitly so they can be selected as their own category.
df_grouping = read_bq(ban_mismatch_sql).assign(
    module=lambda frame: frame['module'].fillna('was_no_ban')
)
df_grouping.head()

In [70]:
# Bare expression so the notebook renders the frame. It has far more rows
# than the `display.max_rows` limit set in the setup cell (500), so pandas
# shows only the first and last rows.
df_grouping

Unnamed: 0,id,user_service_banned_flg,ban_expel_status_flg,ban_expel_status,period_of_ban,mode,start_dttm,end_dttm,module,type_cd,autoban_id,unban_applied_cd
0,12945886,0,1,ban_is_active,long_term_ban,client,2018-11-07 14:34:39+00:00,2118-11-08 14:34:39+00:00,all,0,0,0
1,12971555,0,1,ban_is_active,long_term_ban,client,2018-11-07 15:05:30+00:00,2118-11-08 15:05:30+00:00,all,0,0,0
2,6332286,0,1,ban_is_active,long_term_ban,driver,2019-08-29 14:36:47+00:00,2119-08-29 14:36:47+00:00,all,5,0,0
3,2687484,1,0,was_no_ban,,client,,,was_no_ban,,,
4,5865615,1,0,was_no_ban,,client,,,was_no_ban,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
215514,25447293,0,1,ban_is_active,long_term_ban,client,2019-08-22 15:44:30+00:00,3019-08-22 15:44:30+00:00,all,2,0,0
215515,25461759,0,1,ban_is_active,long_term_ban,client,2019-08-08 10:05:09+00:00,2119-08-08 10:05:15+00:00,all,2,0,0
215516,25453234,0,1,ban_is_active,long_term_ban,client,2019-08-05 22:56:23+00:00,3019-08-06 22:56:23+00:00,all,3,0,0
215517,25430769,0,1,ban_is_active,long_term_ban,client,2019-08-23 10:56:39+00:00,3019-08-24 10:56:39+00:00,all,3,0,0


In [125]:
# Number of mismatching users whose latest ban applies to module 'all'.
len(df_grouping.loc[df_grouping['module'] == 'all'])

150784

In [119]:
modules = ['was_no_ban', 'all', 'courier', 'city']

# Upper bound on rows per side-by-side column group in the exported sheet.
ROWS_PER_GROUP = 5000
EXPORT_COLUMNS = ['id', 'module', 'ban_expel_status']


def split_side_by_side(frame, max_rows):
    """Lay a long frame out as several column groups placed next to each other.

    The rows are cut into ``ceil(len(frame) / max_rows)`` consecutive,
    near-equal groups (same partitioning as ``np.array_split``). Each group
    is re-indexed from 0 and the groups are concatenated column-wise, so
    shorter groups are padded with missing values at the bottom.

    Parameters
    ----------
    frame : pd.DataFrame
        Rows to lay out.
    max_rows : int
        Maximum number of rows per column group.

    Returns
    -------
    pd.DataFrame
        The column-wise concatenation; an empty frame if `frame` is empty.
    """
    n_groups = math.ceil(len(frame) / max_rows)
    if n_groups == 0:
        # np.array_split rejects 0 sections; nothing to lay out anyway.
        return frame.iloc[0:0].reset_index(drop=True)

    # Split row positions rather than the frame itself, then slice with iloc.
    position_groups = np.array_split(np.arange(len(frame)), n_groups)

    # One concat over all groups instead of growing a frame inside a loop.
    return pd.concat(
        [frame.iloc[positions].reset_index(drop=True) for positions in position_groups],
        axis=1,
    )


for category in modules:
    # Filter once per category and keep only the exported columns.
    category_rows = df_grouping.loc[df_grouping['module'] == category, EXPORT_COLUMNS]

    if category_rows.empty:
        # A module with no mismatching users would otherwise abort the whole
        # export with a ValueError from np.array_split.
        print(f'No rows for module {category!r}; skipping Excel export.')
        continue

    writing_excel(category, split_side_by_side(category_rows, ROWS_PER_GROUP))

    

DataFrame is written to Excel File successfully.
DataFrame is written to Excel File successfully.
DataFrame is written to Excel File successfully.
DataFrame is written to Excel File successfully.
