# Google Analytics Website Reports
[References](https://janakiev.com/blog/python-google-analytics/)

## Environment settings

In [1]:
# import Libraries
import numpy as np
import pandas as pd
import pandas_gbq
import gspread
import polars as pl
import duckdb
import altair as alt
import json
from datawrapper import Datawrapper
from google.cloud import bigquery
from oauth2client.service_account import ServiceAccountCredentials
from google.oauth2 import service_account
from googleapiclient.discovery import build
import warnings
warnings.filterwarnings("ignore")
import pytz
import datetime
# set Mexico City datatime
tz = pytz.timezone('America/Mexico_City')
update = datetime.datetime.now(tz).strftime('%Y-%m-%d %H:%M')
period = f'Last update: {update}'
# set polars column str length
pl.Config.set_fmt_str_lengths(100)

polars.config.Config

In [2]:
# Token gotten from datawrapper api
filename = "../APIs/credentials.json"
# read json file
with open(filename) as f:
    keys = json.load(f)
# read credentials
token = keys["datawrapper_api"]
# Acces datawrapper
dw = Datawrapper(access_token = token)
# Check if the account is correctly initialized
#dw.account_info() 

In [3]:
# create function to send data to gsheets
def save_to_gsheets(df, sheet_name, worksheet_name, period):
    client = gspread.service_account(api)
    sheet = client.open(sheet_name)
    worksheet = sheet.worksheet(worksheet_name)
    
    # freeze rows starting from
    worksheet.freeze(4)

    # clear text from column A4 to J
    worksheet.batch_clear(['A4:J'])

    # convert column datetime type to string
    for column in df.columns[df.dtypes == 'datetime64[ns]']:
        df[column] = df[column].astype(str)

    # replace NaN values by empty strings
    worksheet.update('A4', [df.columns.values.tolist()] + df.fillna('').values.tolist())

    # set update date
    worksheet.update('J2',period)

    print(f'DataFrame uploaded to: {sheet_name}, {worksheet_name}')

## Collect data from Google Analytics

In [4]:
# google analytics ids
view_id = '219175238'
# GA api
api = '../APIs/gepp-538-db.json'
# google analytics, drive and sheets
scopes = ['https://www.googleapis.com/auth/spreadsheets',
        'https://www.googleapis.com/auth/drive',
        'https://www.googleapis.com/auth/analytics.readonly']
# connect to google sheets
gs_credentials = ServiceAccountCredentials.from_json_keyfile_name(api, scopes)
gc = gspread.authorize(gs_credentials)
# connect to big query
bq_credentials = service_account.Credentials.from_service_account_file(api)
project_id = 'gepp-538'
client = bigquery.Client(credentials=bq_credentials,project=project_id)
# connect to analytics service
service = build('analyticsreporting','v4',credentials=gs_credentials)

In [5]:
# collect data from google analytics
def create_df(response):

    #create two empty lists that will hold our dimentions and sessions data
    row_list = []
  
    #Extract Data
    for report in response.get('reports', []):
        column_header = report.get('columnHeader', {})
        dimension_headers = column_header.get('dimensions', [])
        metric_headers = column_header.get('metricHeader', {}).get('metricHeaderEntries', [])
  
        for row in report.get('data', {}).get('rows', []):
            row_dict = {}
            dimensions = row.get('dimensions', [])
            date_range_values = row.get('metrics', [])
  
            for header, dimension in zip(dimension_headers, dimensions):
                row_dict[header] = dimension
  
            for i, values in enumerate(date_range_values):
                for metric, value in zip(metric_headers, values.get('values')):
                    if ',' in value or '.' in value:
                        row_dict[metric.get('name')] = float(value)
                    else:
                        row_dict[metric.get('name')] = int(value)
            row_list.append(row_dict)
    return pl.DataFrame(row_list)

## General figures
* Date
* Users
* New Users
* Sessions
* Bounce Rate
* Pageviews, etc

### Google Analytics parameters

In [6]:
# add start and end dates
inicio = '2022-01-01'
fin = 'yesterday'
# add query parameters
dimension = 'ga:date'
metric1 = 'ga:users'
metric2 = 'ga:newUsers'
metric3 = 'ga:sessions'
metric4 = 'ga:bounceRate'
metric5 = 'ga:pageviews'
metric6 = 'ga:pageviewsPerSession'
metric7 = 'ga:avgSessionDuration'
metric8 = 'ga:avgPageLoadTime'

### Google Analytics query

In [7]:
# execute query
response = service.reports().batchGet(
body = {
        'reportRequests': [
            {
                'viewId': view_id,
                'dateRanges': [{'startDate': inicio, 'endDate': fin}],
                            'metrics': [
                                        {'expression': metric1},
                                        {'expression': metric2},
                                        {'expression': metric3},
                                        {'expression': metric4},
                                        {'expression': metric5},
                                        {'expression': metric6},
                                        {'expression': metric7},
                                        {'expression': metric8},],
                'dimensions': [{"name": dimension}],
            }]
    }
).execute()

metrics1 = create_df(response)

### Data transformation with polars

In [8]:
# change column names by dropping 'ga:' prefix
metrics1.columns = [col.replace('ga:','') for col in metrics1.columns]
# convert column types
metrics1 = metrics1.with_columns(
    [
        pl.col('date').str.strptime(pl.Date, format='%Y%m%d', strict=False).cast(pl.Date),
        pl.col('users').cast(pl.Int64),
        pl.col('newUsers').cast(pl.Int64),
        pl.col('sessions').cast(pl.Int64),
        pl.col('bounceRate').cast(pl.Float64),
        pl.col('pageviews').cast(pl.Int64),
        pl.col('pageviewsPerSession').cast(pl.Float64),
        pl.col('avgSessionDuration').cast(pl.Float64),
        pl.col('avgPageLoadTime').cast(pl.Float64),
    ]
)

In [9]:
metrics1.head()

date,users,newUsers,sessions,bounceRate,pageviews,pageviewsPerSession,avgSessionDuration,avgPageLoadTime
date,i64,i64,i64,f64,i64,f64,f64,f64
2022-01-01,1265,53,1382,33.212735,7013,5.07453,122.863242,2.466
2022-01-02,1482,66,1627,28.149969,9928,6.102028,141.514444,2.728795
2022-01-03,3261,254,3878,17.741104,29660,7.648272,217.521661,2.649454
2022-01-04,3342,226,3949,19.16941,27197,6.88706,193.655356,2.95732
2022-01-05,3245,261,3844,18.132154,26633,6.92846,216.087669,2.284005


### Data SQL query with DuckDB

In [10]:
# create sql query
report1 = duckdb.sql(
'''
SELECT DATE_PART('year', date) AS Year
    , DATE_PART('month', date) AS Month
    , SUM(users) AS Users
    , SUM(newUsers) AS 'New users'
    , SUM(sessions) AS Sessions
    , AVG(bounceRate) AS 'Bounce rate'
    , SUM(pageViews) AS 'Pageviews'
    , AVG(pageviewsPerSession) AS 'Pageviews per session'
    , AVG(avgSessionDuration) AS 'Avg session duration (mins)'
    , AVG(avgPageLoadTime) AS 'Avg page load (mins)'
FROM metrics1
WHERE year >= 2022
GROUP BY Year, Month
ORDER BY Year, Month
'''
).pl()
report1

Year,Month,Users,New users,Sessions,Bounce rate,Pageviews,Pageviews per session,Avg session duration (mins),Avg page load (mins)
i64,i64,f64,f64,f64,f64,f64,f64,f64,f64
2022,1,90324.0,7543.0,106250.0,21.507553,697512.0,6.391275,192.994366,3.090498
2022,2,113932.0,10062.0,136482.0,19.485489,936521.0,6.740715,210.71825,2.53074
2022,3,155208.0,12378.0,188577.0,19.111657,1303699.0,6.768296,216.795018,2.459314
2022,4,152008.0,9307.0,184806.0,19.479063,1227636.0,6.481416,213.753228,2.308335
2022,5,160358.0,10214.0,194245.0,17.950407,1391101.0,7.003548,223.612496,2.390385
2022,6,194637.0,13845.0,238861.0,16.495787,1783551.0,7.29177,229.162277,2.462701
2022,7,194945.0,12890.0,239903.0,5.4698,1705437.0,6.924726,222.942912,2.561448
2022,8,182811.0,11689.0,220242.0,4.485293,1532759.0,6.825749,214.403027,2.874907
2022,9,182468.0,11534.0,218254.0,8.11832,1381836.0,6.200656,201.541811,2.917417
2022,10,186433.0,12629.0,222226.0,10.175147,1386061.0,6.059682,198.799056,2.784551


### Send data to Google Sheets Report

In [15]:
# convert to pandas dataframe from polars
report1 = report1.to_pandas()
# send pandas dataframe to google sheets
save_to_gsheets(report1, 'GA Monthly Report', 'figures', period)

DataFrame uploaded to: GA Monthly Report, figures


### Google Sheets Report visualization

In [16]:
%%html
<iframe src="https://docs.google.com/spreadsheets/d/1u0YESJSPl3jSDp46-54ug1FaMrGu7Xwosk7CrACaWyA/edit?usp=sharing" width="880" height="880"></iframe>

### Report Visualization with Datawrapper

In [17]:
# Create chart in Datawrapper
report_ga = dw.create_chart(
    title = "Monthly Report from Google Analytics, 2022-2023",
    chart_type = 'tables',
    data = report1
    )
# Publish in Datawrapper
dw.publish_chart(report_ga['id'], display=False)
# Get iframe code to embed e.g. in Notion
dw.get_iframe_code(report_ga['id'])
print('Done!')

Done!


In [19]:
dw.get_iframe_code(report_ga['id'])

'<iframe title="Monthly Report from Google Analytics, 2022-2023" aria-label="Table" id="datawrapper-chart-qxpAY" src="https://datawrapper.dwcdn.net/qxpAY/1/" scrolling="no" frameborder="0" style="border: none;" width="600" height="400" data-external="1"></iframe>'

In [22]:
%%html
<iframe src="https://datawrapper.dwcdn.net/qxpAY/1/" width="880" height="550"></iframe>

## Quantities and Revenue

In [13]:
# execute query
response = service.reports().batchGet(
body={
        'reportRequests': [
            {
                'viewId': view_id,
                'dateRanges': [{'startDate': inicio, 'endDate': fin}],
                            'metrics': [
                                        {'expression': 'ga:productRevenuePerPurchase'},
                                        {'expression':'ga:uniquePurchases'},
                                        {'expression':'ga:itemQuantity'},
                                        {'expression':'ga:revenuePerItem'},
                                        {'expression':'ga:itemsPerPurchase'},
                                        {'expression':'ga:itemRevenue'},
                                        ],
                'dimensions': [{"name": 'ga:productSku'}],
            }]
    }
).execute()

products = create_df(response)

In [14]:
# drop ga: prefix from column names
products.columns = [col.replace('ga:','') for col in products.columns]
# display top 5 products by highest revenue
products.sort('productRevenuePerPurchase', descending=True).head(5)

productSku,productRevenuePerPurchase,uniquePurchases,itemQuantity,revenuePerItem,itemsPerPurchase,itemRevenue
str,f64,i64,i64,f64,f64,f64
"""3292""",1021.944318,88,90,999.234444,1.022727,89931.1
"""6796""",649.466667,15,34,286.529412,2.266667,9742.0
"""3187""",442.9,10,10,442.9,1.0,4429.0
"""8036""",398.05042,119,157,301.707006,1.319328,47368.0
"""2655""",395.335565,336,345,385.022464,1.026786,132832.75


In [16]:
products = products.sort('productRevenuePerPurchase', descending=True).head(10).to_pandas()
alt.Chart(
    products, 
    width=700, 
    height=350, 
    title='Top 10 most profitable products in website',
).mark_bar().encode(
    x = alt.X('productRevenuePerPurchase:Q'),
    y = alt.Y('productSku:N', sort='-x'),
    color = alt.value('#967117'),
    tooltip = alt.Tooltip(['productSku', 'productRevenuePerPurchase',], format=',.2f')
).configure_title(fontSize=24)

## Transactions and revenue

In [17]:
response = service.reports().batchGet(body={
    'reportRequests': [{
        'viewId': view_id,
        'dateRanges': [{'startDate': inicio, 'endDate': fin}],
        'metrics': [
            {"expression": "ga:transactions"},
            #{"expression": ",ga:transactionsPerSession"}, #ecommerce conversion rate
            {"expression": "ga:transactionRevenue"},
            {"expression": "ga:revenuePerTransaction"}, # avg order value
            {"expression": "ga:itemQuantity"},
        ], "dimensions": [
            {'name':'ga:date'},
        ]
    }
    ]
}).execute()

trans = create_df(response)

In [18]:
trans.sample(5)

ga:date,ga:transactions,ga:transactionRevenue,ga:revenuePerTransaction,ga:itemQuantity
str,i64,f64,f64,i64
"""20230325""",1135,732437.61,645.31948,7093
"""20230204""",643,404482.83,629.055723,3503
"""20221001""",837,609012.75,727.613799,5095
"""20220209""",641,226036.87,352.631622,2322
"""20230323""",1492,956517.57,641.097567,8691


In [19]:
# drop ga: string from column names
trans.columns = [col.replace('ga:','') for col in trans.columns]
# convert column to date
trans = trans.with_columns(
    pl.col('date').str.strptime(pl.Date, format='%Y%m%d').cast(pl.Date)
)

In [21]:
# convert to pandas dataframe from polars
trans = trans.to_pandas()
# create line chart
alt.Chart(
    trans,
    width=800, 
    height=400,
    title='Website transactions over time',
).mark_line().encode(
    x = 'date',
    y = 'transactions',
    color = alt.value('#703642'),
    tooltip = ['date', 'transactions',]
).configure_title(fontSize=24)

## Geographic map and new users

In [22]:
response = service.reports().batchGet(body={
    'reportRequests': [{
        'viewId': view_id,
        'dateRanges': [{'startDate': inicio, 'endDate': fin}],
        'metrics': [
            {"expression": "ga:users"},
            {"expression": "ga:newUsers"},
        ], "dimensions": [
            {"name": "ga:country"},
            {"name": "ga:region"},
            {"name": "ga:city"},            
            {"name": "ga:longitude"},
            {"name": "ga:latitude"}
        ], "samplingLevel": "LARGE",
    }]
}).execute()

geo = create_df(response)
geo.sample(5)

ga:country,ga:region,ga:city,ga:longitude,ga:latitude,ga:users,ga:newUsers
str,str,str,str,str,i64,i64
"""Mexico""","""Puebla""","""764044""","""0.0000""","""0.0000""",26,10
"""Mexico""","""Guanajuato""","""San Jose Iturbide""","""-100.3841""","""21.0033""",23,3
"""Guatemala""","""(not set)""","""Mazatenango""","""-91.5063""","""14.5316""",1,0
"""Mexico""","""State of Mexico""","""Tejupilco de Hidalgo""","""-100.1507""","""18.9062""",57,21
"""Mexico""","""Nuevo Leon""","""Santiago""","""-100.1591""","""25.4166""",131,27


In [23]:
# drop ga: string from column names
geo.columns = [col.replace('ga:','') for col in geo.columns]
# convert column types to decimal
geo = geo.with_columns(
    pl.col(['longitude', 'latitude']).cast(pl.Float64)
# filter by non empty country column
).filter(
    (pl.col('country')!='(not set)')
    & (pl.col('region')!='(not set)')
    & (pl.col('city')!='(not set)')
)

In [238]:
#geo = geo.select(['longitude','latitude','users','newUsers']).to_pandas()
# Create chart in Datawrapper
map_ga = dw.create_chart(
    title = "Map of users from Google Analytics, 2022-2023",
    chart_type = 'd3-maps-symbols',
    data = geo
    )
# Publish in Datawrapper
dw.publish_chart(map_ga['id'], display=False)
# Get iframe code to embed e.g. in Notion
dw.get_iframe_code(map_ga['id'])
print('Done!')

Done!


In [24]:
%%html
<iframe src=https://datawrapper.dwcdn.net/PuO1p/1/ width="1200" height="620"></iframe>

## Contact

<!-- Avatar -->
<img src="../Pictures/profile2.png" alt="me" width="75" height="80">
<!-- Text with color, font, fontsize and specific size -->
<p style="color:#323232; font-family: Helevetica; font-size: 20px;">Jesus L. Monroy<br>Economist | Data Scientist</p>
<!-- Insert url links in logos -->
<!-- Telegram -->
<a href="https://t.me/j3suslm" target="_blank" rel="noreferrer"> <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/Telegram_X_2019_Logo.svg/2048px-Telegram_X_2019_Logo.png?size=16&color=3b3b3b" alt="telegram" width="30" height="22" style="padding-left:8px"/>
<!-- Twitter -->
<a href="https://www.twitter.com/sqlalchemist" target="_blank" rel="noreferrer"> <img src="https://toppng.com/public/uploads/preview/twitter-x-new-logo-round-icon-png-11692480241tdbz6jparr.webp?size=16&color=3b3b3b" alt="twitter" width="30" height="22" style="padding-left:8px"/>
<!-- Github -->
<a href="https://github.com/SqlAlchemist/My-portfolio" target="_blank" rel="noreferrer"> <img src="https://icongr.am/devicon/github-original.svg?size=16&color=3b3b3b" alt="github" width="30" height="30" style="padding-left:8px"/>
<!-- Linkedin -->
<a href="https://www.linkedin.com/in/j3sus-lmonroy" target="_blank" rel="noreferrer"> <img src="https://icongr.am/simple/linkedin.svg?size=16&color=3b3b3b" alt="linkedin" width="30" height="30" style="padding-left:8px"/>
<!-- Medium -->
<a href="https://medium.com/@jesus_lmonroy" target="_blank" rel="noreferrer"> <img src="https://cdn1.iconfinder.com/data/icons/social-media-and-logos-12/32/Logo_medium-512.png?size=55&color=3b3b3b" alt="medium" width="30" height="33" style="padding-left:8px"/>