In [13]:
# Cell 1: Setup
# The triple-quoted string below holds the notebook's Markdown-style
# introduction. It is a bare string expression inside a code cell, so it is
# evaluated and discarded rather than rendered as formatted text.
"""
# Data Exploration with BigQuery

This notebook helps you:
1. Explore available datasets and tables
2. Understand table schemas
3. Preview data samples
4. Identify data quality issues
"""

# Load connector
# Runs the helper notebook so that BigQueryConnector and the helper functions
# listed in the recorded output become available in this kernel.
# The path is relative, so the kernel's working directory must contain `utils/`.
%run utils/bigquery_connector.ipynb

# Initialize connector
# Per the recorded output, construction loads credentials from a local JSON key
# file and initializes a BigQuery client for the configured project.
# NOTE(review): the recorded output prints the absolute path of that key file;
# consider clearing the cell output before sharing the notebook.
bq = BigQueryConnector()

✓ BigQueryConnector class and helper functions loaded!
✓ Ready to use: bq = BigQueryConnector()
✓ Available helper functions: display_query_results, save_query_result, quick_data_profile
✓ Utility functions: setup_environment_check, create_sample_queries, format_sql_query

🎯 METHOD 2: Filter only custom methods
Custom methods in BigQueryConnector:
   1. __init__
   2. count_rows
   3. explore_table_by_index
   4. export_table_to_csv
   5. extract_table_data
   6. get_column_stats
   7. get_table_info
   8. get_table_schema
   9. get_tables
  10. list_datasets
  11. query_data
  12. quick_query
  13. search_tables
  14. select_table_by_index
  15. select_table_by_number
  16. test_connection
  17. validate_query
✓ Credentials loaded from: C:\Users\prada\Documents\Latihan\Project_Github\connect-to-gcs\config\credentials\asikngulik-analytics-cb7cf888258f.json
✓ BigQuery client initialized for project: asikngulik-analytics


In [None]:
# Cell 2: Explore datasets
# Requires `bq` from the setup cell.
# NOTE(review): this cell has no execution count or recorded output, so
# `datasets` was not populated in the saved session.
print("🗂️ Exploring available datasets...")
# Keep the connector's return value for later inspection.
# Presumably a list of dataset ids — TODO confirm the return type.
datasets = bq.list_datasets()

In [11]:
# Cell 3: Explore tables in current dataset
# Requires `bq` from the setup cell.
#
# `tables` is bound before the dataset check so that later cells which read it
# (for example a bare `tables` expression) never hit a NameError or a stale
# value from an earlier run when no dataset is set on the connector.
tables = []

if bq.dataset_id:
    print(f"📋 Exploring tables in dataset: {bq.dataset_id}")
    # Fall back to an empty list if the connector returns nothing.
    tables = bq.get_tables() or []

    # Print a numbered menu of the tables. This only lists them; it is not an
    # interactive selector. Numbering starts at 1, presumably to line up with
    # bq.select_table_by_number — TODO confirm.
    # NOTE(review): the recorded output shows get_tables() already printing the
    # same list, so the table names appear twice.
    if tables:
        print("\n🎯 Select a table to explore:")
        for i, table in enumerate(tables, 1):
            print(f"  {i}. {table}")
else:
    # Make the skipped case visible instead of silently doing nothing.
    print("⚠️ No dataset is set on the connector (bq.dataset_id is empty); skipping table listing.")


📋 Exploring tables in dataset: thelook
📊 Dataset: thelook
📋 Available tables (9):
   1. distribution_centers
   2. events
   3. inventory_items
   4. lookerstudio_report_distribution_centers
   5. lookerstudio_report_profit
   6. order_items
   7. orders
   8. products
   9. users

🎯 Select a table to explore:
  1. distribution_centers
  2. events
  3. inventory_items
  4. lookerstudio_report_distribution_centers
  5. lookerstudio_report_profit
  6. order_items
  7. orders
  8. products
  9. users


In [12]:
# Inspect one table in detail. Requires `bq` from the setup cell.
# Per the recorded output, this prints the row count, size, created/modified
# timestamps, the column schema and a 5-row sample. The value displayed as the
# cell result is a BigQuery Table reference.
bq.get_table_info('distribution_centers')


📊 TABLE INFORMATION: distribution_centers
📈 Total Rows: 10
💾 Size: 0.00 MB
📅 Created: 2025-06-06 15:20:17.384000+00:00
🔄 Modified: 2025-06-06 15:20:17.384000+00:00

📋 SCHEMA (5 columns):
No.  Column Name               Data Type       Mode       Description                   
------------------------------------------------------------------------------------------
1    id                        INTEGER         NULLABLE                                 
2    name                      STRING          NULLABLE                                 
3    longitude                 FLOAT           NULLABLE                                 
4    latitude                  FLOAT           NULLABLE                                 
5    distribution_center_geom  GEOGRAPHY       NULLABLE                                 

🔍 SAMPLE DATA (first 5 rows):
🔄 Executing query...
✓ Query executed successfully
📊 Rows returned: 5
📋 Columns: 5
💾 Memory usage: 0.00 MB


Unnamed: 0,id,name,longitude,latitude,distribution_center_geom
0,1,Memphis TN,-89.9711,35.1174,POINT(-89.9711 35.1174)
1,5,New Orleans LA,-90.0667,29.95,POINT(-90.0667 29.95)
2,6,Port Authority of New York/New Jersey NY/NJ,-73.7834,40.634,POINT(-73.7834 40.634)
3,8,Mobile AL,-88.0431,30.6944,POINT(-88.0431 30.6944)
4,10,Savannah GA,-81.1167,32.0167,POINT(-81.1167 32.0167)


Table(TableReference(DatasetReference('asikngulik-analytics', 'thelook'), 'distribution_centers'))

In [14]:
# Display the table names collected by the "Explore tables" cell.
# Depends on that cell having run with a dataset set on the connector;
# otherwise `tables` is not defined.
tables

['distribution_centers',
 'events',
 'inventory_items',
 'lookerstudio_report_distribution_centers',
 'lookerstudio_report_profit',
 'order_items',
 'orders',
 'products',
 'users']

In [17]:
# Pull the `events` table into a DataFrame. Requires `bq` from the setup cell.
# NOTE(review): in the recorded run this fetched the whole table (2,434,263
# rows, 1373.43 MB reported by the connector). If extract_table_data supports
# a row limit or column selection, consider using it for exploration —
# TODO confirm the helper's signature.
df_event = bq.extract_table_data('events')
# Preview the first rows as the cell's rich output.
df_event.head()

🔄 Extracting data from table: events
🔄 Executing query...
✓ Query executed successfully
📊 Rows returned: 2,434,263
📋 Columns: 13
💾 Memory usage: 1373.43 MB
✅ Successfully extracted 2,434,263 rows from events


Unnamed: 0,id,user_id,sequence_number,session_id,created_at,ip_address,city,state,postal_code,browser,traffic_source,uri,event_type
0,242523,18564.0,6,24cf771e-aa76-4d04-865c-eabc42cc86cf,2022-03-02 08:55:17+00:00,78.26.63.34,Bogatynia,Dolnośląskie,59,Safari,Adwords,/cart,cart
1,988394,75592.0,6,71c6b58f-bcae-4fd4-a25d-2bc291de7e10,2019-11-19 04:59:43+00:00,193.53.213.162,Bogatynia,Dolnośląskie,59,Safari,Email,/cart,cart
2,1470472,,3,da64b684-7913-467c-af4a-86008c319663,2021-06-30 05:26:00+00:00,174.248.233.220,Bogatynia,Dolnośląskie,59,Chrome,Email,/cart,cart
3,852269,65210.0,6,f11ccafd-e17e-4b32-991a-cceb9ed43f3a,2021-02-09 10:32:10+00:00,116.64.203.47,Bogatynia,Dolnośląskie,59,Firefox,Email,/cart,cart
4,1520965,,3,a3ab1265-d29f-428f-84da-92dbd773005b,2020-10-07 07:57:00+00:00,74.46.42.136,Bogatynia,Dolnośląskie,59,Safari,Email,/cart,cart


In [18]:
# Summarize column dtypes and memory usage of the extracted events frame.
# Requires `df_event` from the extraction cell.
# NOTE(review): the recorded figure ends in "+", which pandas uses when object
# columns are not measured deeply. That presumably explains the gap from the
# 1373.43 MB logged during extraction.
df_event.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2434263 entries, 0 to 2434262
Data columns (total 13 columns):
 #   Column           Dtype              
---  ------           -----              
 0   id               Int64              
 1   user_id          Int64              
 2   sequence_number  Int64              
 3   session_id       object             
 4   created_at       datetime64[us, UTC]
 5   ip_address       object             
 6   city             object             
 7   state            object             
 8   postal_code      object             
 9   browser          object             
 10  traffic_source   object             
 11  uri              object             
 12  event_type       object             
dtypes: Int64(3), datetime64[us, UTC](1), object(9)
memory usage: 248.4+ MB


In [21]:
# Ten most frequent cities in the events data, by row count.
# Requires `df_event` from the extraction cell.
# NOTE(review): the recorded output contains a "null" bucket, which suggests
# missing cities are stored as the literal string "null" — worth confirming
# before treating it as a real city.
(
    df_event
    .groupby('city')['city']
    .count()
    .sort_values(ascending=False)
    .iloc[:10]
)

city
Shanghai    61958
Beijing     51099
Seoul       35050
Shenzhen    30641
null        23594
Dongguan    22029
New York    20796
Tokyo       20762
Chengdu     20207
Foshan      19162
Name: city, dtype: int64