In [1]:
from google.cloud import bigquery
# Client used by every query in this notebook; no project or credentials are
# passed explicitly, so they come from the environment.
bq = bigquery.Client()

# Sanity check: count the rows of the public US counties table.
# The table path is backtick-quoted because the project ID contains hyphens,
# which also keeps this query consistent with the others in the notebook.
q = bq.query(
"""
SELECT COUNT(*) AS num_rows
FROM `bigquery-public-data.geo_us_boundaries.counties`
""")
q.to_dataframe()

Unnamed: 0,num_rows
0,3233


In [2]:
# Fetch the geo_id of the counties named "Dane" (at most 10 rows); the
# resulting frame is consumed by the q1 cell.
dane_sql = """
SELECT counties.geo_id
FROM `bigquery-public-data.geo_us_boundaries.counties` AS counties
WHERE counties.county_name = "Dane"
LIMIT 10;
"""
q = bq.query(dane_sql)
df = q.to_dataframe()


In [3]:
#q1
# Answer: the geo_id from the first row of the "Dane" query, as a string.
geo_id = df['geo_id'].iat[0]
ans = str(geo_id)
ans

'55025'

In [4]:
# Count counties per state FIPS code and keep the five largest counts.
top_states_sql = """
SELECT state_fips_code, COUNT(*) AS county_nums
FROM `bigquery-public-data.geo_us_boundaries.counties`
GROUP BY state_fips_code
ORDER BY county_nums DESC
LIMIT 5;
"""
q = bq.query(top_states_sql)
df = q.to_dataframe()

In [5]:
#q2
# Answer: mapping of state FIPS code -> number of counties.
county_counts = df.set_index('state_fips_code')['county_nums']
county_counts.to_dict()

{'48': 254, '13': 159, '51': 133, '21': 120, '29': 115}

In [6]:
#q3
# Estimate what the q1 and q2 queries cost. The query cache is disabled so
# that each run is actually billed instead of being served from cache.
config = bigquery.QueryJobConfig(use_query_cache=False)

# Price applied per 1024**4 bytes billed (assumed to be the on-demand rate
# in USD -- confirm against current pricing).
price_per_tb = 6.25

q = bq.query(
"""
SELECT counties.geo_id
FROM `bigquery-public-data.geo_us_boundaries.counties` AS counties
WHERE counties.county_name = "Dane"
LIMIT 10;
"""
, job_config=config)
# bq.query() returns as soon as the job is submitted; total_bytes_billed is
# None until the job statistics are available, so wait for completion first.
q.result()

cost_q1 = (q.total_bytes_billed / 1024**4) * price_per_tb

q = bq.query(
"""
SELECT state_fips_code, COUNT(*) AS county_nums
FROM `bigquery-public-data.geo_us_boundaries.counties`
GROUP BY state_fips_code
ORDER BY county_nums DESC
LIMIT 5;
"""
, job_config=config)
# Same as above: block until the job has finished before reading statistics.
q.result()

cost_q2 = (q.total_bytes_billed / 1024**4) * price_per_tb

costs = {
    'q1': cost_q1,
    'q2': cost_q2
}

costs

{'q1': 5.9604644775390625e-05, 'q2': 5.9604644775390625e-05}

In [7]:
# Create the project's "p8" dataset; exists_ok=True makes the cell safe to
# re-run when the dataset is already there.
dataset_path = "cs544-399214.p8"
ds = bigquery.Dataset(dataset_path)
bq.create_dataset(ds, exists_ok=True)

Dataset(DatasetReference('cs544-399214', 'p8'))

In [8]:
# Load the HDMA parquet file from Cloud Storage into the p8.hdma table,
# replacing any rows already in the table (WRITE_TRUNCATE).
source = "gs://haucheng_rob-p8/hdma-wi-2021.parquet"
config = bigquery.LoadJobConfig(
    source_format="PARQUET",
    write_disposition="WRITE_TRUNCATE",
)
job = bq.load_table_from_uri(
    source,
    "cs544-399214.p8.hdma",
    job_config=config,
)
# Block until the load job has finished.
job.result()

LoadJob<project=cs544-399214, location=US, id=cdb2fc37-66d2-41fd-8c5b-af8b0a4fb36f>

In [9]:
#q4
# Answer: IDs of all datasets in the project.
[dataset.dataset_id for dataset in bq.list_datasets("cs544-399214")]

['p8']

In [10]:
# Join the loaded HDMA rows to the public counties table on the county FIPS
# code and keep the ten county names with the most rows.
loans_per_county_sql = """
SELECT c.county_name, COUNT(*) as nums
FROM `cs544-399214.p8.hdma` h
JOIN `bigquery-public-data.geo_us_boundaries.counties` c ON h.county_code = c.county_fips_code
GROUP BY c.county_name
ORDER BY nums DESC
LIMIT 10;
"""
q = bq.query(loans_per_county_sql)
df = q.to_dataframe()

In [11]:
#q5
# Answer: mapping of county name -> number of HDMA rows.
rows_per_county = df.set_index('county_name')['nums']
rows_per_county.to_dict()

{'Milwaukee': 46570,
 'Dane': 38557,
 'Waukesha': 34159,
 'Brown': 15615,
 'Racine': 13007,
 'Outagamie': 11523,
 'Kenosha': 10744,
 'Washington': 10726,
 'Rock': 9834,
 'Winnebago': 9310}

In [12]:
# Register the Google Sheet as an external table (p8.applications), so that
# queries read the sheet itself rather than a loaded copy.
url = "https://docs.google.com/spreadsheets/d/11UeIBqQylAyNUBsIO54p6WiYJWHayQMfHDbUWq1jGco/"

external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")
external_config.autodetect = True  # let BigQuery infer the schema
external_config.options.skip_leading_rows = 1  # skip the first row (presumably the header)
external_config.source_uris = [url]

table = bigquery.Table("cs544-399214.p8.applications")
table.external_data_configuration = external_config

# exists_ok=True keeps the cell re-runnable once the table has been created.
table = bq.create_table(table, exists_ok=True)

In [13]:
# Count the applications whose income is exactly 100000 (i.e. 100K).
income_sql = """
SELECT COUNT(*) AS nums
FROM `cs544-399214.p8.applications`
WHERE income = 100000; 
"""
q = bq.query(income_sql)

df = q.to_dataframe()

In [14]:
#q6
# Answer: the single count returned by the income query.
ans = df['nums'].iat[0]
ans

19

In [15]:
# Turn each application's longitude/latitude into a geography point, match it
# to the county polygon that contains it, and count applications per county
# for state FIPS code '55'.
applications_geo_sql = """
WITH Applications AS (
  SELECT ST_GEOGPOINT(longitude, latitude) as point
  FROM `cs544-399214.p8.applications`
)
SELECT c.county_name, COUNT(*) as application_count
FROM Applications a
JOIN `bigquery-public-data.geo_us_boundaries.counties` c
ON ST_CONTAINS(c.county_geom, a.point)
WHERE c.state_fips_code = '55'
GROUP BY c.county_name;
"""
q = bq.query(applications_geo_sql)
df = q.to_dataframe()

In [16]:
#q7
# Answer: mapping of county name -> number of applications located in it.
applications_per_county = df.set_index('county_name')['application_count']
applications_per_county.to_dict()

{'Outagamie': 1,
 'Dane': 47,
 'Milwaukee': 3,
 'Bayfield': 4,
 'Door': 15,
 'Columbia': 4,
 'Crawford': 1,
 'Walworth': 2,
 'Ashland': 1,
 'Manitowoc': 1,
 'Sheboygan': 3,
 'Brown': 6,
 'Marinette': 2,
 'Oneida': 3,
 'Douglas': 1,
 'Winnebago': 2,
 'Wood': 1,
 'Dodge': 1,
 'Marathon': 1,
 'Polk': 1,
 'Eau Claire': 1,
 'Monroe': 2,
 'Barron': 2,
 'Kewaunee': 1,
 'Ozaukee': 1,
 'Jefferson': 1,
 'Green Lake': 2,
 'Adams': 1,
 'Sauk': 1}

In [17]:
# Load the BigQuery IPython extension so the %%bigquery cells below work.
%load_ext google.cloud.bigquery

In [18]:
%%bigquery
-- Train (or replace) a linear regression model that predicts loan_amount
-- from income and loan_term, using the rows of the p8.hdma table.
CREATE OR REPLACE MODEL `cs544-399214.p8.p8_model`
OPTIONS(model_type='LINEAR_REG', INPUT_LABEL_COLS=["loan_amount"]) AS
SELECT
  loan_amount,
  income,
  loan_term
FROM
  `cs544-399214.p8.hdma`;

Query is running:   0%|          |

In [19]:
import time

# Poll the dataset every 5 seconds until the trained model is listed.
# A deadline bounds the wait so that a failed or missing training job raises
# a clear error instead of hanging the notebook forever.
deadline = time.monotonic() + 600  # give up after 10 minutes
while True:
    models = list(bq.list_models("cs544-399214.p8"))
    model_names = [model.model_id for model in models]
    if 'p8_model' in model_names:
        break
    if time.monotonic() >= deadline:
        raise TimeoutError(
            "model 'p8_model' did not appear in cs544-399214.p8 within 600 seconds"
        )
    time.sleep(5)

In [20]:
%%bigquery df
-- Evaluate the model against the same p8.hdma columns it was trained on;
-- the metrics are stored in the Python variable `df`.
SELECT *
FROM ML.EVALUATE(MODEL `cs544-399214.p8.p8_model`, (
    SELECT loan_amount, income, loan_term
    FROM `cs544-399214.p8.hdma`
))

Query is running:   0%|          |

Downloading:   0%|          |

In [21]:
#q8
# Answer: the r2_score from the first row of the evaluation metrics.
df["r2_score"].iat[0]

0.2916534692239263

In [22]:
%%bigquery df
-- Fetch the model's learned weights, one row per input plus the intercept;
-- the result is stored in the Python variable `df`.
SELECT *
FROM ML.WEIGHTS(MODEL `cs544-399214.p8.p8_model`)

Query is running:   0%|          |

Downloading:   0%|          |

In [23]:
# Display the weights table fetched by the ML.WEIGHTS cell.
df

Unnamed: 0,processed_input,weight,category_weights
0,income,0.804631,[]
1,loan_term,380.225284,[]
2,__INTERCEPT__,-579.734123,[]


In [24]:
#q9
# Answer: the weight of the `income` input. The row is selected by name
# rather than by position, because the ML.WEIGHTS query has no ORDER BY and
# so the row order of its result is not guaranteed.
df.loc[df["processed_input"] == "income", "weight"].iloc[0]

0.80463102539899

In [25]:
%%bigquery df
-- Predict a loan amount for every row of p8.applications, with loan_term
-- fixed at 360 for all rows, then compute the fraction of rows whose actual
-- loan_amount is greater than the prediction. The result is stored in the
-- Python variable `df`.
WITH Pred AS (
SELECT *
FROM ML.PREDICT(MODEL `cs544-399214.p8.p8_model`, (
    SELECT loan_amount, income, 360 AS loan_term 
    FROM `cs544-399214.p8.applications`
))
)
SELECT
  COUNTIF(loan_amount > predicted_loan_amount) / COUNT(*) as ratio
FROM
  Pred;

Query is running:   0%|          |

Downloading:   0%|          |

In [26]:
#q10
# Answer: the share of applications whose loan amount exceeds the prediction.
df["ratio"].iat[0]

0.6141304347826086