# Project SQL

## Pick a dataset that interests you (or multiple data sets)

Use the Open Data Sets available from Google BigQuery. You can use your own Google account or Kaggle.



# Imports

In [1]:
%%capture
# pandas module
import pandas as pd
import pandas_gbq
import matplotlib.pyplot as plt

# gcp modules
from google.cloud import bigquery

# colab modules
from google.colab import auth
from google.colab import syntax
from google.colab import userdata

# Authentication

In [2]:
auth.authenticate_user()
print('Authenticated')

Authenticated


# Enable Data Table Entry

In [3]:
%load_ext google.colab.data_table
# %unload_ext google.colab.data_table

# List Projects

In [4]:
%alias gcloud gcloud

In [5]:
%%capture
gcloud projects list --sort-by=projectId

In [6]:
project_id = userdata.get('proj-3')

# Questions for this dataset
1. How many rows are in the full dataset?
2. How many columns are in the full dataset?
3. What tables are available in the google_political_ads dataset?
4. What columns are available in the google_political_ads dataset?
5. Which advertisers have spent the most money on political ads?
6. How much have specific advertisers (Biden and Trump campaigns) spent on political ads?
7. What is the total spending of Biden and Trump campaigns combined?
8. What is the average spending of Biden and Trump campaigns?
9. How has spending changed over time for a specific advertiser?


### Independant Python Aliases

In [None]:
client = bigquery.Client(project=project_id)


In [7]:
THREE_GB = 3 * 1_000 * 1_000 * 1_000 * 1_000
SIX_GB = 6 * 1_000 * 1_000 * 1_000 * 1_000
TEN_GB = 10 * 1_000 * 1_000 * 1_000 * 1_000


### Basic Queries


In [63]:
row_count = client.query('''
  SELECT
    COUNT(1) as total
  FROM `bigquery-public-data.google_political_ads.creative_stats`
  '''
).to_dataframe()["total"][0]

print(f'Full dataset has {row_count:_} rows')

Full dataset has 1_814_735 rows


In [64]:
column_count = client.query('''
  SELECT
    COUNT(1) as total
  FROM `bigquery-public-data.google_political_ads.creative_stats`
   ''').to_dataframe()["total"][0]
print(f'Full dataset has {column_count:_} columns')

Full dataset has 1_814_735 columns


In [55]:
schema = client.query('''
  SELECT table_name
  FROM `bigquery-public-data`.google_political_ads.INFORMATION_SCHEMA.TABLES
''').to_dataframe()
schema

Unnamed: 0,table_name
0,geo_spend
1,advertiser_stats
2,advertiser_weekly_spend
3,advertiser_declared_stats
4,last_updated
5,advertiser_geo_spend
6,campaign_targeting
7,top_keywords_history
8,creative_stats


# advstats = advertiser_stats table

In [57]:
# what are the sum and average of spending by advertisers
advstats = client.query('''
SELECT
    as_.advertiser_id,
    as_.advertiser_name,
    as_.regions,
    SUM(cs.spend_range_min_usd) AS total_spend,
    AVG(cs.spend_range_min_usd) AS avg_spend
  FROM
    `bigquery-public-data.google_political_ads.advertiser_stats` AS as_
    INNER JOIN `bigquery-public-data.google_political_ads.creative_stats` AS cs ON as_.advertiser_id = cs.advertiser_id
  GROUP BY
    as_.advertiser_id, as_.advertiser_name, as_.regions
  ORDER BY total_spend DESC
''').to_dataframe()

In [58]:
advstats

Unnamed: 0,advertiser_id,advertiser_name,regions,total_spend,avg_spend
0,AR12365610929977556993,BIDEN FOR PRESIDENT,US,89154100,1416.763603
1,AR11825178974693097473,"DONALD J. TRUMP FOR PRESIDENT, INC.",US,76283700,2791.411739
2,AR09516581413973917697,MIKE BLOOMBERG 2020 INC,US,54533600,850.240883
3,AR10462168114010259457,HARRIS VICTORY FUND,US,54434300,26398.787585
4,AR07331491169769095169,TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE,US,42654800,992.479873
...,...,...,...,...,...
19799,AR08216568412034301953,ΟΥΡΑΝΙΑ ΘΡΑΣΚΙΑ,GR,0,0.000000
19800,AR17208005755274264577,OSVALDO PIMENTA DE ALMEIDA JUNIOR,BR,0,0.000000
19801,AR17744657933179813889,RUSS RYWELL CAMPAIGN,US,0,0.000000
19802,AR06908152804796268545,STATE OF RI/ DEPT OF STATE/ CORPORATION DIVISION,US,0,0.000000


# cstats =creative_stats table

In [31]:
# What data types are the rows in the table?
cstats = client.query('''
  SELECT column_name, data_type
  FROM `bigquery-public-data`.google_political_ads.INFORMATION_SCHEMA.COLUMNS
  WHERE table_name = 'creative_stats'
''').to_dataframe()

In [65]:
cstats

Unnamed: 0,column_name,data_type
0,ad_id,STRING
1,ad_url,STRING
2,ad_type,STRING
3,regions,STRING
4,advertiser_id,STRING
5,advertiser_name,STRING
6,ad_campaigns_list,STRING
7,date_range_start,DATE
8,date_range_end,DATE
9,num_of_days,INT64


#### SELECT (with * and with column names)
How much has each advertiser spent

#### LIMIT



### AND, FROM, WHERE, GROUP BY,


### AND


#### ORDER BY


#### GROUP BY



In [13]:
%%bigquery --project {project_id}
SELECT
    cs1.advertiser_name AS advertiser1,
    cs2.advertiser_name AS advertiser2,
    SUM(cs1.spend_range_min_usd) AS total_spend_1,
    SUM(cs2.spend_range_min_usd) AS total_spend_2
  FROM
    `bigquery-public-data.google_political_ads.creative_stats` AS cs1
    INNER JOIN `bigquery-public-data.google_political_ads.creative_stats` AS cs2 ON cs1.advertiser_name != cs2.advertiser_name
  WHERE cs1.advertiser_name IN ('BIDEN FOR PRESIDENT', 'HARRIS VICTORY FUND', 'HARRIS FOR PRESIDENT', 'BIDEN VICTORY FUND')
   AND cs2.advertiser_name IN ('DONALD J. TRUMP FOR PRESIDENT, INC', 'TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE')
  GROUP BY advertiser1, advertiser2
ORDER BY total_spend_1 DESC, total_spend_2 DESC

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,advertiser1,advertiser2,total_spend_1,total_spend_2
0,BIDEN FOR PRESIDENT,TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE,3831664909800,2684181254400
1,HARRIS VICTORY FUND,TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE,2339477345400,87954197600
2,HARRIS FOR PRESIDENT,TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE,1661735774400,496331252800
3,BIDEN VICTORY FUND,TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE,1313429169000,154495685600


#### SUM


In [61]:
%%bigquery --project {project_id}
SELECT
    SUM(CASE WHEN advertiser_name IN ('BIDEN FOR PRESIDENT', 'HARRIS VICTORY FUND', 'HARRIS FOR PRESIDENT', 'BIDEN VICTORY FUND') THEN spend_range_min_usd ELSE 0 END) AS total_spend1,
    SUM(CASE WHEN advertiser_name IN ('DONALD J. TRUMP FOR PRESIDENT, INC', 'TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE') THEN spend_range_min_usd ELSE 0 END) AS total_spend2
  FROM
    `bigquery-public-data.google_political_ads.creative_stats`;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,total_spend1,total_spend2
0,212813700,42654800


#### AVG


In [15]:
%%bigquery --project {project_id}
SELECT
    AVG(CASE WHEN advertiser_name IN ('BIDEN FOR PRESIDENT', 'HARRIS VICTORY FUND', 'HARRIS FOR PRESIDENT', 'BIDEN VICTORY FUND') THEN spend_range_min_usd ELSE 0 END) AS advertiser1_avg_spend,
    AVG(CASE WHEN advertiser_name IN ('DONALD J. TRUMP FOR PRESIDENT, INC', 'TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE') THEN spend_range_min_usd ELSE 0 END) AS advertiser2_avg_spend
  FROM
    `bigquery-public-data.google_political_ads.creative_stats`;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,advertiser1_avg_spend,advertiser2_avg_spend
0,117.269849,23.504699


#### OR


#### LIKE (with % or _ wildcard)


#### BETWEEN


### Sorting and Grouping


#### DISTINCT


### Aggregates


#### MAX


#### MIN


#### COUNT



## Make some plots

Make some cool plots to go with your data. Write SQL queries to get ONLY the information you need for each plot. (Don't pull ALL the data and then just plot a few columns.)



In [16]:
advertiser_name = 'BIDEN FOR PRESIDENT' #@param {type:"string"}

df = pd.DataFrame()
df = df[df["advertiser_name"] == advertiser_name]

# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(df['date'], df['daily_spend'])
plt.xlabel('Date')
plt.ylabel('Spending (USD)')
plt.title(f'Spending Over Time for {advertiser_name}')
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

KeyError: 'advertiser_name'

## EXTRA CREDIT:

#### Use a query that joins two tables.


#### Make a model to see if you can predict something


#### Come up with something else cool to do with your data
