# Project SQL

## Pick a dataset that interests you (or multiple data sets)

Use the Open Data Sets available from Google BigQuery. You can use your own Google account or Kaggle.



# Imports

In [None]:
%%capture
# pandas module
import pandas as pd
import pandas_gbq
import matplotlib.pyplot as plt

# gcp modules
from google.cloud import bigquery

# colab modules
from google.colab import auth
from google.colab import syntax
from google.colab import userdata

# Authentication

In [None]:
auth.authenticate_user()
print('Authenticated')

Authenticated


# Enable Data Table Entry

In [None]:
%load_ext google.colab.data_table
# %unload_ext google.colab.data_table

The google.colab.data_table extension is already loaded. To reload it, use:
  %reload_ext google.colab.data_table


# List Projects

In [None]:
%alias gcloud gcloud

In [None]:
%%capture
gcloud projects list --sort-by=projectId

In [None]:
project_id = userdata.get('proj-3')

## Come up with questions about your data
* What sort of information is in this dataset?
* How many records are there?
* Have the number of bitcoin transactions increased year over year?
* Does New Mexico get more or less rain now than 20 years ago?
* How many different countries (states, counties, cities, etc) have records in this data set?




## Use SQL queries to pull specific information

Do NOT pull all the data and then filter using DataFrame methods etc. Make sure and use AT LEAST 13 of the 15 SQL options listed below. (You may have to get creative and come up with more questions to ask/answer.)

Limitation Aliases

In [None]:
THREE_GB = 3 * 1_000 * 1_000 * 1_000 * 1_000
SIX_GB = 6 * 1_000 * 1_000 * 1_000 * 1_000
TEN_GB = 10 * 1_000 * 1_000 * 1_000 * 1_000


### Basic Queries


In [None]:
client = bigquery.Client(project=project_id)

row_count = client.query('''
  SELECT
    COUNT(1) as total
  FROM `bigquery-public-data.google_political_ads.creative_stats`
  '''
).to_dataframe()["total"][0]

print(f'Full dataset has {row_count:_} rows')

Full dataset has 1_812_920 rows


In [None]:
column_count = client.query('''
     SELECT
       count(*) as total
     FROM
       `bigquery-public-data`.google_political_ads.INFORMATION_SCHEMA.COLUMNS
     WHERE table_name = 'creative_stats'
   ''').to_dataframe()["total"][0]
print(f'Full dataset has {column_count:_} columns')

Full dataset has 56 columns


In [None]:
%%bigquery --project {project_id}
SELECT table_name
FROM `bigquery-public-data`.google_political_ads.INFORMATION_SCHEMA.TABLES

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,table_name
0,geo_spend
1,advertiser_stats
2,advertiser_weekly_spend
3,advertiser_declared_stats
4,last_updated
5,advertiser_geo_spend
6,campaign_targeting
7,top_keywords_history
8,creative_stats


In [None]:
%%bigquery --project {project_id}
SELECT table_name, column_name, data_type
FROM `bigquery-public-data`.google_political_ads.INFORMATION_SCHEMA.COLUMNS;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,table_name,column_name,data_type
0,geo_spend,country,STRING
1,geo_spend,country_subdivision_primary,STRING
2,geo_spend,country_subdivision_secondary,STRING
3,geo_spend,spend_usd,INT64
4,geo_spend,spend_eur,INT64
...,...,...,...
182,creative_stats,spend_range_max_ars,INT64
183,creative_stats,spend_range_min_zar,INT64
184,creative_stats,spend_range_max_zar,INT64
185,creative_stats,spend_range_min_clp,INT64


#### SELECT (with * and with column names)
How much has each advertiser spent

#### LIMIT



In [None]:
%%bigquery --project {project_id}
SELECT advertiser_name, SUM(spend_range_min_usd) AS total_spend
FROM `bigquery-public-data`.google_political_ads.creative_stats
GROUP BY advertiser_name
ORDER BY total_spend DESC
LIMIT 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,advertiser_name,total_spend
0,BIDEN FOR PRESIDENT,89154100
1,"DONALD J. TRUMP FOR PRESIDENT, INC.",76283700
2,MIKE BLOOMBERG 2020 INC,54533600
3,HARRIS VICTORY FUND,54272900
4,TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE,42654800
5,FF PAC,41066200
6,HARRIS FOR PRESIDENT,38047000
7,SENATE LEADERSHIP FUND,36529100
8,BIDEN VICTORY FUND,30560500
9,DSCC,27300700


### WHERE


### AND


#### ORDER BY


#### GROUP BY



In [None]:
%%bigquery --project {project_id}
SELECT
    cs1.advertiser_name AS advertiser1,
    cs2.advertiser_name AS advertiser2,
    SUM(cs1.spend_range_min_usd) AS total_spend_1,
    SUM(cs2.spend_range_min_usd) AS total_spend_2
  FROM
    `bigquery-public-data.google_political_ads.creative_stats` AS cs1
    INNER JOIN `bigquery-public-data.google_political_ads.creative_stats` AS cs2 ON cs1.advertiser_name != cs2.advertiser_name
  WHERE cs1.advertiser_name IN ('BIDEN FOR PRESIDENT', 'HARRIS VICTORY FUND', 'HARRIS FOR PRESIDENT', 'BIDEN VICTORY FUND')
   AND cs2.advertiser_name IN ('DONALD J. TRUMP FOR PRESIDENT, INC', 'TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE')
  GROUP BY advertiser1, advertiser2
ORDER BY total_spend_1 DESC, total_spend_2 DESC

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,advertiser1,advertiser2,total_spend_1,total_spend_2
0,BIDEN FOR PRESIDENT,TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE,3831664909800,2684181254400
1,HARRIS VICTORY FUND,TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE,2332540696200,87612959200
2,HARRIS FOR PRESIDENT,TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE,1635183966000,490231616400
3,BIDEN VICTORY FUND,TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE,1313429169000,154495685600


#### OR


#### LIKE (with % or _ wildcard)


#### BETWEEN


### Sorting and Grouping


#### DISTINCT


### Aggregates


#### MAX


#### MIN


#### SUM


In [None]:
%%bigquery --project {project_id}
SELECT
    SUM(CASE WHEN advertiser_name IN ('BIDEN FOR PRESIDENT', 'HARRIS VICTORY FUND', 'HARRIS FOR PRESIDENT', 'BIDEN VICTORY FUND') THEN spend_range_min_usd ELSE 0 END) AS advertiser1_total_spend,
    SUM(CASE WHEN advertiser_name IN ('DONALD J. TRUMP FOR PRESIDENT, INC', 'TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE') THEN spend_range_min_usd ELSE 0 END) AS advertiser2_total_spend
  FROM
    `bigquery-public-data.google_political_ads.creative_stats`;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,advertiser1_total_spend,advertiser2_total_spend
0,212034500,42654800


#### AVG


In [None]:
%%bigquery --project {project_id}
SELECT
    AVG(CASE WHEN advertiser_name IN ('BIDEN FOR PRESIDENT', 'HARRIS VICTORY FUND', 'HARRIS FOR PRESIDENT', 'BIDEN VICTORY FUND') THEN spend_range_min_usd ELSE 0 END) AS advertiser1_avg_spend,
    AVG(CASE WHEN advertiser_name IN ('DONALD J. TRUMP FOR PRESIDENT, INC', 'TRUMP MAKE AMERICA GREAT AGAIN COMMITTEE') THEN spend_range_min_usd ELSE 0 END) AS advertiser2_avg_spend
  FROM
    `bigquery-public-data.google_political_ads.creative_stats`;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,advertiser1_avg_spend,advertiser2_avg_spend
0,116.95745,23.528231


#### COUNT



## Make some plots

Make some cool plots to go with your data. Write SQL queries to get ONLY the information you need for each plot. (Don't pull ALL the data and then just plot a few columns.)



In [None]:
advertiser_name = 'BIDEN FOR PRESIDENT' #@param {type:"string"}

df = pd.DataFrame()
df = df[df["advertiser_name"] == advertiser_name]

# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(df['date'], df['daily_spend'])
plt.xlabel('Date')
plt.ylabel('Spending (USD)')
plt.title(f'Spending Over Time for {advertiser_name}')
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

KeyError: 'advertiser_name'

## EXTRA CREDIT:

#### Use a query that joins two tables.


#### Make a model to see if you can predict something


#### Come up with something else cool to do with your data
