In [1]:
#Import Dependencies 
import pandas as pd
from datetime import datetime as dt

### Store CSV into DataFrame
### Extract Gas Prices Data

In [2]:
csv_file = "Resources/gas_prices.csv"
orig_gas_data_df = pd.read_csv(csv_file)
orig_gas_data_df.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Type of fuel,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,1990-01,"Québec, Quebec",2011S0503421,Regular unleaded gasoline at self service fill...,Cents per litre,57,units,0,v735095,6.2,58.3,,,,1
1,1990-02,"Québec, Quebec",2011S0503421,Regular unleaded gasoline at self service fill...,Cents per litre,57,units,0,v735095,6.2,58.9,,,,1
2,1990-03,"Québec, Quebec",2011S0503421,Regular unleaded gasoline at self service fill...,Cents per litre,57,units,0,v735095,6.2,59.5,,,,1
3,1990-04,"Québec, Quebec",2011S0503421,Regular unleaded gasoline at self service fill...,Cents per litre,57,units,0,v735095,6.2,60.3,,,,1
4,1990-05,"Québec, Quebec",2011S0503421,Regular unleaded gasoline at self service fill...,Cents per litre,57,units,0,v735095,6.2,60.4,,,,1


### Create new data with select columns

In [3]:
refined1_gas_data_df = orig_gas_data_df[['REF_DATE', 'GEO', 'VALUE']].copy()
refined1_gas_data_df.head()

Unnamed: 0,REF_DATE,GEO,VALUE
0,1990-01,"Québec, Quebec",58.3
1,1990-02,"Québec, Quebec",58.9
2,1990-03,"Québec, Quebec",59.5
3,1990-04,"Québec, Quebec",60.3
4,1990-05,"Québec, Quebec",60.4


### Clean DataFrame

In [4]:
refined1_gas_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3348 entries, 0 to 3347
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   REF_DATE  3348 non-null   object 
 1   GEO       3348 non-null   object 
 2   VALUE     3348 non-null   float64
dtypes: float64(1), object(2)
memory usage: 78.6+ KB


In [5]:
#Converting REF_DATE to datetime
refined1_gas_data_df['REF_DATE'] = pd.to_datetime(refined1_gas_data_df['REF_DATE'])

In [6]:
#Check result
refined1_gas_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3348 entries, 0 to 3347
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   REF_DATE  3348 non-null   datetime64[ns]
 1   GEO       3348 non-null   object        
 2   VALUE     3348 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 78.6+ KB


In [7]:
#Getting province column by splitting GEO column
refined1_gas_data_df[['City','Province']]=refined1_gas_data_df['GEO'].str.split(', ', 1,expand=True)

In [8]:
#Check result
refined1_gas_data_df.head()

Unnamed: 0,REF_DATE,GEO,VALUE,City,Province
0,1990-01-01,"Québec, Quebec",58.3,Québec,Quebec
1,1990-02-01,"Québec, Quebec",58.9,Québec,Quebec
2,1990-03-01,"Québec, Quebec",59.5,Québec,Quebec
3,1990-04-01,"Québec, Quebec",60.3,Québec,Quebec
4,1990-05-01,"Québec, Quebec",60.4,Québec,Quebec


In [9]:
#Getting 'Year' value from REF_DATE
refined1_gas_data_df['Year'] = pd.DatetimeIndex(refined1_gas_data_df['REF_DATE']).year


In [10]:
#Check result
refined1_gas_data_df.head()

Unnamed: 0,REF_DATE,GEO,VALUE,City,Province,Year
0,1990-01-01,"Québec, Quebec",58.3,Québec,Quebec,1990
1,1990-02-01,"Québec, Quebec",58.9,Québec,Quebec,1990
2,1990-03-01,"Québec, Quebec",59.5,Québec,Quebec,1990
3,1990-04-01,"Québec, Quebec",60.3,Québec,Quebec,1990
4,1990-05-01,"Québec, Quebec",60.4,Québec,Quebec,1990


In [11]:
#Dropping unnecessary columns
refined2_gas_data_df = refined1_gas_data_df.drop(labels=['REF_DATE','GEO','City'], axis=1)


In [12]:
#Check result
refined2_gas_data_df.head()

Unnamed: 0,VALUE,Province,Year
0,58.3,Quebec,1990
1,58.9,Quebec,1990
2,59.5,Quebec,1990
3,60.3,Quebec,1990
4,60.4,Quebec,1990


In [13]:
#Change VALUE column name to Price
renamed_col_gas_data_df=refined2_gas_data_df.rename(columns={'VALUE':'Price'})


In [14]:
#Check result
renamed_col_gas_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3348 entries, 0 to 3347
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Price     3348 non-null   float64
 1   Province  3348 non-null   object 
 2   Year      3348 non-null   int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 78.6+ KB


In [15]:
#Change order of columns
column_names = ['Year','Province','Price']
gas_prices_df=renamed_col_gas_data_df.reindex(columns=column_names)

In [16]:
#Check result
gas_prices_df.head()

Unnamed: 0,Year,Province,Price
0,1990,Quebec,58.3
1,1990,Quebec,58.9
2,1990,Quebec,59.5
3,1990,Quebec,60.3
4,1990,Quebec,60.4


In [17]:
#Removing ambiguous data
clean_gas_prices_df = gas_prices_df[~gas_prices_df.Province.str.contains("Ontario part, Ontario/Quebec")]


In [18]:
#Check result
clean_gas_prices_df.head()

Unnamed: 0,Year,Province,Price
0,1990,Quebec,58.3
1,1990,Quebec,58.9
2,1990,Quebec,59.5
3,1990,Quebec,60.3
4,1990,Quebec,60.4


In [19]:
#Group data by Year and calculate average price per year
groupby_year_data_df = clean_gas_prices_df[['Year','Province','Price']].groupby(['Year','Province'], as_index=False).agg('mean')


In [20]:
#Check result
groupby_year_data_df.tail()

Unnamed: 0,Year,Province,Price
119,2019,Quebec,122.504167
120,2020,Alberta,92.125
121,2020,British Columbia,124.483333
122,2020,Ontario,106.291667
123,2020,Quebec,104.908333


In [21]:
print(len(groupby_year_data_df))

124


### Load Clean Gas Prices Data Locally

In [22]:
#Export file as a CSV, without the Pandas index, but with the header
groupby_year_data_df.to_csv("Data/clean_gas_prices.csv", index=False, header=True)

### Extract Gas Emissions Data

In [23]:
csv2_file = "Resources/gas_emissions.csv"
orig_em_data_df = pd.read_csv(csv2_file)
orig_em_data_df.head()

Unnamed: 0,Entity,Code,Year,Annual CO2 emissions
0,Afghanistan,AFG,1949,14656
1,Afghanistan,AFG,1950,84272
2,Afghanistan,AFG,1951,91600
3,Afghanistan,AFG,1952,91600
4,Afghanistan,AFG,1953,106256


In [24]:
orig_em_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23949 entries, 0 to 23948
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Entity                23949 non-null  object
 1   Code                  21299 non-null  object
 2   Year                  23949 non-null  int64 
 3   Annual CO2 emissions  23949 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 748.5+ KB


In [25]:
#Filter for Canada and years from 1990 and above
canada_em_data_df = orig_em_data_df.loc[(orig_em_data_df["Entity"]=="Canada") & (orig_em_data_df["Year"] >= 1990)]
canada_em_data_df.head()


Unnamed: 0,Entity,Code,Year,Annual CO2 emissions
4065,Canada,CAN,1990,458007408
4066,Canada,CAN,1991,449699173
4067,Canada,CAN,1992,463521037
4068,Canada,CAN,1993,463993303
4069,Canada,CAN,1994,478267071


In [26]:
#Reset the index
canada_em_data_df.reset_index(inplace=True,drop=True)
canada_em_data_df.head()

Unnamed: 0,Entity,Code,Year,Annual CO2 emissions
0,Canada,CAN,1990,458007408
1,Canada,CAN,1991,449699173
2,Canada,CAN,1992,463521037
3,Canada,CAN,1993,463993303
4,Canada,CAN,1994,478267071


In [27]:
#Check result
canada_em_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Entity                31 non-null     object
 1   Code                  31 non-null     object
 2   Year                  31 non-null     int64 
 3   Annual CO2 emissions  31 non-null     int64 
dtypes: int64(2), object(2)
memory usage: 1.1+ KB


In [28]:
#Dropped unnecessary columns
drop_col_em_data_df = canada_em_data_df.drop(labels=["Code"], axis=1)

In [29]:
#Check result
drop_col_em_data_df.info

<bound method DataFrame.info of     Entity  Year  Annual CO2 emissions
0   Canada  1990             458007408
1   Canada  1991             449699173
2   Canada  1992             463521037
3   Canada  1993             463993303
4   Canada  1994             478267071
5   Canada  1995             490951380
6   Canada  1996             506999620
7   Canada  1997             521278446
8   Canada  1998             528987104
9   Canada  1999             543729999
10  Canada  2000             566552294
11  Canada  2001             559042165
12  Canada  2002             564669174
13  Canada  2003             582343586
14  Canada  2004             580758769
15  Canada  2005             575852411
16  Canada  2006             571409763
17  Canada  2007             594665366
18  Canada  2008             579367139
19  Canada  2009             546610441
20  Canada  2010             558804007
21  Canada  2011             569870198
22  Canada  2012             569262731
23  Canada  2013             573

In [30]:
#Renamed columns
renamed_canada_em_data_df=drop_col_em_data_df.rename(columns={"Annual CO2 emissions":"CO2","Entity":"Country"})
renamed_canada_em_data_df

Unnamed: 0,Country,Year,CO2
0,Canada,1990,458007408
1,Canada,1991,449699173
2,Canada,1992,463521037
3,Canada,1993,463993303
4,Canada,1994,478267071
5,Canada,1995,490951380
6,Canada,1996,506999620
7,Canada,1997,521278446
8,Canada,1998,528987104
9,Canada,1999,543729999


In [31]:
#Convert CO2 values from Integer to Float
renamed_canada_em_data_df = renamed_canada_em_data_df.astype({"CO2": float})
renamed_canada_em_data_df

Unnamed: 0,Country,Year,CO2
0,Canada,1990,458007408.0
1,Canada,1991,449699173.0
2,Canada,1992,463521037.0
3,Canada,1993,463993303.0
4,Canada,1994,478267071.0
5,Canada,1995,490951380.0
6,Canada,1996,506999620.0
7,Canada,1997,521278446.0
8,Canada,1998,528987104.0
9,Canada,1999,543729999.0


In [32]:
#CO2 column values converted from Billion Tonnes to Mega Tonnes
renamed_canada_em_data_df["CO2"]=renamed_canada_em_data_df["CO2"].div(1000000)
renamed_canada_em_data_df

Unnamed: 0,Country,Year,CO2
0,Canada,1990,458.007408
1,Canada,1991,449.699173
2,Canada,1992,463.521037
3,Canada,1993,463.993303
4,Canada,1994,478.267071
5,Canada,1995,490.95138
6,Canada,1996,506.99962
7,Canada,1997,521.278446
8,Canada,1998,528.987104
9,Canada,1999,543.729999


### Load Gas Emissions Data Locally

In [33]:
#Export file as a CSV, without the Pandas index, but with the header
renamed_canada_em_data_df.to_csv("Data/clean_emission_data.csv", index=False, header=True)

In [34]:
#Checking the length of each dataframe
print(len(renamed_canada_em_data_df))
print(len(groupby_year_data_df))

31
124
