In [7]:
import pandas as pd
from datetime import datetime as dt

### Store CSV into DataFrame

In [8]:
# Relative path to the raw gas-price CSV.
csv_file = "../Resources/gas_prices.csv"

# Parse the file into a DataFrame and preview its first five rows.
gas_prices_df = pd.read_csv(filepath_or_buffer=csv_file)
gas_prices_df.head(n=5)

Unnamed: 0,REF_DATE,GEO,DGUID,Type of fuel,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,1990-01,"Québec, Quebec",2011S0503421,Regular unleaded gasoline at self service fill...,Cents per litre,57,units,0,v735095,6.2,58.3,,,,1
1,1990-02,"Québec, Quebec",2011S0503421,Regular unleaded gasoline at self service fill...,Cents per litre,57,units,0,v735095,6.2,58.9,,,,1
2,1990-03,"Québec, Quebec",2011S0503421,Regular unleaded gasoline at self service fill...,Cents per litre,57,units,0,v735095,6.2,59.5,,,,1
3,1990-04,"Québec, Quebec",2011S0503421,Regular unleaded gasoline at self service fill...,Cents per litre,57,units,0,v735095,6.2,60.3,,,,1
4,1990-05,"Québec, Quebec",2011S0503421,Regular unleaded gasoline at self service fill...,Cents per litre,57,units,0,v735095,6.2,60.4,,,,1


### Create a new DataFrame with selected columns

In [14]:
# Keep only the date, location and value columns, as an independent copy so
# later column assignments do not touch gas_prices_df.
new_gas_prices_df = gas_prices_df.loc[:, ['REF_DATE', 'GEO', 'VALUE']].copy()
new_gas_prices_df.head(n=5)

Unnamed: 0,REF_DATE,GEO,VALUE
0,1990-01,"Québec, Quebec",58.3
1,1990-02,"Québec, Quebec",58.9
2,1990-03,"Québec, Quebec",59.5
3,1990-04,"Québec, Quebec",60.3
4,1990-05,"Québec, Quebec",60.4


In [15]:
# Print the column dtypes and non-null counts of the selected columns.
new_gas_prices_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3348 entries, 0 to 3347
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   REF_DATE  3348 non-null   object 
 1   GEO       3348 non-null   object 
 2   VALUE     3348 non-null   float64
dtypes: float64(1), object(2)
memory usage: 78.6+ KB


In [16]:
# Parse the REF_DATE strings into pandas datetimes, replacing the column in place.
new_gas_prices_df['REF_DATE'] = new_gas_prices_df['REF_DATE'].pipe(pd.to_datetime)

In [17]:
# Re-check the schema; REF_DATE is expected to report a datetime64 dtype now.
new_gas_prices_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3348 entries, 0 to 3347
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   REF_DATE  3348 non-null   datetime64[ns]
 1   GEO       3348 non-null   object        
 2   VALUE     3348 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 78.6+ KB


In [23]:
# Split GEO ("City, Province") at the first comma only, so any later commas
# stay inside the Province part. `n` is passed by keyword because pandas 2.0
# made every str.split argument after `pat` keyword-only (the positional form
# raises TypeError there). Each part is stripped so Province does not keep the
# space that follows the comma.
new_gas_prices_df[['City', 'Province']] = (
    new_gas_prices_df['GEO']
    .str.split(',', n=1, expand=True)
    .apply(lambda part: part.str.strip())
)

In [21]:
# Preview the first rows to check the City and Province columns.
new_gas_prices_df.head()

Unnamed: 0,REF_DATE,GEO,VALUE,City,Province
0,1990-01-01,"Québec, Quebec",58.3,Québec,Quebec
1,1990-02-01,"Québec, Quebec",58.9,Québec,Quebec
2,1990-03-01,"Québec, Quebec",59.5,Québec,Quebec
3,1990-04-01,"Québec, Quebec",60.3,Québec,Quebec
4,1990-05-01,"Québec, Quebec",60.4,Québec,Quebec


In [24]:
# Derive the calendar year of each observation via the Series `.dt` accessor,
# which reads it straight from the datetime64 REF_DATE column without building
# an intermediate DatetimeIndex. REF_DATE must already be datetime64 here.
new_gas_prices_df['Year'] = new_gas_prices_df['REF_DATE'].dt.year

In [26]:
# Display the frame (pandas elides the middle rows) to check the new Year column.
new_gas_prices_df

Unnamed: 0,REF_DATE,GEO,VALUE,City,Province,Year
0,1990-01-01,"Québec, Quebec",58.3,Québec,Quebec,1990
1,1990-02-01,"Québec, Quebec",58.9,Québec,Quebec,1990
2,1990-03-01,"Québec, Quebec",59.5,Québec,Quebec,1990
3,1990-04-01,"Québec, Quebec",60.3,Québec,Quebec,1990
4,1990-05-01,"Québec, Quebec",60.4,Québec,Quebec,1990
...,...,...,...,...,...,...
3343,2020-08-01,"Victoria, British Columbia",127.9,Victoria,British Columbia,2020
3344,2020-09-01,"Victoria, British Columbia",125.0,Victoria,British Columbia,2020
3345,2020-10-01,"Victoria, British Columbia",121.6,Victoria,British Columbia,2020
3346,2020-11-01,"Victoria, British Columbia",119.9,Victoria,British Columbia,2020


In [27]:
# Discard the columns that are no longer needed; VALUE, Province and Year remain.
drop_gas_prices_df = new_gas_prices_df.drop(columns=['REF_DATE', 'GEO', 'City'])

In [28]:
# Display the trimmed frame to confirm which columns are left.
drop_gas_prices_df

Unnamed: 0,VALUE,Province,Year
0,58.3,Quebec,1990
1,58.9,Quebec,1990
2,59.5,Quebec,1990
3,60.3,Quebec,1990
4,60.4,Quebec,1990
...,...,...,...
3343,127.9,British Columbia,2020
3344,125.0,British Columbia,2020
3345,121.6,British Columbia,2020
3346,119.9,British Columbia,2020


In [29]:
# Give the VALUE column the more descriptive name Price.
clean_gas_prices_df = drop_gas_prices_df.rename({'VALUE': 'Price'}, axis='columns')

In [31]:
# Confirm the column names and dtypes after the rename.
clean_gas_prices_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3348 entries, 0 to 3347
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Price     3348 non-null   float64
 1   Province  3348 non-null   object 
 2   Year      3348 non-null   int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 78.6+ KB


In [32]:
# Reorder the columns to Year, Province, Price.
column_names = ['Year', 'Province', 'Price']
clean_gas_prices_df = clean_gas_prices_df.reindex(column_names, axis='columns')

In [33]:
# Display the final frame with its columns in Year, Province, Price order.
clean_gas_prices_df

Unnamed: 0,Year,Province,Price
0,1990,Quebec,58.3
1,1990,Quebec,58.9
2,1990,Quebec,59.5
3,1990,Quebec,60.3
4,1990,Quebec,60.4
...,...,...,...
3343,2020,British Columbia,127.9
3344,2020,British Columbia,125.0
3345,2020,British Columbia,121.6
3346,2020,British Columbia,119.9


In [None]:
# TODO: Create visualizations of the cleaned gas price data (clean_gas_prices_df).