In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
emission_annual = pd.read_csv('../data/Source of Data - eia.gov/emission_annual.csv')

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
emission_annual.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43258 entries, 0 to 43257
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Year               43258 non-null  int64 
 1   State              43258 non-null  object
 2   Producer Type      43258 non-null  object
 3   Energy Source      43258 non-null  object
 4   CO2
(Metric Tons)  43258 non-null  object
 5   SO2
(Metric Tons)  43258 non-null  object
 6   NOx
(Metric Tons)  43258 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.3+ MB


In [5]:
emission_annual.head(2)

Unnamed: 0,Year,State,Producer Type,Energy Source,CO2\n(Metric Tons),SO2\n(Metric Tons),NOx\n(Metric Tons)
0,1990,AK,Commercial Cogen,All Sources,824004,13198,3011
1,1990,AK,Commercial Cogen,Coal,821929,13191,3009


## Need to convert final 3 columns to integer; in order to do so, must remove comma first
##### creating new df for each column, then concatenating the new df back into the original; once done, delete the three original columns, then remove comma and change to integer

In [6]:
test = emission_annual['CO2\n(Metric Tons)'].str.replace(',','')

In [7]:
test = pd.DataFrame(test)
test

Unnamed: 0,CO2\n(Metric Tons)
0,824004
1,821929
2,2075
3,0
4,0
...,...
43253,38255055
43254,908889
43255,0
43256,0


In [8]:
test2 = emission_annual['SO2\n(Metric Tons)'].str.replace(',','')

In [9]:
test2 = pd.DataFrame(test2)
test2

Unnamed: 0,SO2\n(Metric Tons)
0,13198
1,13191
2,6
3,149
4,149
...,...
43253,26330
43254,4
43255,6
43256,0


In [10]:
test3 = emission_annual['NOx\n(Metric Tons)'].str.replace(',','')

In [11]:
test3 = pd.DataFrame(test3)
test3

Unnamed: 0,NOx\n(Metric Tons)
0,3011
1,3009
2,2
3,42
4,42
...,...
43253,27706
43254,1875
43255,2539
43256,42


In [12]:
cols = [4,5,6]
emission_annual.drop(emission_annual.columns[cols], axis=1, inplace=True)

In [13]:
emission_annual.head(2)

Unnamed: 0,Year,State,Producer Type,Energy Source
0,1990,AK,Commercial Cogen,All Sources
1,1990,AK,Commercial Cogen,Coal


In [14]:
emission_annual = pd.concat([emission_annual,test],axis = 1)

In [15]:
emission_annual.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43258 entries, 0 to 43257
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Year               43258 non-null  int64 
 1   State              43258 non-null  object
 2   Producer Type      43258 non-null  object
 3   Energy Source      43258 non-null  object
 4   CO2
(Metric Tons)  43258 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


In [16]:
emission_annual = pd.concat([emission_annual,test2],axis = 1)

In [17]:
emission_annual.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43258 entries, 0 to 43257
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Year               43258 non-null  int64 
 1   State              43258 non-null  object
 2   Producer Type      43258 non-null  object
 3   Energy Source      43258 non-null  object
 4   CO2
(Metric Tons)  43258 non-null  object
 5   SO2
(Metric Tons)  43258 non-null  object
dtypes: int64(1), object(5)
memory usage: 2.0+ MB


In [18]:
emission_annual = pd.concat([emission_annual,test3],axis = 1)

In [19]:
emission_annual.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43258 entries, 0 to 43257
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Year               43258 non-null  int64 
 1   State              43258 non-null  object
 2   Producer Type      43258 non-null  object
 3   Energy Source      43258 non-null  object
 4   CO2
(Metric Tons)  43258 non-null  object
 5   SO2
(Metric Tons)  43258 non-null  object
 6   NOx
(Metric Tons)  43258 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.3+ MB


In [20]:
emission_annual.head(2)

Unnamed: 0,Year,State,Producer Type,Energy Source,CO2\n(Metric Tons),SO2\n(Metric Tons),NOx\n(Metric Tons)
0,1990,AK,Commercial Cogen,All Sources,824004,13198,3011
1,1990,AK,Commercial Cogen,Coal,821929,13191,3009


In [21]:
emission_annual['CO2\n(Metric Tons)'] = emission_annual['CO2\n(Metric Tons)'].astype(object).astype(float)

In [22]:
emission_annual.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43258 entries, 0 to 43257
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               43258 non-null  int64  
 1   State              43258 non-null  object 
 2   Producer Type      43258 non-null  object 
 3   Energy Source      43258 non-null  object 
 4   CO2
(Metric Tons)  43258 non-null  float64
 5   SO2
(Metric Tons)  43258 non-null  object 
 6   NOx
(Metric Tons)  43258 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 2.3+ MB


In [23]:
emission_annual['CO2\n(Metric Tons)'] = emission_annual['CO2\n(Metric Tons)'].astype(object).astype(float)

In [24]:
emission_annual['CO2\n(Metric Tons)'] = emission_annual['CO2\n(Metric Tons)'].astype(object).astype(float)

In [25]:
emission_annual.head(2)

Unnamed: 0,Year,State,Producer Type,Energy Source,CO2\n(Metric Tons),SO2\n(Metric Tons),NOx\n(Metric Tons)
0,1990,AK,Commercial Cogen,All Sources,824004.0,13198,3011
1,1990,AK,Commercial Cogen,Coal,821929.0,13191,3009


In [27]:
emission_annual.tail()

Unnamed: 0,Year,State,Producer Type,Energy Source,CO2\n(Metric Tons),SO2\n(Metric Tons),NOx\n(Metric Tons)
43253,2019,WY,Total Electric Power Industry,Coal,38255055.0,26330,27706
43254,2019,WY,Total Electric Power Industry,Natural Gas,908889.0,4,1875
43255,2019,WY,Total Electric Power Industry,Other Gases,0.0,6,2539
43256,2019,WY,Total Electric Power Industry,Other,0.0,0,42
43257,2019,WY,Total Electric Power Industry,Petroleum,35196.0,6,21


In [28]:
emission_annual.to_csv(r'../data/Source of Data - eia.gov/from_python_csv/emission_annual.csv', index = False)