# Extracting important info from data

## Imports

In [1]:
import pandas as pd

## Reading Dataset

In [2]:
df=pd.read_csv("co2Data/d1.csv")
df.head()

Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,...,share_global_other_co2,share_of_temperature_change_from_ghg,temperature_change_from_ch4,temperature_change_from_co2,temperature_change_from_ghg,temperature_change_from_n2o,total_ghg,total_ghg_excluding_lucf,trade_co2,trade_co2_share
0,Afghanistan,1850,AFG,3752993.0,,,,,,,...,,,,,,,,,,
1,Afghanistan,1851,AFG,3767956.0,,,,,,,...,,0.165,0.0,0.0,0.0,0.0,,,,
2,Afghanistan,1852,AFG,3783940.0,,,,,,,...,,0.164,0.0,0.0,0.0,0.0,,,,
3,Afghanistan,1853,AFG,3800954.0,,,,,,,...,,0.164,0.0,0.0,0.0,0.0,,,,
4,Afghanistan,1854,AFG,3818038.0,,,,,,,...,,0.163,0.0,0.0,0.0,0.0,,,,


In [3]:
df=df[['country','year','population','co2','co2_per_capita']]

In [4]:
df.year

0        1850
1        1851
2        1852
3        1853
4        1854
         ... 
50593    2017
50594    2018
50595    2019
50596    2020
50597    2021
Name: year, Length: 50598, dtype: int64

### Latest data is from 2021, we are going to select all data from that year

In [5]:
# Dropping null values
df=df.dropna()

newDf=df.groupby('country',as_index=False).apply(lambda cnt:cnt[cnt['year']==cnt['year'].max()])
newDf.head()

Unnamed: 0,Unnamed: 1,country,year,population,co2,co2_per_capita
0,171,Afghanistan,2021,40099460.0,11.874,0.296
1,443,Africa,2021,1393677000.0,1450.796,1.041
2,959,Albania,2021,2854710.0,4.619,1.618
3,1131,Algeria,2021,44177960.0,176.269,3.99
4,1575,Andorra,2021,79057.0,0.453,5.729


In [6]:
df.head()

Unnamed: 0,country,year,population,co2,co2_per_capita
99,Afghanistan,1949,7356890.0,0.015,0.002
100,Afghanistan,1950,7480464.0,0.084,0.011
101,Afghanistan,1951,7571542.0,0.092,0.012
102,Afghanistan,1952,7667534.0,0.092,0.012
103,Afghanistan,1953,7764549.0,0.106,0.014


In [7]:
newDf.head()

Unnamed: 0,Unnamed: 1,country,year,population,co2,co2_per_capita
0,171,Afghanistan,2021,40099460.0,11.874,0.296
1,443,Africa,2021,1393677000.0,1450.796,1.041
2,959,Albania,2021,2854710.0,4.619,1.618
3,1131,Algeria,2021,44177960.0,176.269,3.99
4,1575,Andorra,2021,79057.0,0.453,5.729


In [8]:
newDf = newDf.reset_index(drop=True)

In [9]:
newDf.head()

Unnamed: 0,country,year,population,co2,co2_per_capita
0,Afghanistan,2021,40099460.0,11.874,0.296
1,Africa,2021,1393677000.0,1450.796,1.041
2,Albania,2021,2854710.0,4.619,1.618
3,Algeria,2021,44177960.0,176.269,3.99
4,Andorra,2021,79057.0,0.453,5.729


In [10]:
newDf.shape

(232, 5)

As per the data notebook or guide,
#### *CO2* is in *Million Tonnes*
#### *CO2_per_capita* is in *Tonnes*

Thus we need to convert the CO2 to Tonnes

In [11]:
def convertion(num):
    return 1000000*num

newDf['co2']=newDf['co2'].apply(convertion)

In [12]:
newDf

Unnamed: 0,country,year,population,co2,co2_per_capita
0,Afghanistan,2021,4.009946e+07,1.187400e+07,0.296
1,Africa,2021,1.393677e+09,1.450796e+09,1.041
2,Albania,2021,2.854710e+06,4.619000e+06,1.618
3,Algeria,2021,4.417796e+07,1.762690e+08,3.990
4,Andorra,2021,7.905700e+04,4.530000e+05,5.729
...,...,...,...,...,...
227,Wallis and Futuna,2021,1.165400e+04,2.800000e+04,2.387
228,World,2021,7.909295e+09,3.712385e+10,4.694
229,Yemen,2021,3.298164e+07,1.247700e+07,0.378
230,Zambia,2021,1.947313e+07,7.676000e+06,0.394


In [13]:
with open("newfl.txt","a") as fl:
    lines=newDf.country
    for line in lines:
        fl.write(" "+line+"   ,")

### Removing the rows which are not actually countries

In [14]:
NotCountries=['Asia', 'Europe', 'High-income countries',
    'Low-income countries', 'Lower-middle-income countries',
    'North America', 'Oceania', 'South America', 'Upper-middle-income countries',
    'World']

newDf = newDf[~newDf['country'].isin(NotCountries)]

This also includes regions like Wallis and futuna which are not actually countries but parts of countries. This has even precise data, so its better not to be removed.

In [15]:
# Checking if any old countries exist, which may currently not exist or are not countries.
# or have very very old data
print(set(newDf.year))

{1920, 2021}


Removing 1920 row(s) as it is a very old year, and may not be a proper data.

In [16]:
newDf=newDf[newDf['year']!=1920]

In [17]:
newDf.head()

Unnamed: 0,country,year,population,co2,co2_per_capita
0,Afghanistan,2021,40099460.0,11874000.0,0.296
1,Africa,2021,1393677000.0,1450796000.0,1.041
2,Albania,2021,2854710.0,4619000.0,1.618
3,Algeria,2021,44177960.0,176269000.0,3.99
4,Andorra,2021,79057.0,453000.0,5.729


### Exporting the Filtered out and required data

In [18]:
newDf.to_csv("required_data.csv")