Due to a mistake in the initial data collection for temperatures, we didn't use the correct initial temperatures. This notebook corrects this by creating a new main dataset with the correct temperatures.

In [1]:
# Dependencies.
import pandas as pd
import re

In [2]:
# Load the existing per-capita dataset; its Country column supplies the country list.
source_csv = '../ds31_all_per_capita_emissions.csv'
countries_df = pd.read_csv(source_csv)
countries_df.head()

Unnamed: 0,Country,Year,Life Ladder,Temperature,Clean Water,PM2.5,pc Greenhouse Gas Emissions,pc CO2 Emissions
0,Afghanistan,2010,4.758,14.629,48.28708,52.49585,0.001539,0.287738
1,Afghanistan,2011,3.832,16.487,50.82785,57.09972,0.001947,0.401953
2,Afghanistan,2012,3.783,14.373,53.40352,55.46611,0.002142,0.327922
3,Afghanistan,2013,3.572,16.156,56.01404,59.62277,0.002318,0.26157
4,Afghanistan,2014,3.131,15.647,58.65937,62.72192,0.002536,0.232968


In [3]:
# Distinct country names, kept in order of first appearance.
countries = countries_df['Country'].drop_duplicates().tolist()
countries[:5]

['Afghanistan', 'Albania', 'Argentina', 'Armenia', 'Australia']

In [4]:
# Distinct years, in order of first appearance, minus the final entry.
# NOTE(review): the last distinct year is discarded without explanation —
# confirm which year it is and why it is excluded.
years = countries_df['Year'].drop_duplicates().tolist()
del years[-1]
years

[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]

In [5]:
# Collect one [country, June value, whole-year value] entry per country.
init_temps = []

# Fixed positions inside each country's text file (zero-based line indices).
annual_line_idx = 50  # line containing the "(C): <value>" whole-year figure
june_line_idx = 54    # line whose 7th whitespace-separated field is the June value
annual_pattern = re.compile(r'\(C\): (\S+) +')

for country in countries:
    # Read the country's text file into a list of lines.
    with open(f'../be_text/{country}_temp.txt') as f:
        lines = f.readlines()

    # Parse the June value first, then the whole-year value.
    june_value = float(lines[june_line_idx].split()[6])
    annual_value = float(annual_pattern.search(lines[annual_line_idx]).group(1))
    init_temps.append([country, june_value, annual_value])

In [6]:
# Preview the first five [country, June value, whole-year value] entries
# to sanity-check the parsing above.
init_temps[:5]

[['Afghanistan', 24.69, 14.02],
 ['Albania', 19.9, 12.93],
 ['Argentina', 8.05, 14.75],
 ['Armenia', 15.93, 7.78],
 ['Australia', 15.22, 21.72]]

In [7]:
# Turn the parsed entries into a table with one row per country.
temp_columns = ['Country', 'June Temp', 'Initial Temp']
temps_df = pd.DataFrame(data=init_temps, columns=temp_columns)
temps_df

Unnamed: 0,Country,June Temp,Initial Temp
0,Afghanistan,24.69,14.02
1,Albania,19.90,12.93
2,Argentina,8.05,14.75
3,Armenia,15.93,7.78
4,Australia,15.22,21.72
...,...,...,...
95,Uzbekistan,25.54,12.62
96,Venezuela,25.03,25.49
97,Vietnam,27.10,23.95
98,Yemen,30.89,26.10


In [8]:
# Manually entered rows for the two countries that are not read from CSV files.
# Each row holds ten temperature values followed by the country name
# (presumably June temperatures for 2010-2019 — TODO confirm the source).
tanz_temps = [
    21.3, 21.5, 20.9, 20.7, 21.3,
    21.6, 21.2, 21.7, 20.9, 21.2,
    'Tanzania',
]
us_temps = [
    17.9, 17.8, 18.3, 18.6, 17.5,
    18.8, 19, 18.4, 18.6, 18.1,
    'United States',
]

In [9]:
# Build one row per country: the June temperatures for the chosen years,
# followed by the country name.
all_temps = []

# Hand-entered rows used instead of a CSV lookup for these two countries.
manual_rows = {'United States': us_temps, 'Tanzania': tanz_temps}

# A later cell appends a 'Country' label to `years`; ignore it here so this
# cell gives the same result when it is re-run afterwards.
year_values = [year for year in years if year != 'Country']

for country, june_base, initial_temp in zip(
    temps_df['Country'], temps_df['June Temp'], temps_df['Initial Temp']
):
    if country in manual_rows:
        all_temps.append(manual_rows[country])
        continue

    # The CSV file name embeds both the country and its initial temperature.
    temp = pd.read_csv(f'../be_csv/{country}_{initial_temp}_temps.csv')

    # June rows of the chosen years: anomaly + June base value, rounded to 3 dp.
    is_wanted_june = temp['Year'].isin(year_values) & (temp['Month'] == 6)
    june_temps = (temp.loc[is_wanted_june, 'Monthly Anomaly'] + june_base).round(3).to_list()

    # A row with the wrong number of values would be silently padded or
    # misaligned when the DataFrame is built, so fail loudly instead.
    if len(june_temps) != len(year_values):
        raise ValueError(
            f'{country}: expected {len(year_values)} June values, found {len(june_temps)}'
        )

    june_temps.append(country)
    all_temps.append(june_temps)

In [10]:
# Count the rows collected in all_temps (one list is appended per temps_df row).
len(all_temps)

100

In [11]:
# Add the 'Country' column header after the year labels.
# Guarded so that re-running this cell does not append a second 'Country',
# which would leave more column names than values in each all_temps row.
if 'Country' not in years:
    years.append('Country')

In [12]:
# Build a table with one column per label in `years`, then index it by country.
june_temps_df = pd.DataFrame(all_temps, columns=years)
june_temps_df = june_temps_df.set_index('Country')
june_temps_df.head()

Unnamed: 0_level_0,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Afghanistan,25.299,27.157,25.043,26.826,26.317,26.097,26.46,26.836,26.612,24.626
Albania,20.749,21.171,22.956,20.797,20.701,20.772,21.724,22.52,21.229,22.751
Argentina,8.694,8.205,8.511,9.565,8.931,10.166,7.573,9.234,7.558,9.919
Armenia,19.238,17.333,18.388,17.199,17.872,19.369,17.3,17.945,18.005,20.055
Australia,15.588,14.667,14.712,15.982,15.63,16.599,16.488,15.675,15.515,15.436


In [13]:
# Pivot the year columns into a second index level so each row is one
# (country, year) pair, matching the layout of the main dataset.
stacked_series = june_temps_df.stack()
stacked_df = pd.DataFrame(stacked_series, columns=['June Temperatures'])
stacked_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,June Temperatures
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,2010,25.299
Afghanistan,2011,27.157
Afghanistan,2012,25.043
Afghanistan,2013,26.826
Afghanistan,2014,26.317


In [14]:
# Write the stacked June temperatures (index included) to the output folder.
corrected_path = '../output/corrected_temperatures.csv'
stacked_df.to_csv(corrected_path)

In [15]:
# Move the index levels back into ordinary columns for merging.
# The year level comes back as 'level_1', so give it its real name.
reset_df = stacked_df.reset_index()
reset_df = reset_df.rename(columns={'level_1': 'Year'})
reset_df.head()

Unnamed: 0,Country,Year,June Temperatures
0,Afghanistan,2010,25.299
1,Afghanistan,2011,27.157
2,Afghanistan,2012,25.043
3,Afghanistan,2013,26.826
4,Afghanistan,2014,26.317


In [16]:
# Attach the June temperatures to the main dataset; the left join keeps
# every row of countries_df.
merge_keys = ['Country', 'Year']
combined_df = pd.merge(countries_df, reset_df, how='left', on=merge_keys)

# Select columns BEFORE renaming: the merged frame still carries the old
# 'Temperature' column, which is left out here in favour of 'June Temperatures'.
final_columns = [
    'Country',
    'Year',
    'Life Ladder',
    'June Temperatures',
    'Clean Water',
    'PM2.5',
    'pc Greenhouse Gas Emissions',
    'pc CO2 Emissions',
]
combined_df = combined_df[final_columns].rename(columns={'June Temperatures': 'Temperature'})
combined_df

Unnamed: 0,Country,Year,Life Ladder,Temperature,Clean Water,PM2.5,pc Greenhouse Gas Emissions,pc CO2 Emissions
0,Afghanistan,2010,4.758,25.299,48.28708,52.49585,0.001539,0.287738
1,Afghanistan,2011,3.832,27.157,50.82785,57.09972,0.001947,0.401953
2,Afghanistan,2012,3.783,25.043,53.40352,55.46611,0.002142,0.327922
3,Afghanistan,2013,3.572,26.826,56.01404,59.62277,0.002318,0.261570
4,Afghanistan,2014,3.131,26.317,58.65937,62.72192,0.002536,0.232968
...,...,...,...,...,...,...,...,...
1095,Zimbabwe,2016,3.735,17.258,64.46896,25.39554,0.002076,0.770813
1096,Zimbabwe,2017,3.638,17.505,63.99662,22.61333,0.002023,0.719753
1097,Zimbabwe,2018,3.616,17.320,63.53877,22.10827,0.002173,0.785423
1098,Zimbabwe,2019,2.694,17.108,63.09496,20.84664,,0.708361


In [17]:
# Write the corrected main dataset, omitting the row index.
main_output_path = '../MAIN_all_per_cap.csv'
combined_df.to_csv(main_output_path, index=False)