We read in the Berkeley Earth text files, scraped from the Berkeley Earth website, and pull out the temperature data for each country. This data is written to CSV files for visualization and is also organized for machine learning.

In [1]:
# Dependencies.
import pandas as pd
import re

In [2]:
# Load the happiness table; its 'Country name' column supplies the country list.
countries_df = pd.read_csv(filepath_or_buffer='countries_happiness_2010_2019_2021.csv')
countries_df.head(5)

Unnamed: 0,Country name,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2021
0,Afghanistan,4.758,3.832,3.783,3.572,3.131,3.983,4.22,2.662,2.694,2.375,2.523
1,Albania,5.269,5.867,5.51,4.551,4.814,4.607,4.511,4.64,5.004,4.995,5.117
2,Argentina,6.441,6.776,6.468,6.582,6.671,6.697,6.427,6.039,5.793,6.086,5.929
3,Armenia,4.368,4.26,4.32,4.277,4.453,4.348,4.325,4.288,5.062,5.488,5.283
4,Australia,7.45,7.406,7.196,7.364,7.289,7.309,7.25,7.257,7.177,7.234,7.183


In [3]:
# Pull the country names out of the DataFrame as a plain Python list.
countries = countries_df.loc[:, 'Country name'].to_list()
countries[0:5]

['Afghanistan', 'Albania', 'Argentina', 'Armenia', 'Australia']

In [4]:
# Accumulator for each country's initial temperature; these values are
# later added to the main DataFrame.
init_temps = list()

In [5]:
# For every country: read its Berkeley Earth text file, record the initial
# temperature found in the header, and write the temperature table to a CSV.

# Fixed positions inside each text file (0-based indices into its lines).
# NOTE(review): these assume every file shares the same layout -- confirm
# against the scraped files if the source format ever changes.
INIT_TEMP_LINE = 50   # line expected to contain "(C): <value> "
DATA_START_LINE = 71  # first line of the whitespace-separated data rows

# Compiled once instead of on every iteration; raw string so that the
# backslashes reach the regex engine untouched.
init_temp_pattern = re.compile(r'\(C\): (\S+) +')

# Column names for the per-country temperature table.
temp_columns = [
    'Year',
    'Month',
    'Monthly Anomaly',
    'Monthly Uncertainty',
    'Annual Anomaly',
    'Annual Uncertainty',
    'Five Year Anomaly',
    'Five Year Uncertainty',
    'Ten Year Anomaly',
    'Ten Year Uncertainty',
    'Twenty Year Anomaly',
    'Twenty Year Uncertainty',
]

# Empty the accumulator first so that re-running this cell does not append a
# second copy of every value and leave init_temps longer than countries.
init_temps.clear()

for country in countries:
    # Access the text file for each country.
    filename = f'be_text/{country}_temp.txt'
    with open(filename) as f:
        content = f.readlines()

    # Get the initial temperature the data is based upon.  Fail with a message
    # naming the file rather than an opaque IndexError/AttributeError when the
    # expected header line is missing or does not match.
    match = None
    if len(content) > INIT_TEMP_LINE:
        match = init_temp_pattern.search(content[INIT_TEMP_LINE])
    if match is None:
        raise ValueError(
            f'Could not find the initial temperature on line '
            f'{INIT_TEMP_LINE + 1} of {filename}'
        )
    init_temp = float(match.group(1))
    init_temps.append(init_temp)

    # Store every row of temperature data, splitting by whitespace.
    # str.split() with no argument splits on runs of whitespace and never
    # yields empty strings; blank lines are skipped so they do not become
    # all-empty rows in the CSV.
    temp_data = [
        row.split() for row in content[DATA_START_LINE:] if row.strip()
    ]

    # Create a DataFrame to hold all the data.
    temp_df = pd.DataFrame(temp_data, columns=temp_columns)

    # Store the data in a CSV with name including the initial temperature.
    temp_df.to_csv(f'be_csv/{country}_{init_temp}_temps.csv', index=False)

In [6]:
# Pair each country with its initial temperature and build a DataFrame
# indexed by country name.
data_tuples = list(zip(countries, init_temps))
df_init_temps = pd.DataFrame(
    data_tuples,
    columns=['Country name', 'Initial Temperature'],
).set_index('Country name')
df_init_temps.head(5)

Unnamed: 0_level_0,Initial Temperature
Country name,Unnamed: 1_level_1
Afghanistan,14.02
Albania,12.93
Argentina,14.75
Armenia,7.78
Australia,21.72


In [7]:
# Add initial temperatures to main DataFrame.
# Index countries_df by country name so the join aligns on it.  The guard
# keeps this cell re-runnable: after the first run 'Country name' is already
# the index (no longer a column), and calling set_index again would raise
# KeyError.
if 'Country name' in countries_df.columns:
    countries_df.set_index('Country name', inplace=True)
full_df = countries_df.join(df_init_temps)
full_df.head()

Unnamed: 0_level_0,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2021,Initial Temperature
Country name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Afghanistan,4.758,3.832,3.783,3.572,3.131,3.983,4.22,2.662,2.694,2.375,2.523,14.02
Albania,5.269,5.867,5.51,4.551,4.814,4.607,4.511,4.64,5.004,4.995,5.117,12.93
Argentina,6.441,6.776,6.468,6.582,6.671,6.697,6.427,6.039,5.793,6.086,5.929,14.75
Armenia,4.368,4.26,4.32,4.277,4.453,4.348,4.325,4.288,5.062,5.488,5.283,7.78
Australia,7.45,7.406,7.196,7.364,7.289,7.309,7.25,7.257,7.177,7.234,7.183,21.72


In [8]:
# Write the combined table to disk, keeping the index as the first column.
full_df.to_csv(path_or_buf='countries_happiness_init_temps.csv', index=True)

For ease of looking up further data from APIs and websites, we also renamed 'North Macedonia' to 'Macedonia' and 'Taiwan Province of China' to 'Taiwan', and dropped 'Palestinian Territories'.