Original dataset : 

https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data




In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import holoviews as hv
hv.extension('bokeh')

import hvplot.pandas  # noqa
pd.options.plotting.backend = 'holoviews'

### Global Temperatures

In [2]:
df = pd.read_csv("data/kaggle/GlobalTemperatures.csv")

In [3]:
#df['Year'] = df['Date'].apply(lambda x:x[:4])
#df['Month'] = df['Date'].apply(lambda x:x[5:7])

df['year'] = df.dt.apply(lambda x:x[:4])
df['month'] = df.dt.apply(lambda x:x[5:7])

In [4]:
df[ (df['year'] >= '1850')  ].head(13)

Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty,year,month
1200,1850-01-01,0.749,1.105,8.242,1.738,-3.206,2.822,12.833,0.367,1850,1
1201,1850-02-01,3.071,1.275,9.97,3.007,-2.291,1.623,13.588,0.414,1850,2
1202,1850-03-01,4.954,0.955,10.347,2.401,-1.905,1.41,14.043,0.341,1850,3
1203,1850-04-01,7.217,0.665,12.934,1.004,1.018,1.329,14.667,0.267,1850,4
1204,1850-05-01,10.004,0.617,15.655,2.406,3.811,1.347,15.507,0.249,1850,5
1205,1850-06-01,13.15,0.614,18.946,2.817,7.106,0.857,16.353,0.245,1850,6
1206,1850-07-01,14.492,0.614,19.233,2.84,8.014,0.786,16.783,0.238,1850,7
1207,1850-08-01,14.039,0.802,18.477,2.079,7.406,1.086,16.718,0.28,1850,8
1208,1850-09-01,11.505,0.675,15.846,2.692,4.533,1.798,15.886,0.254,1850,9
1209,1850-10-01,8.091,0.863,13.189,2.338,2.013,2.133,14.831,0.297,1850,10


In [5]:
df.loc[ (df['year'] >= '1850') & (df['month']=='07'), ['dt', 'LandAverageTemperature', 'LandMinTemperature', 'LandMaxTemperature']  ].hvplot()

In [6]:
subdf = df.loc[ (df['year'] >= '1850') & (df['month']=='07'), ['dt', 'LandAverageTemperature', 'LandMinTemperature', 'LandMaxTemperature']  ]

hv.Curve( subdf[ ['dt', 'LandAverageTemperature']  ]  )

In [7]:
subdf = df.loc[ (df['year'] >= '1850'), ['dt', 'LandAverageTemperature', 'year', 'month']  ]

hv.HeatMap(subdf, ["month", "year"], ["LandAverageTemperature", "dt"]).opts(width=600, height=600, xrotation=45, radial=True)

### Global Land Temperatures by Country

In [8]:
df = pd.read_csv("data/kaggle/GlobalLandTemperaturesByCountry_withContinents.csv")

df.head()

Unnamed: 0,index,dt,AverageTemperature,AverageTemperatureUncertainty,Country,year,month,Country_ccdf,continent
0,1274,1850-01-01,-9.083,1.834,Åland,1850,1,Åland,Europe
1,1275,1850-02-01,-2.309,1.603,Åland,1850,2,Åland,Europe
2,1276,1850-03-01,-4.801,3.033,Åland,1850,3,Åland,Europe
3,1277,1850-04-01,1.242,2.008,Åland,1850,4,Åland,Europe
4,1278,1850-05-01,7.92,0.881,Åland,1850,5,Åland,Europe


### How the dataset was built

It's built using the [`GlobalLandTemperaturesByCountry`](https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data) dataset from Kaggle, and the ['country-and-continent-codes-list'](https://datahub.io/JohnSnowLabs/country-and-continent-codes-list) dataset from datahub.io

Unfold the following cells to rebuild it :

In [123]:
df = pd.read_csv("data/kaggle/GlobalLandTemperaturesByCountry.csv")
df['year'] = df.dt.apply(lambda x: int(x[:4]) )
df['month'] = df.dt.apply(lambda x: int(x[5:7]) )


# Exclude rows for entire continents
exclude = ["Africa","Asia","North America","Oceania","South America"]
df = df[ ~(df['Country'].isin(exclude))  ]

# Exclude rows duplicated for some Countries (like "France" and "France (Europe)")
df = df[ ~(df['Country'].str.contains("\(Europe\)"))  ] 

# Only keep data after 1850 (incl.)
df = df[ df['year'] >= 1850  ]

In [124]:
df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country,year,month
1274,1850-01-01,-9.083,1.834,Åland,1850,1
1275,1850-02-01,-2.309,1.603,Åland,1850,2
1276,1850-03-01,-4.801,3.033,Åland,1850,3
1277,1850-04-01,1.242,2.008,Åland,1850,4
1278,1850-05-01,7.92,0.881,Åland,1850,5


In [125]:
# cc stands for "countries continents"
cc_df = pd.read_csv("data/country-and-continent-codes-list-csv_csv.csv")
cc_df['Country_Name_Short'] = cc_df['Country_Name'].apply(lambda x:  x.split(",")[0] if ',' in x else x  )

cc_df.head()

Unnamed: 0,Continent_Name,Continent_Code,Country_Name,Two_Letter_Country_Code,Three_Letter_Country_Code,Country_Number,Country_Name_Short
0,Asia,AS,"Afghanistan, Islamic Republic of",AF,AFG,4.0,Afghanistan
1,Europe,EU,"Albania, Republic of",AL,ALB,8.0,Albania
2,Antarctica,AN,Antarctica (the territory South of 60 deg S),AQ,ATA,10.0,Antarctica (the territory South of 60 deg S)
3,Africa,AF,"Algeria, People's Democratic Republic of",DZ,DZA,12.0,Algeria
4,Oceania,OC,American Samoa,AS,ASM,16.0,American Samoa


In [126]:
import difflib

closest_dict = {} # used as cache

def add_continent(x):
    global closest_dict
    
    # read from cache
    if x['Country'] in closest_dict:
        x['Country_ccdf'] = closest_dict[ x['Country']  ][0]
        x['continent'] = closest_dict[ x['Country']  ][1]
        return x
        
    closest = difflib.get_close_matches(x['Country'], cc_df['Country_Name_Short'])
    if len(closest) > 0:
        x['Country_ccdf'] = closest[0]
        x['continent'] = cc_df.loc[  cc_df['Country_Name_Short'] == closest[0], 'Continent_Name' ].values[0]

    else:
        x['Country_ccdf'] = None
        x['continent'] = None

    # write in cache
    closest_dict[ x['Country'] ] = [ x['Country_ccdf'], x['continent'] ]
    
    return x
    
# Takes a while despite the cache
df = df.apply( lambda x : add_continent(x),  axis=1)
print("done")

done


In [127]:
# manual fixes

fixes = {
    "Syria"          : ('Syria', 'Asia'),
    "South Korea"    : ('South Korea', 'Asia'),
    "Slovakia"       : ('Slovakia', 'Europe'),
    "Russia"         : ('Russia', 'Asia'),
    "North Korea"    : ('North Korea', 'Asia'),
    "Libya"          : ('Libya', 'Africa'),
    "Burma"          : ("Burma", "Asia"),
    "Kyrgyzstan"     : ("Kyrgyzstan", "Asia"),
    "Åland"          : ("Åland", "Europe"),
    "Baker Island"   : ("Baker Island", "Oceania"),
    "United Kingdom" : ("United Kingdom", "Europe"),
    "Congo (Democratic Republic Of The)" : ("Dem. Rep. of the Congo", "Africa"),    
}


for k, fix in fixes.items():
    df.loc[ (df['Country'] == k), ['Country_ccdf', 'continent'] ] = fix


In [128]:
# Remove all the rows for which we didn't find the country
# also removes the rows for some micro nations.
df = df[ ~(df['Country_ccdf'].isna()) ]


In [129]:
df.reset_index().to_csv("data/kaggle/GlobalLandTemperaturesByCountry_withContinents.csv", index=None)

----

In [9]:
df.head()

Unnamed: 0,index,dt,AverageTemperature,AverageTemperatureUncertainty,Country,year,month,Country_ccdf,continent
0,1274,1850-01-01,-9.083,1.834,Åland,1850,1,Åland,Europe
1,1275,1850-02-01,-2.309,1.603,Åland,1850,2,Åland,Europe
2,1276,1850-03-01,-4.801,3.033,Åland,1850,3,Åland,Europe
3,1277,1850-04-01,1.242,2.008,Åland,1850,4,Åland,Europe
4,1278,1850-05-01,7.92,0.881,Åland,1850,5,Åland,Europe


In [10]:
df['continent'].value_counts()

Africa           108549
Europe           106110
Asia              85774
North America     76623
Oceania           27300
South America     26940
Antarctica         3242
Name: continent, dtype: int64

In [11]:
df = df.reset_index(drop=True).sort_values(['Country', 'year', 'month'])

In [12]:
avg_temps = {}

# mean on the 30 first years of data for each country
for c, c_data in df[  ~(df['AverageTemperature'].isna())  ].groupby("Country") :
    avg_temps[c]  = np.mean(c_data['AverageTemperature'][30*12:])

In [13]:
# mean over the years, for each country
df_viz_raw = {"country":[], "continent":[], "year":[], "avg_temp_diff":[]}
for (country, continent, year), c_data in df[  ~(df['AverageTemperature'].isna())  ].groupby(["Country", "continent", "year"]) :
    
    df_viz_raw['country'].append(  country  )
    df_viz_raw['continent'].append(  continent  )
    df_viz_raw['year'].append(  year  )
    df_viz_raw['avg_temp_diff'].append(  np.mean(c_data['AverageTemperature']) - avg_temps[country]  )

df_viz = pd.DataFrame(data=df_viz_raw)

In [14]:
df_viz.describe()

Unnamed: 0,year,avg_temp_diff
count,35432.0,35432.0
mean,1934.374859,-0.102029
std,46.282862,0.603243
min,1850.0,-9.745856
25%,1895.0,-0.485769
50%,1935.0,-0.13621
75%,1974.0,0.255387
max,2013.0,3.7143


In [15]:
# Exclude all rows for which we don't have any data for 1900, but we have data for the last year

countries_1900 =  df_viz.loc[ (df_viz['year'] == 1900 ), "country"  ]

yearmax_filter = df_viz['year'] == 2000
in_1900_filter = df_viz['country'].isin(countries_1900)

excluded_countries = df_viz.loc[ (yearmax_filter) & ~(in_1900_filter) , "country" ]

df_viz = df_viz[ ~(df_viz['country'].isin(excluded_countries)) ]


In [15]:
# Base color scale, shading through white - not used anymore
qs_colors = [  "#613EA3","#6F50AB","#7E61B4","#8C73BC","#9A84C4","#A996CD","#B7A7D5","#C6B9DE","#D4CAE6","#E2DCEE","#F1EDF7",
               "#FFFFFF",
               "#F6FAEE","#ECF5DC","#E3F0CB","#DAEBBA","#D1E6A9","#C7E297","#BEDD86","#B5D875","#ACD364","#A2CE52","#99C941",]

qs_colors = qs_colors[::-1]


In [16]:
# Base color scale, shading through black - not used anymore
qs_colors = [  "#613EA3","#6F50AB","#7E61B4","#8C73BC","#9A84C4","#A996CD","#B7A7D5","#C6B9DE","#D4CAE6","#E2DCEE","#F1EDF7" ][::-1] + \
            ["#000000"] + \
            [ "#F6FAEE","#ECF5DC","#E3F0CB","#DAEBBA","#D1E6A9","#C7E297","#BEDD86","#B5D875","#ACD364","#A2CE52","#99C941" ][::-1]

qs_colors = qs_colors[::-1]


In [26]:
violet = "#452392"
plum = "#A43A8F"
green = "#99C941"

def hex_to_RGB(hex):
    return [int(hex[i:i+2], 16) for i in range(1,6,2)]

def RGB_to_hex(RGB):
    RGB = [int(x) for x in RGB]
    return "#"+"".join(["0{0:x}".format(v) if v < 16 else
                        "{0:x}".format(v) for v in RGB])


def color_scale(from_hex, to_hex, n=10):
    
    from_rgb = hex_to_RGB(from_hex)
    to_rgb = hex_to_RGB(to_hex)
    
    result = [from_hex]

    for i in range(1, n):
        new_color = [
            int(from_rgb[j] + (float(i)/(n-1))*(to_rgb[j]-from_rgb[j]))
            for j in range(3)
        ]

        result.append( RGB_to_hex(new_color))

    return result



In [23]:
from bokeh.themes.theme import Theme

# based on the "dark_minimal" Bokeh Theme : https://github.com/bokeh/bokeh/blob/branch-2.4/bokeh/themes/_dark_minimal.py
#
theme = Theme(
    json = {
    "attrs": {
        "Figure" : {
            "background_fill_color": "#20262B",
            "border_fill_color": "#15191C",
            "outline_line_color": "#333333",
            "outline_line_alpha": 0.25
        },

        "Grid": {
            "grid_line_color": "#333333",
            "grid_line_alpha": 0.25
        },

        "Axis": {
            "major_tick_line_alpha": 1,
            "major_tick_line_color": "#333333",

            "minor_tick_line_alpha": 1,
            "minor_tick_line_color": "#333333",

            "axis_line_alpha": 1,
            "axis_line_color": "#333333",

            "major_label_text_color": "#333333",
            "major_label_text_font": "Helvetica",
            "major_label_text_font_size": "0.8em",

            "axis_label_standoff": 10,
            "axis_label_text_color": "#333333",
            "axis_label_text_font": "Helvetica",
            "axis_label_text_font_size": "0.8em",
            "axis_label_text_font_style": "normal"
        },

        "Legend": {
            "spacing": 8,
            "glyph_width": 15,

            "label_standoff": 8,
            "label_text_color": "#333333",
            "label_text_font": "Helvetica",
            "label_text_font_size": "1.025em",

            "border_line_alpha": 0,
            "background_fill_alpha": 0.25,
            "background_fill_color": "#20262B"
        },

        "ColorBar": {
            "title_text_color": "#333333",
            "title_text_font": "Helvetica",
            "title_text_font_size": "1.025em",
            "title_text_font_style": "normal",

            "major_label_text_color": "#333333",
            "major_label_text_font": "Helvetica",
            "major_label_text_font_size": "1.025em",

            "background_fill_color": "#15191C",
            "major_tick_line_alpha": 0,
            "bar_line_alpha": 0
        },

        "Title": {
            "text_color": "#333333",
            "text_font": "Helvetica",
            "text_font_size": "1.15em"
        }
    }
})
hv.renderer('bokeh').theme = theme 

In [72]:
filter_americas = df_viz['continent'].isin(["North America", "South America"])
filter_europe = df_viz['continent'] == 'Europe'
filter_asia = df_viz['continent'] == 'Asia'
filter_africa = df_viz['continent'] == 'Africa'

title_template = ' %s : Temperature difference (°C) from the mean (Reference 1850-1880)'

#brand colors, not used anymore
#qs_colors = color_scale("#99C941", "#A43A8F", 20) + color_scale("#A43A8F", "#452392", 21)[1:]

# from deep violet #26086B to another shade of violet #663BC9
qs_colors = color_scale("#26086B", "#663BC9", 32)[::-1] 

# from deep violet #26086B to plum
#qs_colors = color_scale("#26086B", plum, 32)


def make_plot(filtr, area_name):

    subdf =  df_viz[ (df_viz['year'] > 1900) & (filtr)  ].sort_values('country', ascending=False)

    # We want the height to be 600px minimum.
    height = max(600, int(0.1 * len(subdf))) 
    width = 1440
    
    
    plot = hv.HeatMap(subdf, ["year", "country"], ["avg_temp_diff"]).opts( width=width, 
                                                                height=height, 
                                                                tools=['hover'],
                                                                colorbar_position='right',
                                                                clim=(-2, 3),
                                                                colorbar=True,
                                                                 cmap=qs_colors, 
                                                                title=title_template%(area_name),
                                                               ).redim.label(country=' ', year=' ')
    
    
    return plot

    # I wanted to save the plots automatically with : 
    #hv.save(plot, f'temp_diff_{area_name}', fmt='png')
    # but the DPI is too low and setting it doesn't have any effect with the bokeh backend.
    
    # So the solution is : output each plot in JupyterLab and use the save button


In [73]:
make_plot(filter_americas, "Americas")

In [74]:
make_plot(filter_europe, "Europe" )

In [75]:
make_plot(filter_asia, "Asia" )

In [76]:
make_plot(filter_africa, "Africa" )