# Charlie Harper, Visualizing Data with Bokeh and Pandas


This notebook is based on the following tutorial by Charlie Harper:
[Visualizing Data with Bokeh and Pandas](https://programminghistorian.org/en/lessons/visualizing-with-bokeh)


"In this lesson you will learn how to visually explore and present data in Python by using the Bokeh and Pandas libraries."


* [Documentation](https://docs.bokeh.org/en/latest/docs/user_guide/output/jupyter.html#ug-output-jupyter) of the interactive library Bokeh
* [Ten minutes to Pandas](https://pandas.pydata.org/pandas-docs/stable/10min.html)


In [38]:
from bokeh.plotting import figure, output_file, show, output_notebook
from bokeh.models import ColumnDataSource
from bokeh.models.tools import HoverTool

import pandas as pd



In [2]:
# Two small coordinate lists used by the first example plot.
x, y = [1, 3, 5, 7], [2, 4, 6, 8]


In [3]:
# Target HTML file for the first example plot.
output_file(filename='output_pages/my_first_graph.html')


In [6]:
p = figure()

# Bokeh deprecation notes:
#   * the `legend` keyword is deprecated in favour of `legend_label`, see
#     https://discourse.bokeh.org/t/bokehdeprecationwarning-legend-keyword-is-deprecated-use-explicit-legend-label-legend-field-or-legend-group-keywords-instead/8066
#   * `circle()` with a `size` value and `triangle()` were deprecated in
#     Bokeh 3.4.0, so both marker glyphs are drawn with `scatter()` instead.
p.scatter(x=x, y=y, color='red', size=10, legend_label='circle')
p.line(x=x, y=y, color='blue', legend_label='line')
# Same points with the coordinates swapped, drawn as gold triangles.
p.scatter(x=y, y=x, marker='triangle', size=10, color='gold', legend_label='triangle')

# Clicking a legend entry hides/shows the corresponding glyph.
p.legend.click_policy = 'hide'
show(p)

### Import the data to be analyzed

In [12]:
# Read data/thor_wwii.csv into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data/thor_wwii.csv')
df.head(n=5)


Unnamed: 0,MSNDATE,THEATER,COUNTRY_FLYING_MISSION,NAF,UNIT_ID,AIRCRAFT_NAME,AC_ATTACKING,TAKEOFF_BASE,TAKEOFF_COUNTRY,TAKEOFF_LATITUDE,TAKEOFF_LONGITUDE,TGT_COUNTRY,TGT_LOCATION,TGT_LATITUDE,TGT_LONGITUDE,TONS_HE,TONS_IC,TONS_FRAG,TOTAL_TONS
0,03/30/1941,ETO,GREAT BRITAIN,RAF,84 SQDN,BLENHEIM,10.0,,,,,ALBANIA,ELBASAN,41.1,20.07,0.0,0.0,0.0,0.0
1,11/24/1940,ETO,GREAT BRITAIN,RAF,211 SQDN,BLENHEIM,9.0,,,,,ALBANIA,DURAZZO,41.32,19.45,0.0,0.0,0.0,0.0
2,12/04/1940,ETO,GREAT BRITAIN,RAF,211 SQDN,BLENHEIM,9.0,,,,,ALBANIA,TEPELENE,40.3,20.02,0.0,0.0,0.0,0.0
3,12/31/1940,ETO,GREAT BRITAIN,RAF,211 SQDN,BLENHEIM,9.0,,,,,ALBANIA,VALONA,40.47,19.49,0.0,0.0,0.0,0.0
4,01/06/1941,ETO,GREAT BRITAIN,RAF,211 SQDN,BLENHEIM,9.0,,,,,ALBANIA,VALONA,40.47,19.49,0.0,0.0,0.0,0.0


In [34]:
# Count the rows recorded for each value of COUNTRY_FLYING_MISSION.
df.groupby('COUNTRY_FLYING_MISSION').size()

COUNTRY_FLYING_MISSION
AUSTRALIA          316
GREAT BRITAIN    31361
NEW ZEALAND        633
SOUTH AFRICA        19
USA              94165
dtype: int64

In [16]:
### List the column names of the loaded CSV
# These names are used below to select the data
list(df.columns)

['MSNDATE',
 'THEATER',
 'COUNTRY_FLYING_MISSION',
 'NAF',
 'UNIT_ID',
 'AIRCRAFT_NAME',
 'AC_ATTACKING',
 'TAKEOFF_BASE',
 'TAKEOFF_COUNTRY',
 'TAKEOFF_LATITUDE',
 'TAKEOFF_LONGITUDE',
 'TGT_COUNTRY',
 'TGT_LOCATION',
 'TGT_LATITUDE',
 'TGT_LONGITUDE',
 'TONS_HE',
 'TONS_IC',
 'TONS_FRAG',
 'TOTAL_TONS']

In [35]:
### Prepare the HTML output file and the data source
output_file('output_pages/columndatasource_example.html')
# Draw 50 random rows with a fixed seed so the same rows are selected on every
# Restart & Run All; without it the scatter plot below changes on each execution.
sample = df.sample(50, random_state=42)
# Wrap the sample so glyphs and hover tooltips can refer to its columns by name.
source = ColumnDataSource(sample)


In [39]:
# Configure Bokeh so that subsequent show() calls render inline in the notebook.
output_notebook()

In [40]:
# Tooltip rows shown when hovering a point; '@NAME' refers to a column of `source`.
hover = HoverTool(
    tooltips=[
        ('Attack Date', '@MSNDATE'),
        ('Attacking Aircraft', '@AC_ATTACKING'),
        ('Tons of Munitions', '@TOTAL_TONS'),
        ('Type of Aircraft', '@AIRCRAFT_NAME'),
    ]
)

p = figure()
p.title.text = 'Attacking Aircraft and Munitions Dropped'
p.xaxis.axis_label = 'Tons of Munitions Dropped'
p.yaxis.axis_label = 'Number of Attacking Aircraft'

# One green marker per sampled row: munitions dropped vs. attacking aircraft.
p.scatter(
    x='TOTAL_TONS',
    y='AC_ATTACKING',
    source=source,
    color='green',
    size=5,
)

p.add_tools(hover)
show(p)


In [51]:
# Sum the tonnage columns per country, divide the sums by 1000 and cast to int
# (the cast drops the fractional part).
tonnage_columns = ['TOTAL_TONS', 'TONS_HE', 'TONS_IC', 'TONS_FRAG']
tons_by_country = df.groupby('COUNTRY_FLYING_MISSION')[tonnage_columns].sum()
grouped = tons_by_country.div(1000).astype(int)
print(grouped)


                        TOTAL_TONS  TONS_HE  TONS_IC  TONS_FRAG
COUNTRY_FLYING_MISSION                                         
AUSTRALIA                        0        0        0          0
GREAT BRITAIN                 1112      868      209          1
NEW ZEALAND                      2        4        0          0
SOUTH AFRICA                     0        0        0          0
USA                           1625     1297      205        127


In [46]:
# Cross-tabulate the number of rows per (country flying the mission, target country) pair.
pd.crosstab(index=df['COUNTRY_FLYING_MISSION'], columns=df['TGT_COUNTRY'])

TGT_COUNTRY,"""PAPUA NEW GUINEA, MANUS ISLAND""",ALBANIA,ALEUTIAN ISLANDS,ALGERIA,ANDAMAN ISLANDS,AUSTRALIA,AUSTRIA,BALI,BELGIUM,BISMARK ARCHIPELAGO,...,SYRIA,THAILAND OR SIAM,THAILAND OR SIAM MINING,TIMOR,TUNISIA,UNKNOWN,UNKNOWN OR NOT INDICATED,VOLCANO AND BONIN ISLANDS,WAKE ISLAND,YUGOSLAVIA
COUNTRY_FLYING_MISSION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AUSTRALIA,0,0,0,0,0,4,0,0,0,26,...,2,0,0,98,0,0,0,0,0,0
GREAT BRITAIN,0,109,0,1,2,0,12,0,1024,0,...,107,15,0,0,72,0,544,0,0,25
NEW ZEALAND,0,0,0,0,0,0,0,0,0,234,...,0,0,0,0,0,0,0,0,0,0
SOUTH AFRICA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
USA,3,41,239,0,6,0,3439,19,56,3224,...,0,57,1,8,1166,19,1278,966,72,767
