### Notebook: Evides

This notebook contains the exploratory analysis of all water shipments delivered by the supply boats of Evides. In the cell below, the necessary packages and data are loaded. In order to run the whole notebook on a different dataset, import it as 'evides' and make sure it stores the same information (with the same column names) as the original input. 

Note that all markdown descriptions in this notebook are based on the analysis that was run on the 2022 data. If the notebook is run again on different data, then these comments are no longer valid. 

In [None]:
# Import libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.dates as mdates
import matplotlib.cm
from datetime import datetime
from datetime import timedelta
from statistics import mean
import time
# Silence pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None

# Import data and re-adjust it so that it is ready to use. Please refer to the cleaning notebook to understand why each of these steps is undertaken.
evides = pd.read_csv('../Data/Cleaned data/evides_cleaned.csv')
evides = evides.drop(['Unnamed: 0'], axis=1)
evides[['Wijk','ENI','Month_number']] = evides[['Wijk','ENI','Month_number']].astype('str')
evides['Month'] = pd.Categorical(evides['Month'], categories=['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'], ordered=True)
evides = evides.sort_values(by='Datum')
evides.drop_duplicates(inplace=True)

# Add leading zeros back: every ENI that is 7 characters long gets a '0' prepended.
# A single masked .loc assignment replaces the original per-row loop with chained
# assignment, which was quadratic and is not guaranteed to write through to the frame.
seven_char_eni = evides['ENI'].str.len() == 7
evides.loc[seven_char_eni, 'ENI'] = '0' + evides.loc[seven_char_eni, 'ENI']

### Analysing the data
Now that we've taken care of NAs and invalid values, we can start looking at the data. Let's start with some general statistics.

In [None]:
# Data that we want to plot: the months present in the data and the number of shipments in each.
# Both names are reused by later cells.
months = evides['Month'].values.unique()
counts = [len(evides[evides['Month']==x]) for x in months]

# Plot
plt.figure(figsize=(14,9),dpi=100)
plt.bar(months, counts, color="steelblue", ec='black')
plt.ylabel('# Water shipments')
plt.xlabel('Month')
plt.title('Count of water shipments throughout the year')
# Call plt.grid; assigning `plt.grid = True` would overwrite the function and draw no grid.
plt.grid(True)

# Then save a high-quality version of this image
plt.savefig('../Plots/Evides/count_watershipments_by_month.png', dpi=400)

And what about the total amount of liters?

In [None]:
# Data that we want to plot: total delivered volume per month.
months = evides['Month'].values.unique()
liters = [evides['Hoeveelheid (m3)'][evides['Month']==x].sum() for x in months]

# Number of shipments per month, computed here so the labels do not depend on another cell's `counts`.
shipment_counts = [len(evides[evides['Month']==x]) for x in months]

# Create months2: month names annotated with the number of shipments (reused by the next cell).
months2 = [month + "\n $n$=" + str(n) for month, n in zip(months, shipment_counts)]

# Plot
fig, ax = plt.subplots(1, 1, figsize=(14,9),dpi=100)
ax.bar(x=months2, height=liters, color="darkturquoise", ec='black')
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos:
    '{:.1f}'.format(y) + ' $M^3$'))

ax.set_ylabel('Water quantity (in $M^3$)')
ax.set_xlabel('Month')
ax.set_title('Total amount of water delivered per month')
# Call the grid method; `plt.grid = True` would only overwrite the pyplot function.
ax.grid(True)

# Then save a high-quality version of this image (before showing, so the saved figure is not empty)
plt.savefig('../Plots/Evides/total_liters_by_month.png', dpi=400)

plt.show()

And the average amount of liters per shipment?

In [None]:
# Data that we want to plot: mean delivered volume per shipment, per month.
months = evides['Month'].values.unique()
liters = [evides['Hoeveelheid (m3)'][evides['Month']==x].mean() for x in months]

# Plot
fig, ax = plt.subplots(1, 1, figsize=(14,9),dpi=100)

# months2 (month names annotated with shipment counts) is defined in the previous cell.
ax.bar(x=months2, height=liters, color="darkturquoise", ec='black')
# Format the tick value itself; truncating with int() first would label a tick at 2.5 as "2.0".
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos:
    '{:.1f}'.format(y) + ' $M^3$'))

ax.set_ylabel('Water quantity (in $M^3$)')
ax.set_xlabel('Month')
ax.set_title('Average amount of water delivered per shipment per month')
# Call the grid method; `plt.grid = True` would only overwrite the pyplot function.
ax.grid(True)

# Then save a high-quality version of this image (before showing, so the saved figure is not empty)
plt.savefig('../Plots/Evides/average_liters_by_month.png', dpi=400)

plt.show()

How do the amounts delivered differ based on customer ship type?

In [None]:
# Data that we want to plot: number of shipments per ship type, one value per month.
# unique() keeps the order of first appearance, so the stack/legend order is the same on every run
# (iterating a set of strings is not).
data = {}
for ship_type in evides['Scheepstype'].unique():
    data[ship_type] = [len(evides[(evides['Scheepstype'] == ship_type) & (evides['Month']==y)]) for y in months]

# Plot
fig, ax = plt.subplots(figsize=(16,9),dpi=100)
# One running total per plotted month (not a hard-coded 12), so partial-year data also works.
bottom = np.zeros(len(months))

for ship_type, shipment_counts in data.items():
    ax.bar(months, shipment_counts, 0.5, label=ship_type, ec='black', bottom=bottom)
    bottom += shipment_counts

# Shrink the axes vertically to make room for the legend underneath.
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.15,
                 box.width, box.height * 0.85])

ax.legend(loc="upper center", bbox_to_anchor=(0.5, -0.10), ncol=4, fancybox=3, shadow=True, title="Ship type")
ax.set_title('Count of water shipments by ship type')
ax.set_ylabel('# of shipments')
ax.set_xlabel('Month')

# Then save a high-quality version of this image
plt.savefig('../Plots/Evides/count_watershipments_by_month_per_shiptype.png', dpi=400)

And what about the amount of water (in $M^3$) delivered?

In [None]:
# Data that we want to plot: total delivered volume per ship type, one value per month.
# unique() keeps the order of first appearance, so the stack/legend order is the same on every run.
data = {}
for ship_type in evides['Scheepstype'].unique():
    data[ship_type] = [evides['Hoeveelheid (m3)'][(evides['Scheepstype'] == ship_type) & (evides['Month']==y)].sum() for y in months]

# Plot
fig, ax = plt.subplots(figsize=(16,9),dpi=100)
# One running total per plotted month (not a hard-coded 12), so partial-year data also works.
bottom = np.zeros(len(months))

for ship_type, monthly_totals in data.items():
    ax.bar(months, monthly_totals, 0.5, label=ship_type, ec='black', bottom=bottom)
    bottom += monthly_totals

# Shrink the axes vertically to make room for the legend underneath.
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.15,
                 box.width, box.height * 0.85])

ax.legend(loc="upper center", bbox_to_anchor=(0.5, -0.10), ncol=4, fancybox=3, shadow=True, title="Ship type")
# Format the tick value itself rather than a truncated integer.
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos:
    '{:.1f}'.format(y) + ' $M^3$'))
ax.set_title('Total amount of water delivered per shiptype')
ax.set_ylabel('Water quantity (in $M^3$)')
ax.set_xlabel('Month')

# Then save a high-quality version of this image
plt.savefig('../Plots/Evides/total_liters_by_month_per_shiptype.png', dpi=400)

So, what about the counts divided by district?

In [None]:
# Data that we want to plot: number of shipments per district, one value per month.
# Districts are sorted so the stack/legend order is stable across runs (a set of strings is not).
data = {}
for district in sorted(evides['Wijk'].unique()):
    data[district] = [len(evides[(evides['Wijk'] == district) & (evides['Month']==y)]) for y in months]

# Plot
fig, ax = plt.subplots(figsize=(16,9),dpi=100)
# One running total per plotted month (not a hard-coded 12), so partial-year data also works.
bottom = np.zeros(len(months))

for district, shipment_counts in data.items():
    ax.bar(months, shipment_counts, 0.5, label=district, ec='black', bottom=bottom)
    bottom += shipment_counts

# Shrink the axes vertically to make room for the legend underneath.
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.15,
                 box.width, box.height * 0.85])

ax.legend(loc="upper center", bbox_to_anchor=(0.5, -0.10), ncol=4, fancybox=3, shadow=True, title="District")
ax.set_title('Count of water shipments')
ax.set_ylabel('# of shipments')
ax.set_xlabel('Month')

# Then save a high-quality version of this image
plt.savefig('../Plots/Evides/count_watershipments_by_month_per_district.png', dpi=400)

And the total liters divided by district?

In [None]:
# Data that we want to plot: total delivered volume per district, one value per month.
# Districts are sorted so the stack/legend order is stable across runs (a set of strings is not).
data = {}
for district in sorted(evides['Wijk'].unique()):
    data[district] = [evides['Hoeveelheid (m3)'][(evides['Wijk'] == district) & (evides['Month']==y)].sum() for y in months]

# Plot
fig, ax = plt.subplots(figsize=(16,9),dpi=100)
# One running total per plotted month (not a hard-coded 12), so partial-year data also works.
bottom = np.zeros(len(months))

for district, monthly_totals in data.items():
    ax.bar(months, monthly_totals, 0.5, label=district, ec='black', bottom=bottom)
    bottom += monthly_totals

# Shrink the axes vertically to make room for the legend underneath.
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.15,
                 box.width, box.height * 0.85])

ax.legend(loc="upper center", bbox_to_anchor=(0.5, -0.10), ncol=4, fancybox=3, shadow=True, title="District")
# Format the tick value itself rather than a truncated integer.
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos:
    '{:.1f}'.format(y) + ' $M^3$'))
ax.set_title('Total amount of water delivered per district')
ax.set_ylabel('Water quantity (in $M^3$)')
ax.set_xlabel('Month')

# Then save a high-quality version of this image
plt.savefig('../Plots/Evides/total_liters_by_month_per_district.png', dpi=400)

What about the average amount of water delivered per district?

In [None]:
# Data that we want to plot: mean delivered volume per shipment for every district.
# `liters` and `divisions` are reused by later cells.
liters = evides.groupby('Wijk')['Hoeveelheid (m3)'].mean()
divisions = ["District " + str(wijk) for wijk in liters.index]

# Plot
fig, ax = plt.subplots(1, 1, figsize=(14,9),dpi=100)

ax.bar(x=divisions, height=liters.values, color="darkturquoise", ec='black')
# Format the tick value itself; truncating with int() first would label a tick at 2.5 as "2.0".
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos:
    '{:.1f}'.format(y) + ' $M^3$'))

ax.set_ylabel('Water quantity (in $M^3$)')
ax.set_xlabel('District')
ax.set_title('Average amount of water delivered per district')
# Call the grid method; `plt.grid = True` would only overwrite the pyplot function.
ax.grid(True)

# Then save a high-quality version of this image (before showing, so the saved figure is not empty)
plt.savefig('../Plots/Evides/average_liters_by_district.png', dpi=400)

plt.show()

In [None]:
# Display the per-district mean delivered volume computed in the previous cell.
liters

Interestingly enough, we find that the average amount of water delivered per district differs significantly for district 1. 

In [None]:
# Data that we want to plot: number of shipments per ship type, one value per district.
# Districts are derived from the data (sorted) instead of being hard-coded to '1'..'4'.
districts = sorted(evides['Wijk'].unique())
district_labels = ["District " + str(d) for d in districts]

# unique() keeps the order of first appearance, so the stack/legend order is the same on every run.
data = {}
for ship_type in evides['Scheepstype'].unique():
    data[ship_type] = [len(evides[(evides['Wijk'] == d) & (evides['Scheepstype']==ship_type)]) for d in districts]

# Plot
fig, ax = plt.subplots(figsize=(16,9),dpi=100)
bottom = np.zeros(len(districts))

for ship_type, shipment_counts in data.items():
    ax.bar(district_labels, shipment_counts, 0.5, label=ship_type, ec='black', bottom=bottom)
    bottom += shipment_counts

# Shrink the axes vertically to make room for the legend underneath.
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.15,
                 box.width, box.height * 0.85])

ax.legend(loc="upper center", bbox_to_anchor=(0.5, -0.10), ncol=4, fancybox=3, shadow=True, title="Ship type")
ax.set_title('Count of water shipments by ship type and per district')
ax.set_ylabel('# of shipments')
ax.set_xlabel('District')

# Then save a high-quality version of this image
plt.savefig('../Plots/Evides/count_watershipments_by_month_per_district_per_shiptype.png', dpi=400)

In [None]:
# Data that we want to plot: total delivered volume per ship type, one value per district.
# Districts are derived from the data (sorted) instead of being hard-coded to '1'..'4'.
districts = sorted(evides['Wijk'].unique())
district_labels = ["District " + str(d) for d in districts]

# unique() keeps the order of first appearance, so the stack/legend order is the same on every run.
data = {}
for ship_type in evides['Scheepstype'].unique():
    data[ship_type] = [evides['Hoeveelheid (m3)'][(evides['Wijk'] == d) & (evides['Scheepstype']==ship_type)].sum() for d in districts]

# Plot
fig, ax = plt.subplots(figsize=(16,9),dpi=100)
bottom = np.zeros(len(districts))

for ship_type, district_totals in data.items():
    ax.bar(district_labels, district_totals, 0.5, label=ship_type, ec='black', bottom=bottom)
    bottom += district_totals

# Shrink the axes vertically to make room for the legend underneath.
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.15,
                 box.width, box.height * 0.85])

ax.legend(loc="upper center", bbox_to_anchor=(0.5, -0.10), ncol=4, fancybox=3, shadow=True, title="Ship type")
# Format the tick value itself rather than a truncated integer.
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos:
    '{:.1f}'.format(y) + ' $M^3$'))
ax.set_title('Total amount of water delivered per district')
ax.set_ylabel('Water quantity (in $M^3$)')
ax.set_xlabel('District')

# Then save a high-quality version of this image
plt.savefig('../Plots/Evides/total_liters_by_district_per_shiptype.png', dpi=400)

What is causing this significant difference between district 1 and district 2? 

In [None]:
# Define refugee boats (by ENI)
refugee_boats = ['02326758','02311635', '07001727', '07000661', '07001417', '05111630', '07001515']

is_refugee_boat = evides['ENI'].isin(refugee_boats)
is_passenger_ship = evides['Scheepstype'] == 'Passagiersschip, opleidingschip'

# Define refugee_boat column with three classes: refugee boats, other passenger ships, everything else.
# .loc[mask, column] writes into the frame itself; the chained form evides[col][mask] = ...
# may write into a temporary copy instead.
evides['Refugee_boat'] = 'Other'
evides.loc[is_refugee_boat, 'Refugee_boat'] = "Refugee boat"
evides.loc[~is_refugee_boat & is_passenger_ship, 'Refugee_boat'] = 'Other passenger ship'

In [None]:
# Data that we want to plot: total delivered volume per 'Refugee_boat' class, one value per month.
# unique() keeps the order of first appearance, so the stack/legend order is the same on every run.
data = {}
for boat_class in evides['Refugee_boat'].unique():
    data[boat_class] = [evides['Hoeveelheid (m3)'][(evides['Refugee_boat'] == boat_class) & (evides['Month']==y)].sum() for y in months]

# Plot
fig, ax = plt.subplots(figsize=(16,9),dpi=100)
# One running total per plotted month (not a hard-coded 12), so partial-year data also works.
bottom = np.zeros(len(months))

fig.suptitle("Total amount of water delivered per month", y = 0.95)

for boat_class, monthly_totals in data.items():
    ax.bar(months, monthly_totals, 0.5, label=boat_class, ec='black', bottom=bottom)
    bottom += monthly_totals

# Shrink the axes vertically to make room for the legend underneath.
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.15,
                 box.width, box.height * 0.85])

ax.legend(loc="upper center", bbox_to_anchor=(0.5, -0.10), ncol=4, fancybox=3, shadow=True, title="Ship type")
# Format the tick value itself rather than a truncated integer.
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos:
    '{:.1f}'.format(y) + ' $M^3$'))

# Share of volume and share of shipments for every row not classified as "Other".
passenger_amounts = evides['Hoeveelheid (m3)'][(evides.Refugee_boat!="Other")]
ax.set_title('Passenger ships (including refugee boats) account for {s1}% of the total water delivered, yet account for only {s2}% of total shipments.'.format(
    s1 = round(passenger_amounts.sum() / evides['Hoeveelheid (m3)'].sum() * 100, 2),
    s2 = round(passenger_amounts.count() / evides['Hoeveelheid (m3)'].count() * 100, 2)
    ))
ax.set_ylabel('Water quantity (in $M^3$)')
ax.set_xlabel('Month')

# Then save a high-quality version of this image
plt.savefig('../Plots/Evides/total_liters_by_refugees.png', dpi=400)

Given these large quantities, refugee boats appear to have quite an effect on the water shipments. This is something that will need to be taken into account. Let's take a further look at the distribution of shipments. 

In [None]:
# Create new column called 'Shipment_size' which determines whether a shipment is larger than 6 m3.
# .loc[mask, column] writes into the frame itself; evides[col].loc[mask] = ... is chained
# assignment and may write into a temporary copy instead.
evides['Shipment_size'] = 'Larger than 6 $M^3$'
evides.loc[evides['Hoeveelheid (m3)'] <= 6, 'Shipment_size'] = 'Smaller or equal to 6 $M^3$'

In [None]:
# One ordered list of shipment-size classes drives both the bar heights and the tick labels,
# so the two can never get out of step (two separate set() iterations gave no such guarantee).
shipment_sizes = list(evides['Shipment_size'].unique())

# Data that we want to plot: total delivered volume per ship type, one value per size class.
data = {}
for ship_type in evides['Scheepstype'].unique():
    data[ship_type] = [
        evides['Hoeveelheid (m3)'][(evides['Scheepstype'] == ship_type) & (evides['Shipment_size'] == size)].values.sum()
        for size in shipment_sizes
    ]

# Define categories: size class annotated with its number of shipments and share of all shipments.
total_shipments = evides['Hoeveelheid (m3)'].count()
categories = []
for size in shipment_sizes:
    n_shipments = evides['Hoeveelheid (m3)'][evides.Shipment_size == size].count()
    categories.append(size + "\n N={n} ({per}%)".format(
        n=n_shipments,
        per=round(n_shipments / total_shipments * 100, 2)))

# Plot
fig, ax = plt.subplots(figsize=(10, 12),dpi=100)
bottom = np.zeros(len(categories))

for ship_type, size_totals in data.items():
    ax.bar(categories, size_totals, 0.5, label=ship_type, ec='black', bottom=bottom)
    bottom += size_totals

# Shrink the axes vertically to make room for the legend underneath.
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.15,
                 box.width, box.height * 0.85])

ax.legend(loc="upper center", bbox_to_anchor=(0.5, -0.10), ncol=2, fancybox=3, shadow=True, title="Ship type")
# Format the tick value itself rather than a truncated integer.
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos:
    '{:.1f}'.format(y) + ' $M^3$'))
ax.set_title('Total amount of water delivered per shiptype')
ax.set_ylabel('Water quantity (in $M^3$)')
ax.set_xlabel('Type of shipment')

# Then save a high-quality version of this image
plt.savefig('../Plots/Evides/total_liters_by_shiptype_and_shipment_type.png', dpi=400)

This is quite interesting. We observe that each ship category is represented almost entirely (if not fully) in only one of the shipment size categories. This information could be crucially important in deciding ship routes, since the types of ships that are served on a route has a significant impact on the amount of water that the ship will need. Now, let's take a look at the amount of boats used per day.

In [None]:
# Every date in the data, in order of first appearance; `dates` and `active_ships` are index-aligned
# and are reused by the next cell.
dates = list(evides['Datum'].unique())

# Number of distinct water boats that made a delivery on each date.
active_ships = [len(set(evides['Waterboot'][evides.Datum == day])) for day in dates]

# Number of days for every observed "boats active" value, in ascending order.
ship_numbers = sorted(set(active_ships))
counts = [active_ships.count(x) for x in ship_numbers]
categories = [
    str(x) + "\n N={n} ({per}%)".format(n=n_days, per=round(n_days / sum(counts) * 100, 2))
    for x, n_days in zip(ship_numbers, counts)
]

# Plot
plt.figure(figsize=(14,9),dpi=100)
plt.bar(categories, counts, color="gold", ec='black')
plt.ylabel('# days')
plt.xlabel('Amount of ships')
plt.title('Amount of active ships on a given day')
# Call plt.grid; assigning `plt.grid = True` would overwrite the function and draw no grid.
plt.grid(True)

# Then save a high-quality version of this image
plt.savefig('../Plots/Evides/count_activeships.png', dpi=400)

It is quite unexpected to see days on which only one or two boats were active. Let's dive into these days a bit more. 

In [None]:
# Dates on which fewer than four boats were active (dates and active_ships are index-aligned).
dates_of_interest = [day for day, n_ships in zip(dates, active_ships) if n_ships < 4]

# For each of those dates: the weekday name and the total volume delivered that day.
weekday_names = [datetime.strptime(day, '%Y-%m-%d').strftime('%A') for day in dates_of_interest]
daily_totals = [evides['Hoeveelheid (m3)'][evides['Datum'] == day].sum() for day in dates_of_interest]

df = pd.DataFrame(
    {'Weekday': weekday_names, 'Amount of water supplied': daily_totals},
    index=dates_of_interest,
)

df

This table perfectly explains each of the days on which a low amount of ships were active.
* Saturdays and Sundays are not part of the regular working days.
* Monday 06-06-2022 is 'Tweede Pinksterdag', a holiday.
* Friday 18-02-2022 saw a very bad storm called 'Eunice'. 

Now, let's take a look at the used capacities of each boat. First, let us define the water tank capacities of each boat. 

In [None]:
# Water tank capacity of every supply boat, keyed by the boat names used in the 'Waterboot' column.
capacities = dict(
    Waterbuffel=103,
    Watergeus=103,
    Waterman=83,
    Waterval=83,
    Watervogel=140,
)

Then, we'll have to do some calculations in order to find out the used capacities throughout the year. We can filter out the dates found above, since they contain special days with very low numbers. 

In [None]:
# Keep only the shipments that did not take place on one of the low-activity dates found above.
on_special_day = evides['Datum'].isin(dates_of_interest)
evides = evides[~on_special_day]

In [None]:
# Per date and per boat: the delivered volume ('Liters') and that volume as a fraction of the
# boat's tank capacity ('Capacity', None when the boat delivered nothing that day).
data = {}

for day in list(set(evides['Datum'].values)):
    delivered_by_boat = {}
    usage_by_boat = {}

    for boat_name in list(set(evides['Waterboot'].values)):
        delivered = evides['Hoeveelheid (m3)'][(evides['Waterboot']==boat_name) & (evides['Datum']==day)].sum()
        delivered_by_boat[boat_name] = delivered
        usage_by_boat[boat_name] = round(delivered / capacities[boat_name], 2) if delivered > 0 else None

    data[day] = {'Liters': delivered_by_boat, 'Capacity': usage_by_boat}

# Empty summary table (one row per boat) that the per-boat cells below fill in.
df = pd.DataFrame(
    0,
    columns=['Capacity (in M3)','Active days','Average daily capacity','Average total daily water amount (in M3)'],
    index=list(set(evides['Waterboot'].values)),
)

Now, let's use this data to create some plots! We'll start with the Waterman!

In [None]:
boat = 'Waterman'

# Create hist data: daily capacity usage on the days this boat was active
hist_data = [data[date]['Capacity'][boat] for date in list(set(evides['Datum'].values)) if data[date]['Capacity'][boat] is not None]

# Plot hist data
fig, ax = plt.subplots(figsize=(10, 12),dpi=100)

ax.hist(hist_data, ec="black", color="cornflowerblue", bins=25)
ax.set_title("Distribution of the daily water tank capacity usage of the Waterman (DWS 14) \n (Water capacity = {l} $m^3$, mean daily capacity = {meancap}, active days = {ndays})".format(
    l=capacities[boat],
    meancap=round(mean(hist_data),2),
    ndays=len(hist_data)
    ))
ax.set_xlabel("Water tank capacity")
ax.set_ylabel("# Days")
# Call the method; `ax.grid = True` would replace it with a bool and draw no grid.
ax.grid(True)

# Then save a high-quality version of this image
plt.savefig('../Plots/Evides/capacity_histogram_water_Waterman.png', dpi=400)

# Also add information to the dataframe: capacity, active days, mean usage, mean daily volume
liter_data = [data[date]['Liters'][boat] for date in list(set(evides['Datum'].values)) if data[date]['Liters'][boat] not in [None, 0]]
df.loc[boat] = [capacities[boat], len(hist_data), round(mean(hist_data),2), round(mean(liter_data),2)]

Next, the Waterbuffel!

In [None]:
boat = 'Waterbuffel'

# Collect, over all dates, the capacity usage (days the boat was active) and the delivered
# volume (days with a non-zero volume) of this boat.
hist_data = []
liter_data = []
for date in set(evides['Datum'].values):
    usage = data[date]['Capacity'][boat]
    if usage is not None:
        hist_data.append(usage)

    delivered = data[date]['Liters'][boat]
    if delivered not in [None, 0]:
        liter_data.append(delivered)

mean_usage = round(mean(hist_data), 2)

# Histogram of the daily capacity usage
fig, ax = plt.subplots(figsize=(10, 12), dpi=100)
ax.hist(hist_data, ec="black", color="cornflowerblue", bins=25)
ax.set_title(
    "Distribution of the daily water tank capacity usage of the Waterbuffel (DWS 11) \n (Water capacity = {l} $m^3$, mean daily capacity = {meancap}, active days = {ndays})".format(
        l=capacities[boat], meancap=mean_usage, ndays=len(hist_data)))
ax.set_xlabel("Water tank capacity")
ax.set_ylabel("# Days")

# Then save a high-quality version of this image
plt.savefig('../Plots/Evides/capacity_histogram_water_Waterbuffel.png', dpi=400)

# Summary row: capacity, active days, mean usage, mean daily volume
df.loc[boat] = [capacities[boat], len(hist_data), mean_usage, round(mean(liter_data), 2)]

And the Watergeus!

In [None]:
boat = 'Watergeus'

# Collect, over all dates, the capacity usage (days the boat was active) and the delivered
# volume (days with a non-zero volume) of this boat.
hist_data = []
liter_data = []
for date in set(evides['Datum'].values):
    usage = data[date]['Capacity'][boat]
    if usage is not None:
        hist_data.append(usage)

    delivered = data[date]['Liters'][boat]
    if delivered not in [None, 0]:
        liter_data.append(delivered)

mean_usage = round(mean(hist_data), 2)

# Histogram of the daily capacity usage
fig, ax = plt.subplots(figsize=(10, 12), dpi=100)
ax.hist(hist_data, ec="black", color="cornflowerblue", bins=25)
ax.set_title(
    "Distribution of the daily water tank capacity usage of the Watergeus (DWS 10) \n (Water capacity = {l} $m^3$, mean daily capacity = {meancap}, active days = {ndays})".format(
        l=capacities[boat], meancap=mean_usage, ndays=len(hist_data)))
ax.set_xlabel("Water tank capacity")
ax.set_ylabel("# Days")

# Then save a high-quality version of this image
plt.savefig('../Plots/Evides/capacity_histogram_water_Watergeus.png', dpi=400)

# Summary row: capacity, active days, mean usage, mean daily volume
df.loc[boat] = [capacities[boat], len(hist_data), mean_usage, round(mean(liter_data), 2)]

And the Waterval

In [None]:
boat = 'Waterval'

# Collect, over all dates, the capacity usage (days the boat was active) and the delivered
# volume (days with a non-zero volume) of this boat.
hist_data = []
liter_data = []
for date in set(evides['Datum'].values):
    usage = data[date]['Capacity'][boat]
    if usage is not None:
        hist_data.append(usage)

    delivered = data[date]['Liters'][boat]
    if delivered not in [None, 0]:
        liter_data.append(delivered)

mean_usage = round(mean(hist_data), 2)

# Histogram of the daily capacity usage
fig, ax = plt.subplots(figsize=(10, 12), dpi=100)
ax.hist(hist_data, ec="black", color="cornflowerblue", bins=25)
ax.set_title(
    "Distribution of the daily water tank capacity usage of the Waterval (DWS 15) \n (Water capacity = {l} $m^3$, mean daily capacity = {meancap}, active days = {ndays})".format(
        l=capacities[boat], meancap=mean_usage, ndays=len(hist_data)))
ax.set_xlabel("Water tank capacity")
ax.set_ylabel("# Days")

# Then save a high-quality version of this image
plt.savefig('../Plots/Evides/capacity_histogram_water_Waterval.png', dpi=400)

# Summary row: capacity, active days, mean usage, mean daily volume
df.loc[boat] = [capacities[boat], len(hist_data), mean_usage, round(mean(liter_data), 2)]

And finally, the Watervogel.

In [None]:
boat = 'Watervogel'

# Create hist data: daily capacity usage on the days this boat was active
hist_data = [data[date]['Capacity'][boat] for date in list(set(evides['Datum'].values)) if data[date]['Capacity'][boat] is not None]

# Calculate statistic: share of active days with a capacity usage of at most 0.72.
# Multiply before rounding so the printed percentage has no floating-point artefacts
# (round(x, 4) * 100 can print values such as 56.99999999999999).
share_at_most_072 = len([x for x in hist_data if x <= 0.72]) / len(hist_data)
print("Percentage of days on which the Watervogel used at most 0.72 of its capacity:", str(round(share_at_most_072 * 100, 2)))

# Plot hist data
fig, ax = plt.subplots(figsize=(10, 12),dpi=100)

ax.hist(hist_data, ec="black", color="cornflowerblue", bins=25)
ax.set_title("Distribution of the daily water tank capacity usage of the Watervogel (DWS 12) \n (Water capacity = {l} $m^3$, mean daily capacity = {meancap}, active days = {ndays})".format(
    l=capacities[boat],
    meancap=round(mean(hist_data),2),
    ndays=len(hist_data)
    ))
ax.set_xlabel("Water tank capacity")
ax.set_ylabel("# Days")
# Show day counts on the y-axis without decimals.
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos:
    '{:.0f}'.format(int(y))))

# Then save a high-quality version of this image
plt.savefig('../Plots/Evides/capacity_histogram_water_Watervogel.png', dpi=400)

# Also add information to the dataframe: capacity, active days, mean usage, mean daily volume
liter_data = [data[date]['Liters'][boat] for date in list(set(evides['Datum'].values)) if data[date]['Liters'][boat] not in [None, 0]]
df.loc[boat] = [capacities[boat], len(hist_data), round(mean(hist_data),2), round(mean(liter_data),2)]

We've also created a table that summarizes the information in the plots above.

In [None]:
# Display the per-boat summary table that the histogram cells above filled in row by row.
df

Finally, let's take a look at the amount of boats per district

In [None]:
# Districts in sorted order. The row labels below are derived from this same list, so a row can
# never be labelled with the wrong district (relabelling a set-ordered index positionally could).
districts = sorted(set(evides['Wijk'].values))
all_dates = list(set(evides['Datum'].values))

wijken = pd.DataFrame(0, index=districts, columns=["1 boat", "2 boats", "3 boats", "4 boats", "5 boats"])

for district in districts:
    # Number of distinct boats that delivered in this district, for every date.
    n_boats = [
        len(set(evides['Waterboot'][(evides['Datum']==day) & (evides['Wijk']==district)].values))
        for day in all_dates
    ]

    # Number of days on which exactly 1, 2, ..., 5 boats were active in this district.
    wijken.loc[district] = [n_boats.count(n) for n in range(1,6)]

pd.options.mode.chained_assignment = None
evides['Datum'] = [datetime.strptime(x, '%Y-%m-%d') for x in evides.Datum] # In order to make comparisons, we'll need to use a datetime format

wijken.index = ['District ' + str(district) for district in districts]
wijken

The results make some sense, since most of the boats start and end their day in district 1. However, even in the other districts there are quite some days on which two, or even three boats are active. This is something to consider when answering the research questions. 

Let's create some more interesting tables. This table shows some insights with regards to the different metrics for different vessel types. 

In [None]:
# Names of the five supply boats; reused by later cells.
boten = ['Waterbuffel', 'Watergeus', 'Waterman', 'Waterval', 'Watervogel']

summary_columns = ['Amount of shipments', 'Share in total shipments', 'Total amount of water supplied', 'Share in total amount of water supplied', 'Mean amount of water per shipment']
total_shipments = len(evides)
total_water = evides['Hoeveelheid (m3)'].sum()

# One row per ship type, in order of first appearance so the table looks the same on every run.
rows = {}
for ship_type in evides['Scheepstype'].unique():
    amounts = evides['Hoeveelheid (m3)'][evides['Scheepstype']==ship_type]
    rows[ship_type] = [
        len(amounts),
        # Multiply before rounding so the percentages show no floating-point artefacts.
        round(len(amounts) / total_shipments * 100, 2),
        amounts.sum(),
        round(amounts.sum() / total_water * 100, 2),
        round(amounts.mean(), 2),
    ]

df = pd.DataFrame.from_dict(rows, orient='index', columns=summary_columns)

# Total row: counts, volumes and shares add up; the mean column gets the overall mean per
# shipment, because a sum of per-type means has no meaning.
totals = [df[column].sum() for column in summary_columns[:-1]]
totals.append(round(evides['Hoeveelheid (m3)'].mean(), 2))
df.loc['Total'] = totals
df

And another one. A table that shows the % of water delivered per district and ship type (in $M^3$). 

In [None]:
# Share (in %) of each district's delivered volume that went to each ship type.
district_ids = ['1', '2', '3', '4']
df = pd.DataFrame(0, columns=['District 1', 'District 2', 'District 3', 'District 4'], index=list(set(evides['Scheepstype'].values)))

for ship_type in df.index:
    shares = []
    for district in district_ids:
        type_volume = evides['Hoeveelheid (m3)'][(evides['Scheepstype']==ship_type) & (evides['Wijk']==district)].sum()
        district_volume = evides['Hoeveelheid (m3)'][evides['Wijk']==district].sum()
        shares.append(round(type_volume / district_volume, 4) * 100)
    df.loc[ship_type] = shares

# Column sums as a sanity check
df.loc['Total'] = [df[column].sum() for column in df]
df

And one more. Same table, but showing the % of water shipments per district and ship type. 

In [None]:
# Share (in %) of each district's number of shipments that went to each ship type.
district_ids = ['1', '2', '3', '4']
df = pd.DataFrame(0, columns=['District 1', 'District 2', 'District 3', 'District 4'], index=list(set(evides['Scheepstype'].values)))

for ship_type in df.index:
    shares = []
    for district in district_ids:
        n_type = len(evides[(evides['Scheepstype']==ship_type) & (evides['Wijk']==district)])
        n_district = len(evides[evides['Wijk']==district])
        shares.append(round(n_type / n_district, 4) * 100)
    df.loc[ship_type] = shares

# Column sums as a sanity check
df.loc['Total'] = [df[column].sum() for column in df]
df

Let's also look at the capacities on days with 4 boats active and with 5 boats active, separately.

In [None]:
# All dates in the data, and per boat one list of daily capacity usages for days with
# 4 active boats and one for days with 5 active boats.
dates = list(set(evides['Datum'].values))
split_capacities = {boat_name: {4: [], 5: []} for boat_name in boten}

for day in dates:
    on_day = evides.Datum == day

    # Number of distinct boats that delivered on this day
    n_boats = len(set(evides['Waterboot'][on_day]))

    for j in boten:
        cap = round(evides['Hoeveelheid (m3)'][(evides['Waterboot'] == j) & on_day].sum() / capacities[j], 2)

        # Only record boats that actually delivered water on this day
        if cap > 0:
            split_capacities[j][n_boats].append(cap)

split_capacities

In [None]:
# Mean daily capacity usage per boat, once for 4-boat days and once for 5-boat days.
mean_usage_by_boat_count = {
    n_active: [round(mean(split_capacities[boat_name][n_active]), 2) for boat_name in boten]
    for n_active in (4, 5)
}
row1 = mean_usage_by_boat_count[4]
row2 = mean_usage_by_boat_count[5]

df = pd.DataFrame([row1, row2], index=[4, 5], columns=boten)
df

No significant differences here! What about on days where the watervogel specifically is inactive? 

In [None]:
# Daily capacity usage per boat, restricted to 4-boat days on which the Watervogel was inactive.
split_capacities = {
                    'Waterbuffel':[],
                    'Watergeus':[],
                    'Waterman':[],
                    'Waterval':[],
                    'Watervogel':[]
                    }

for i in dates:
    on_day = evides['Datum'] == i
    n_boats = len(set(evides['Waterboot'][on_day]))
    # Use the Watervogel's own capacity explicitly instead of whatever value a loop variable
    # happened to hold from an earlier loop.
    cap_watervogel = round(evides['Hoeveelheid (m3)'][(evides['Waterboot'] == 'Watervogel') & on_day].sum() / capacities['Watervogel'], 2)

    if n_boats == 4 and cap_watervogel == 0:
        for j in boten:
            cap = round(evides['Hoeveelheid (m3)'][(evides['Waterboot'] == j) & on_day].sum() / capacities[j], 2)

            # Only record boats that actually delivered water on this day
            if cap > 0:
                split_capacities[j].append(cap)

# Add it to our previous table. A boat without any qualifying day gets None instead of
# letting mean() raise on an empty list; the Watervogel itself has no value by construction.
row3 = [
    round(mean(split_capacities[boat_name]), 2) if split_capacities[boat_name] else None
    for boat_name in ['Waterbuffel', 'Watergeus', 'Waterman', 'Waterval']
]
row3.append(None)
df.loc['Watervogel (DWS 12) inactive'] = row3

df

Let's turn it into a line plot. 

In [None]:
# Per date and per boat: the delivered volume ('Liters') and that volume as a fraction of the
# boat's tank capacity ('Capacity', None when the boat delivered nothing that day).
data = {}

for day in list(set(evides['Datum'].values)):
    delivered_by_boat = {}
    usage_by_boat = {}

    for boat_name in list(set(evides['Waterboot'].values)):
        delivered = evides['Hoeveelheid (m3)'][(evides['Waterboot']==boat_name) & (evides['Datum']==day)].sum()
        delivered_by_boat[boat_name] = delivered
        usage_by_boat[boat_name] = round(delivered / capacities[boat_name], 2) if delivered > 0 else None

    data[day] = {'Liters': delivered_by_boat, 'Capacity': usage_by_boat}

# Initialize dataframe: one row per date, one column per boat, holding the capacity usage
df = pd.DataFrame(0, columns=boten, index=dates)

for day in dates:
    df.loc[day] = [data[day]['Capacity'][boat] for boat in boten]

# Helper columns for plotting: calendar day, a constant 1.0 reference line and the month name
df['Day'] = [stamp.date() for stamp in df.index]
df['Cap'] = [1] * len(df.index)
df['Month'] = [stamp.strftime('%B') for stamp in df.index]
df = df.sort_values(by='Day', ascending=True)

Now, let's create our line plot. 

In [None]:
# Marker colour for every boat, in plotting order.
boat_colours = [
    ("Waterman", 'c'),
    ("Watervogel", 'b'),
    ("Waterbuffel", 'm'),
    ("Waterval", 'y'),
    ("Watergeus", 'k'),
]

# One figure per month: a dot per boat per day, plus a horizontal line at capacity 1.0.
for month_name in months:
    fig, ax = plt.subplots(figsize=(16,9), dpi=100)

    in_month = df['Month'] == month_name
    days = df['Day'][in_month]

    for boat_name, colour in boat_colours:
        ax.plot(days, df[boat_name][in_month], label=boat_name, linestyle='none', marker='o', color=colour)
    ax.plot(days, df['Cap'][in_month], label="Capacity of 1.0", linestyle='solid', color='r')

    ax.legend(loc="upper center", bbox_to_anchor=(0.5, -0.10), ncol=3, fancybox=3, shadow=True, title="Water boat")
    ax.set_xlabel('Dates')
    ax.set_ylabel('Capacity')

    # Shrink the axes vertically to make room for the legend underneath.
    box = ax.get_position()
    ax.set_position([box.x0, box.y0 + box.height * 0.15,
                     box.width, box.height * 0.85])

    ax.set_title('Capacity per day and water supply boat - {month}'.format(month=month_name))
    ax.grid(True)

    # One tick per day, formatted as day-month-year and rotated to stay readable.
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))
    ax.xaxis.set_major_locator(mdates.DayLocator(interval=1))
    fig.autofmt_xdate()
    ax.margins(x=0.005)

    plt.savefig('../Plots/Evides/Capacity dotplots/dotplot_{month}'.format(month=month_name), dpi=400)

Finally, let's take a look at frequently occurring shipments. 

In [None]:
def ProduceShipmentPlots(ENI):
    """Plot every shipment delivered to one vessel and save the figure.

    Parameters
    ----------
    ENI : str
        Vessel identifier as stored in the 'ENI' column of `evides`.

    Notes
    -----
    Reads the module-level `evides` frame. The figure is written to
    '../Plots/Evides/Individual ships/ENI_<ENI>'; nothing is returned.
    """
    print("ENI: {eni}".format(eni=ENI))

    # All shipments delivered to this vessel
    subset = evides[evides['ENI'] == ENI]

    fig, ax = plt.subplots(figsize=(16, 9), dpi=100)

    ax.plot(subset['Datum'], subset['Hoeveelheid (m3)'], linestyle='none', marker='o', color='b')
    ax.set_xlabel('Weeks')
    ax.set_ylabel('Water amount (in $m^3$)')

    ax.set_title('Shipments per boat - ENI {boat}'.format(boat=ENI))
    ax.grid(True)

    # One tick every 7 days, formatted as day-month-year.
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))
    ax.xaxis.set_major_locator(mdates.DayLocator(interval=7))
    fig.autofmt_xdate()
    ax.margins(x=0.005)

    fig.savefig('../Plots/Evides/Individual ships/ENI_{month}'.format(month=ENI), dpi=400)

# Try the function on a single vessel (ENI 02005415); this also saves the figure to disk.
ProduceShipmentPlots('02005415')

Now let's write a similar function that calculates the number of days between consecutive shipments to the same vessel.

In [None]:
def CalculateDistances(ENI):
    """Return the gap and combined amount for each pair of consecutive shipments.

    Parameters
    ----------
    ENI : str
        Vessel identifier as stored in the 'ENI' column of `evides`.

    Returns
    -------
    dict
        Maps "[<date of shipment i>, <date of shipment i-1>]" to
        {"days": days between the two shipments,
         "combined_amount": sum of the amounts of exactly those two shipments}.
        Empty when the vessel has fewer than two shipments.

    Notes
    -----
    Reads the module-level `evides` frame and takes its rows in their current
    order, so `evides` must already be sorted by 'Datum'. Pairs with an
    identical key (several shipments on one day) overwrite each other, so only
    the last such pair is kept.
    """
    print("ENI: {eni}".format(eni=ENI))

    # Find subset of data
    subset = evides[evides['ENI'] == ENI]
    dates = subset['Datum'].values
    amounts = subset['Hoeveelheid (m3)'].values

    records = {}

    for i in range(1, len(dates)):
        key = "[{date1}, {date2}]".format(date1=dates[i], date2=dates[i-1])
        diff = pd.to_datetime(dates[i]) - pd.to_datetime(dates[i-1])

        # Add up only the two shipments of this pair. Selecting rows by date would
        # also pull in any other shipment delivered on either of the two days.
        amount = amounts[i] + amounts[i-1]

        records[key] = {"days": diff.days, "combined_amount": amount}

    return records
    
# Try it on the vessel of the first row. Use positional access: after sorting and
# de-duplicating, the index label 0 is not guaranteed to be first or even present.
CalculateDistances(evides['ENI'].iloc[0])

Great, now let's run this for all individual boats in the dataset.

In [None]:
# Consecutive-shipment records per vessel, keyed by ENI.
distances = {}

# `unique()` keeps first-appearance order, so the result is reproducible between runs.
for i in evides['ENI'].unique():
    # Calculate distances once and reuse the result
    x = CalculateDistances(i)

    # Discard ENI if they've only received one shipment in the full year
    if len(x) != 0:
        distances[i] = x

distances

Now, let's extract the information we need.

In [None]:
# Rows: combined amount of a shipment pair; columns: days between the two shipments.
df = pd.DataFrame(0, columns=[1, 2, 3, 4, 5, 6, 7], index=[7, 8, 9, 10, 11, 12])

for records in distances.values():
    for record in records.values():
        days = int(record['days'])
        amount = int(record['combined_amount'])

        # Count only pairs that fall inside the table, so the filter always
        # matches the index and columns defined above.
        if amount in df.index and days in df.columns:
            df.at[amount, days] += 1

df

With this table, it's important to note the following.
* It only considers pairs of consecutive shipments. It's very well possible that three shipments happened within one week, for a total of 13 $m^3$. In this case, this is not recorded. 
* [7,1] having the value 3 means that, on 3 occasions, a vessel received a combined 7 $m^3$ over two shipments that were one day apart. 
* Oddly enough, there are also quite a few ships that already received more than 6 $m^3$ on the same day. 

Let's dive into this a bit deeper. 

In [None]:
# Sum only the day columns, so that re-running this cell does not fold the
# summary columns added below back into the totals.
summary_columns = ['N', '%', 'N cum', '% cum']
day_columns = [col for col in df.columns if col not in summary_columns]

df['N'] = df[day_columns].sum(axis=1)
# Round after scaling to a percentage to avoid float noise such as 12.340000000000002.
df['%'] = (df['N'] / df['N'].sum() * 100).round(2)
df['N cum'] = df['N'].cumsum()
df['% cum'] = df['%'].cumsum()

df

Next, let's add some more distribution plots, this time for individual ship types.

In [None]:
# Count the shipments per delivered quantity for container ships (all quantities are shown).
hist_data = evides.loc[evides['Scheepstype'] == 'Containerschip', 'Hoeveelheid (m3)']
shipment_counts = hist_data.value_counts().sort_index()
bar_data_xaxis = shipment_counts.index.tolist()
bar_data_yaxis = shipment_counts.tolist()

# Plot the counts as a bar chart
fig, ax = plt.subplots(figsize=(10, 12), dpi=100)

ax.bar(x=bar_data_xaxis, height=bar_data_yaxis, ec="black", color="mediumspringgreen")
ax.set_title("Distribution of drinking water quantity (in $m^3$) supplied to vessels of type 'container ship'")
ax.set_xlabel("Water quantity (in $m^3$)")
ax.set_ylabel("Amount of shipments")
# Show whole numbers on the y-axis
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos: '{:.0f}'.format(int(y))))

# Then save a high-quality version of this image
fig.savefig('../Plots/Evides/shipments_containership.png', dpi=400)

In [None]:
# Count the shipments per delivered quantity for tankers, keeping quantities of at most 6.
hist_data = evides.loc[evides['Scheepstype'] == 'Tanker', 'Hoeveelheid (m3)']
shipment_counts = hist_data.value_counts().sort_index()
shipment_counts = shipment_counts[shipment_counts.index <= 6]
bar_data_xaxis = shipment_counts.index.tolist()
bar_data_yaxis = shipment_counts.tolist()

# How many shipments in total below or equal to 6? (0.0 when this type has no shipments at all)
if len(hist_data) > 0:
    p_sixorbelow = round(sum(bar_data_yaxis) / len(hist_data) * 100, 2)
else:
    p_sixorbelow = 0.0

# Plot the counts as a bar chart
fig, ax = plt.subplots(figsize=(10, 12), dpi=100)

ax.bar(x=bar_data_xaxis, height=bar_data_yaxis, ec="black", color="orangered")
ax.set_title("Distribution of drinking water quantity (in $m^3$) supplied to vessels of type 'tanker' \n This plot includes {p} % of total shipments to this vessel type.".format(p=p_sixorbelow))
ax.set_xlabel("Water quantity (in $m^3$)")
ax.set_ylabel("Amount of shipments")
# Show whole numbers on the y-axis
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos: '{:.0f}'.format(int(y))))

# Then save a high-quality version of this image
fig.savefig('../Plots/Evides/shipments_tanker.png', dpi=400)

In [None]:
# Count the shipments per delivered quantity for barges, keeping quantities of at most 6.
hist_data = evides.loc[evides['Scheepstype'] == 'Beunschip (zand en grind)', 'Hoeveelheid (m3)']
shipment_counts = hist_data.value_counts().sort_index()
shipment_counts = shipment_counts[shipment_counts.index <= 6]
bar_data_xaxis = shipment_counts.index.tolist()
bar_data_yaxis = shipment_counts.tolist()

# How many shipments in total below or equal to 6? (0.0 when this type has no shipments at all)
if len(hist_data) > 0:
    p_sixorbelow = round(sum(bar_data_yaxis) / len(hist_data) * 100, 2)
else:
    p_sixorbelow = 0.0

# Plot the counts as a bar chart
fig, ax = plt.subplots(figsize=(10, 12), dpi=100)

ax.bar(x=bar_data_xaxis, height=bar_data_yaxis, ec="black", color="yellow")
ax.set_title("Distribution of drinking water quantity (in $m^3$) supplied to vessels of type 'beunschip' (barge) \n This plot includes {p} % of total shipments to this vessel type.".format(p=p_sixorbelow))
ax.set_xlabel("Water quantity (in $m^3$)")
ax.set_ylabel("Amount of shipments")
# Show whole numbers on the y-axis
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos: '{:.0f}'.format(int(y))))

# Then save a high-quality version of this image
fig.savefig('../Plots/Evides/shipments_beunschip.png', dpi=400)

Since we've made some more changes to the data, let's save it again. 

In [None]:
# Save under a new file name, so the cleaned input file read at the top of this notebook is not overwritten.
# NOTE(review): the index is written as well (pandas default), so reading this file back
# yields an extra 'Unnamed: 0' column - confirm that downstream readers drop it.
evides.to_csv('../Data/Cleaned data/evides_cleaned2.csv')