Import libraries

In [None]:
import random
import pandas as pd
from datetime import datetime, timedelta

# Load the Evides dataset: discard the leftover 'Unnamed: 0' column written by a
# previous export and treat the identifier-like columns as strings.
evides = (
    pd.read_csv('../2. Main Research/data/evides_withAIS.csv')
    .drop(columns=['Unnamed: 0'])
    .astype({'Wijk': 'str', 'ENI': 'str', 'Month_number': 'str'})
)

# Parse the shipment date so it can be compared and offset with timedeltas later on.
evides['Datum'] = pd.to_datetime(evides['Datum'])

Sample input parameters for transformation function.

In [None]:
# Parameters of the transformation. Volumes are in m^3, durations in days.
configuration = dict(
    CurrentLimit=6,     # combined pair volume must exceed this; single shipments of exactly this size may be converted
    MaxQ=12,            # max m^3 of drinking water per shipment
    MaxD=7,             # max amount of days between the two shipments of a pair
    Pair_P=0.20,        # probability threshold: a pair is flagged for merging when its random draw is >= this value
    Individual_P=0.70,  # probability threshold: a single shipment is converted when its random draw is >= this value
)

In order to simulate a higher shipment limit, we transform the Evides dataset. We do this by looking at shipment pairs first.

In [None]:
# Build the table of candidate shipment pairs.
#
# Two shipments form a candidate pair when they share the same ENI, the second
# one is dated strictly after the first and at most MaxD days later, and their
# combined volume is above CurrentLimit without exceeding MaxQ.
# Every candidate gets a uniform random draw `p` and is flagged for merging
# (merge == 1) when p >= Pair_P.

# Optional reproducibility: add a "Seed" entry to `configuration` to get the same
# random draws on every run. Without that entry the draws stay unseeded.
seed = configuration.get("Seed")
if seed is not None:
    random.seed(seed)

quantity = evides['Hoeveelheid (m3)']
max_gap = timedelta(days=configuration['MaxD'])

# Collect the rows in plain lists and build the frame once at the end;
# appending to a DataFrame row by row is quadratic and yields object dtypes.
pair_labels = []
pair_rows = []

for i, eni, amount, date in zip(evides.index, evides['ENI'], quantity, evides['Datum']):
    min_amount = configuration["CurrentLimit"] - amount  # partner must lift the total above the current limit
    max_amount = configuration["MaxQ"] - amount          # how much m3 is left below the new limit?

    # The strict `Datum > date` comparison already excludes shipment i itself,
    # so no separate self-match removal is needed.
    is_match = (
        (evides['ENI'] == eni)
        & (quantity > min_amount)
        & (quantity <= max_amount)
        & (evides['Datum'] > date)
        & (evides['Datum'] <= date + max_gap)
    )
    matches = evides[is_match]

    for j, amount_2, date_2 in zip(matches.index, matches['Hoeveelheid (m3)'], matches['Datum']):
        pair_labels.append(f"{i}_{j}")
        pair_rows.append([i, amount, date, j, amount_2, date_2])

shipment_pairs = pd.DataFrame(
    pair_rows,
    columns=['shipment 1', 'quantity 1', 'date 1', 'shipment 2', 'quantity 2', 'date 2'],
    index=pair_labels,
)

# Add p and merge columns. The flag is assigned as a whole column instead of
# through chained indexing (`df['merge'][mask] = 1`), which writes to a temporary
# object and is silently ignored when pandas Copy-on-Write is active.
shipment_pairs['p'] = [random.random() for _ in range(len(shipment_pairs))]
shipment_pairs['merge'] = (shipment_pairs['p'] >= configuration['Pair_P']).astype(int)

shipment_pairs

The same shipment can occur in multiple pairs, so we need to make sure that each shipment takes part in at most one merge.

In [None]:
# Resolve overlaps: a shipment may take part in at most one merged pair.
#
# Walk through the pairs in table order and keep a pair flagged only when
#   * its random draw already flagged it for merging (merge == 1), and
#   * neither of its two shipments is used by a pair accepted earlier.
# A single pass with a set of used shipments guarantees that no two accepted
# pairs share a shipment, including chains such as 1_2, 2_3, 3_4. It also never
# turns a pair that failed its probability draw back into a merge.
used_shipments = set()
resolved_flags = []

for first, second, flagged in zip(shipment_pairs['shipment 1'],
                                  shipment_pairs['shipment 2'],
                                  shipment_pairs['merge']):
    accept = bool(flagged == 1) and first not in used_shipments and second not in used_shipments
    if accept:
        used_shipments.update((first, second))
    resolved_flags.append(int(accept))

# Write the whole column at once; chained indexing such as
# `shipment_pairs['merge'].loc[i] = 0` updates a temporary object and is
# silently ignored when pandas Copy-on-Write is active.
shipment_pairs['merge'] = resolved_flags

Great, now let's merge these shipments.

In [None]:
# Apply the merges: the first shipment of every flagged pair receives the combined
# volume, the second shipment is removed from the dataset.
flagged_pairs = shipment_pairs[shipment_pairs['merge'] == 1]

# Plan first, apply afterwards. A pair is skipped when one of its shipments was
# already consumed by an earlier merge or is no longer present in `evides`
# (e.g. because this cell is run a second time). Without this guard, writing to a
# missing label would append a new, mostly empty row to the dataset.
consumed = set()
planned_merges = []  # (kept shipment, removed shipment, combined volume)
for keeper, other, quantity_1, quantity_2 in zip(flagged_pairs['shipment 1'],
                                                 flagged_pairs['shipment 2'],
                                                 flagged_pairs['quantity 1'],
                                                 flagged_pairs['quantity 2']):
    if keeper in consumed or other in consumed:
        continue
    if keeper not in evides.index or other not in evides.index:
        continue
    consumed.update((keeper, other))
    planned_merges.append((keeper, other, quantity_1 + quantity_2))

print("Amount of shipments in Evides dataset will be reduced by", len(planned_merges), "shipments.")

# Change M3 for the kept shipment. `.loc[row, column]` writes into `evides` itself;
# the chained form `evides[column].loc[row] = ...` writes to a temporary Series.
for keeper, _, combined_quantity in planned_merges:
    evides.loc[keeper, 'Hoeveelheid (m3)'] = combined_quantity

# Drop the absorbed shipments in one go
evides = evides.drop(index=[other for _, other, _ in planned_merges])

We'll also need to convert individual shipments. When transforming individual shipments, we skip every shipment that appears in the `shipment_pairs` table, regardless of whether its pair was actually merged in the end.

In [None]:
# Every shipment that appears in any candidate pair, merged or not
pair_indexes = set(shipment_pairs['shipment 1']).union(shipment_pairs['shipment 2'])

# Shipments outside those pairs whose volume is exactly the current limit
is_candidate = (~evides.index.isin(pair_indexes)) & (evides['Hoeveelheid (m3)'] == configuration['CurrentLimit'])

# One random draw per candidate, in dataset order; a shipment is converted when
# its draw reaches the Individual_P threshold.
converted = [m for m in evides.index[is_candidate] if random.random() >= configuration["Individual_P"]]

# Single `.loc[rows, column]` write. The chained form
# `evides['Hoeveelheid (m3)'].loc[m] = ...` assigns to a temporary Series and is
# silently ignored when pandas Copy-on-Write is active.
if converted:
    evides.loc[converted, 'Hoeveelheid (m3)'] = configuration['MaxQ']

Save new dataset

In [None]:
# Write the transformed dataset (index included) to disk.
output_path = 'data/evides_higherlimit.csv'
evides.to_csv(output_path)