In [1]:
%autosave 0

Autosave disabled


In [5]:

import os
import pandas as pd
import numpy as np
import re
import collections
import pickle
from ast import literal_eval
import random
import scipy

#Define Path of Files

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# path for the 'Data' folder provided by DP
dir_loc = '/content/drive/Shareddrives/Private Unlimited Drive #1/DDS/Analytics Project/AOP_DP_Analytics/Data'

#Functions

In [None]:
# generate workflow for file storage /  instance creation

## Volume Matrix Manipulation and Poisson Instances Generation


In [8]:
# create dataframes for a region everyday

def generate_region_volume(region_name):
  # path of folder
  dir_region_instances = dir_loc + '/Instances/' + region_name
  dir_region_volume = dir_loc + '/Volumes/'
  # dir_region_districts = dir_region_instances + '/Districts'

  # load post_object-to-route_pos_id and post_point-to-post_object files 
  po_file = pd.read_csv(dir_region_instances + '/post_order_id_mapping.dat', sep='\t', names=('PostObjectId', 'RoutePosID'))
  pp_file = pd.read_csv(dir_region_instances + '/post_point_information.dat', sep='\t', names=('PostPointId', 'PostObjectId'))

  # pp_file adjustment by splitting list of post object ids
  pp_file['PostObjectId'] = pp_file['PostObjectId'].apply(literal_eval)
  pp_file = pp_file.explode('PostObjectId', ignore_index=True)

  # complete  post object list-district mapping
  region_district_df = generate_region_district(region_name = region_name)
  region_district_df.rename(columns = {'PostObjectID' : 'PostObjectId'}, inplace = True)

  # list of volume file paths
  day_names = []
  vol_path_list = []
  vol_day_map = {}
  day_map = {'mo' : 'Monday',
             'di' : 'Tuesday',
             'mi' : 'Wednesday',
             'do' : 'Thursday',
             'fr' : 'Friday',
             'sa' : 'Saturday'}

  for filename in os.listdir(dir_region_volume):
    vol_path_list.append(dir_region_volume + filename)
    day = filename[-6:-4]
    vol_day_map[filename] = day_map[day]

  # store dataframes of a region, map with post object id
  region_vol_day_dict = {}
  for vol_path in vol_path_list:
    vol_df = pd.read_csv(vol_path, sep = ';')
    vol_df.rename(columns = {'BRIEFE' : 'LETTERS',
                             'PAKETE' : 'PACKAGES',
                             'SONSTIGE' : 'OTHERS',
                             'ROUTEPOS_ID' : 'RoutePosID'},
                  inplace = True)
    
    # combining files to a complete table for a region
    vol_po_df = pd.merge(po_file, vol_df, on = 'RoutePosID', how = 'left')
    vol_po_df = pd.merge(pp_file, vol_po_df, on='PostObjectId', how='right')
    vol_po_df = pd.merge(region_district_df, vol_po_df, on='PostObjectId', how='right')

    # store dataframes in dict
    day_key = vol_day_map[vol_path[-18:]]
    region_vol_day_dict[day_key] = vol_po_df
    
  return region_vol_day_dict

In [9]:
def generate_region_district(region_name):
  district_path = dir_region_instances = dir_loc + '/Instances/' + region_name + '/Districts'
  region_districts_list = []

  for filename in os.listdir(district_path):
    file_var = re.sub('.dat', '', filename)
    file_district = pd.read_csv(district_path + '/' + filename, sep='\t', skiprows = [0,1],
                                names=('PostObjectID', 'dum_1', 'dum_2', 'dum_3', 'dum_4', 'dum_5')
                                # usecols = [0]
                                )
    file_district['district'] = file_var
    region_districts_list.append(file_district)

  region_districts_df = pd.concat(region_districts_list, ignore_index = True)
  region_districts_df.drop(['dum_1', 'dum_2', 'dum_3', 'dum_4', 'dum_5'], axis = 1, inplace = True)
  
  return region_districts_df

In [10]:
# generate instances

def generate_instances(region_vol_day, scenario_type, scenario_method, scenario_number, growth_factor):
  # df = region_vol_day.copy() #.copy() used to avoid recopying on the original dataframe
  df = region_vol_day

  sce_letters = 'scenario_' + str(scenario_number) + '_letter'
  sce_packages = 'scenario_' + str(scenario_number) + '_package'
  sce_others = 'scenario_' + str(scenario_number) + '_others'

  df[sce_letters] = df['LETTERS'].apply(lambda x :scenario_type(pos_delivery = x, method = scenario_method, rate = growth_factor ))
  df[sce_packages] = df['PACKAGES'].apply(lambda x :scenario_type(pos_delivery = x, method = scenario_method, rate = growth_factor))
  df[sce_others] = df['OTHERS'].apply(lambda x :scenario_type(pos_delivery = x, method = scenario_method, rate = growth_factor))

  sce_all = 'scenario_' + str(scenario_number) + '_all'
  df[sce_all] = df[sce_letters] + df[sce_packages] + df[sce_others]

  return df

In [38]:
# # generate instances
# VERSION 2

# def generate_instances_v2(region_vol_day, scenario_method, growth_factor):
#   # df = region_vol_day.copy() #.copy() used to avoid recopying on the original dataframe
#   df = region_vol_day

#   scenario_type = random_poisson_instances_v2(method = scenario_method, rate = growth_factor) --> 7, 9, 10


#   df['sce_letters_low'], df['sce_letters_medium'], df['sce_letters_high'] = zip(*df['LETTERS'].map(scenario_type))


#   # sce_all = 'scenario_' + str(scenario_number) + '_all'
#   # df[sce_all] = df[sce_letters] + df[sce_packages] + df[sce_others]

#   return df

In [11]:
# generate real instances based on poisson

def random_poisson_instances(pos_delivery, method, rate):
  rng = np.random.default_rng()
  poisson_dist = rng.poisson(lam = pos_delivery * (1 + rate), size = 52)

  if method == 'random':
    return random.choice(poisson_dist)
  if method == 'mode':
    return scipy.stats.mode(poisson_dist, keepdims = True)[0][0]
  else:
    return 'only options : [random, mode]'

In [34]:
# # generate real instances based on poisson
# VERSION 2

# def random_poisson_instances_v2(pos_delivery, method, rate):
#   rng = np.random.default_rng()
#   poisson_dist_low = rng.poisson(lam = pos_delivery * (1 - rate), size = 52)
#   poisson_dist_med = rng.poisson(lam = pos_delivery, size = 52)
#   poisson_dist_high = rng.poisson(lam = pos_delivery * (1 + rate), size = 52)

#   if method == 'random':
#     return random.choice(poisson_dist_low), random.choice(poisson_dist_med), random.choice(poisson_dist_high)
#   if method == 'mode':
#     return scipy.stats.mode(poisson_dist, keepdims = True)[0][0]
#   else:
#     return 'only options : [random, mode]'

In [20]:
# get start and end node for a district

def start_end_points(region_name, district):
    df_day_district = df_day[df_day['district'] == district]

    # get start and end node
    district_path = dir_loc + '/Instances/' + region_name + '/Districts/' + district + '.dat'
    rows_needed = [1]
    file_district = pd.read_csv(district_path, sep='\t', skiprows = lambda x : x not in rows_needed,
                                names=('dum_0', 'start_point', 'end_point', 'dum_1', 'dum_2', 'dum_3', 'dum_4', 'dum_5', 'dum_6')
                                )
    start_point = file_district['start_point'][0]
    end_point = file_district['end_point'][0]

    points = [start_point, end_point]

    return points

In [42]:
# generate distance matrix and mapping for a scenario

def generate_distance_matrix_map(region_name, df_day_district, points):

      col_use = ['PostPointId']
      col_use.append(scenario)
      df_day_district_scenario = df_day_district[col_use]
      
      # removing nodes with zero demand
      df_day_district_scenario_filtered = df_day_district_scenario[df_day_district_scenario[scenario] != 0]
      pp_id_day_district_scenario = df_day_district_scenario_filtered['PostPointId'].unique().tolist()
      
      # add start and end node if not in node list yet
      for point in points:
        if point not in pp_id_day_district_scenario:
          pp_id_day_district_scenario.append(point)

      # get distance file and dataframes
      distance_path = dir_loc + '/Instances/' + region_name + '/distances'
      district_distance = pd.read_csv(distance_path + '/distances_' + district + '.dat',
                                  names=['pp_1', 'pp_2', 'dist']
                                  )
      # remove unused postpoints
      district_distance_filtered = district_distance[(district_distance['pp_1'].isin(pp_id_day_district_scenario)) & (district_distance['pp_2'].isin(pp_id_day_district_scenario))]
      distance_matrix_df = district_distance_filtered.pivot(index = 'pp_1', columns = 'pp_2', values = 'dist')

      # distance matrix
      distance_matrix_array = distance_matrix_df.to_numpy()
      distance_matrix_array.tolist()

      # node mapping
      map_val = list(range(0, len(distance_matrix_df)))
      nodes = distance_matrix_df.index.values.tolist()
      mapping = dict(zip(map_val, nodes))

      return distance_matrix_array, mapping

#Process Examples (Using Functions)

In [14]:
%%time
# ['Warmsen', 'Uerze', 'Hannover 92']

region_vol_day_dict = generate_region_volume(region_name = "Warmsen")

CPU times: user 12.8 s, sys: 1.18 s, total: 14 s
Wall time: 25 s


In [15]:
region_vol_day_dict['Monday'].head()

Unnamed: 0,PostObjectId,district,PostPointId,RoutePosID,LETTERS,PACKAGES,OTHERS
0,16,31606-14,16,AD14E08623FF9F5AD2293E7DEDB4F4B1,0.45,0.293333,0.0
1,17,31606-14,17,F48CB0CAB0BC117AE030007F0100574C,0.45,0.293333,0.0
2,18,31606-14,16,F48CB0CAB0C7117AE030007F0100574C,0.45,0.293333,0.0
3,19,31606-14,18,F48CB0CAB0C8117AE030007F0100574C,0.0,0.0,0.0
4,20,31606-14,19,069AE286923C1800E040400A09131416,0.0,0.0,0.0


In [16]:
%%time
#Creating the complete dictionary of volumes for a region
warmsen_instances = {}

#INPUT Rate of mail change 
rate = 0.5
rate_mapping = {'low' : -rate, 'medium' : 0, 'high' : rate}

# create instances for one region
for key in region_vol_day_dict.keys():
  # create instances per day
  for key_rate in rate_mapping:
    dummy = generate_instances(region_vol_day = region_vol_day_dict[key],
                               scenario_type = random_poisson_instances,
                               scenario_method = 'random',
                               scenario_number = key_rate,
                               growth_factor = rate_mapping[key_rate])
  warmsen_instances[key] = dummy

CPU times: user 15.5 s, sys: 379 ms, total: 15.9 s
Wall time: 17.1 s


In [17]:
warmsen_instances['Tuesday'].head()

Unnamed: 0,PostObjectId,district,PostPointId,RoutePosID,LETTERS,PACKAGES,OTHERS,scenario_low_letter,scenario_low_package,scenario_low_others,scenario_low_all,scenario_medium_letter,scenario_medium_package,scenario_medium_others,scenario_medium_all,scenario_high_letter,scenario_high_package,scenario_high_others,scenario_high_all
0,16,31606-14,16,AD14E08623FF9F5AD2293E7DEDB4F4B1,5.796667,0.69,0.273333,5,1,0,6,7,1,0,8,10,2,0,12
1,17,31606-14,17,F48CB0CAB0BC117AE030007F0100574C,5.796667,0.69,0.273333,1,1,0,2,4,2,0,6,7,1,0,8
2,18,31606-14,16,F48CB0CAB0C7117AE030007F0100574C,5.796667,0.69,0.273333,2,1,0,3,4,2,0,6,5,0,0,5
3,19,31606-14,18,F48CB0CAB0C8117AE030007F0100574C,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0
4,20,31606-14,19,069AE286923C1800E040400A09131416,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0


In [18]:
print(warmsen_instances['Tuesday']['scenario_low_all'].sum())
print(warmsen_instances['Tuesday']['scenario_medium_all'].sum())
print(warmsen_instances['Tuesday']['scenario_high_all'].sum())

4671
9173
13515


In [19]:
z = warmsen_instances['Tuesday']


print(z[z['district'] == '31606-14']['scenario_low_all'].sum())
print(z[z['district'] == '31606-14']['scenario_medium_all'].sum())
print(z[z['district'] == '31606-14']['scenario_high_all'].sum())

292
558
783


In [None]:
# region_name = 'Warmsen'

# # filter by day
# for key in warmsen_instances.keys():
#   df_day = warmsen_instances[key]
#   # filter by district
#   for district in df_day['district'].value_counts().index.tolist():
#     print(district)
#     df_day_district = df_day[df_day['district'] == district]

#     # get start and end node
#     district_path = dir_loc + '/Instances/' + region_name + '/Districts/' + district + '.dat'
#     rows_needed = [1]
#     file_district = pd.read_csv(district_path, sep='\t', skiprows = lambda x : x not in rows_needed,
#                                 names=('dum_0', 'start_point', 'end_point', 'dum_1', 'dum_2', 'dum_3', 'dum_4', 'dum_5', 'dum_6')
#                                 )
#     start_point = file_district['start_point'][0]
#     end_point = file_district['end_point'][0]

#     points = [start_point, end_point]

#     scenario_list = df_day_district.columns.tolist()[7:]
#     # filter by scenario
#     for scenario in scenario_list:
#       col_use = ['PostPointId']
#       col_use.append(scenario)
#       df_day_district_scenario = df_day_district[col_use]
      
#       # removing nodes with zero demand
#       df_day_district_scenario_filtered = df_day_district_scenario[df_day_district_scenario[scenario] != 0]
#       pp_id_day_district_scenario = df_day_district_scenario_filtered['PostPointId'].unique().tolist()
      
#       # add start and end node if not in node list yet
#       for point in points:
#         if point not in pp_id_day_district_scenario:
#           pp_id_day_district_scenario.append(point)

#       # get distance file and dataframes
#       distance_path = dir_loc + '/Instances/' + region_name + '/distances'
#       district_distance = pd.read_csv(distance_path + '/distances_' + district + '.dat',
#                                   names=['pp_1', 'pp_2', 'dist']
#                                   )
#       # remove unused postpoints
#       district_distance_filtered = district_distance[(district_distance['pp_1'].isin(pp_id_day_district_scenario)) & (district_distance['pp_2'].isin(pp_id_day_district_scenario))]
#       distance_matrix_df = district_distance_filtered.pivot(index = 'pp_1', columns = 'pp_2', values = 'dist')

#       # distance matrix
#       distance_matrix_array = distance_matrix_df.to_numpy()

#       # node mapping
#       map_val = list(range(0, len(distance_matrix_df)))
#       nodes = distance_matrix_df.index.values.tolist()
#       mapping = dict(zip(map_val, nodes))

      


31603-06
31600-01
31606-12
31600-02
31604-09
31603-07
31604-10
31600-03
31603-05
31603-08
31606-11
31606-14
31606-13
31600-04
31603-06
31600-01
31606-12
31600-02
31604-09
31603-07
31604-10
31600-03
31603-05
31603-08
31606-11
31606-14
31606-13
31600-04
31603-06
31600-01


In [24]:
os.mkdir('test')

FileExistsError: ignored

In [26]:
os.mkdir('test/rev')

In [36]:
%%time

region = 'Warmsen'
save_loc = '/content/drive/Shareddrives/Private Unlimited Drive #1/DDS/Analytics Project/Coding/'

dir_region = save_loc + '/' + region
if not os.path.exists(dir_region):
  os.mkdir(dir_region)

# filter by day
for day in warmsen_instances.keys():
  df_day = warmsen_instances[day]

  dir_day = dir_region + '/' + day
  if not os.path.exists(dir_day):
    os.mkdir(dir_day)

  # filter by district
  for district in df_day['district'].value_counts().index.tolist():
    df_day_district = df_day[df_day['district'] == district]

    dir_district = dir_day + '/' + district
    if not os.path.exists(dir_district):
      os.mkdir(dir_district)

    scenario_list = df_day_district.columns.tolist()[7:]
    # print(scenario_list)

    # get start and end node
    points = start_end_points(region_name = region, district = district)
    
    # filter by scenario
    for scenario in scenario_list:
      # dir of mail
      mail_names = ['letter', 'package', 'others', 'all']
      for mail in mail_names:
        if mail in scenario:
          mail_current = mail
      dir_mail =  dir_district + '/' + mail_current
      # print(dir_mail)
      if not os.path.exists(dir_mail):
        os.mkdir(dir_mail)
      # dir of demand
      demand_types = ['low', 'medium', 'high']
      for demand in demand_types:
        if demand in scenario:
          demand_current = demand
      dir_demand =  dir_mail + '/' + demand_current
      if not os.path.exists(dir_demand):
        os.mkdir(dir_demand)
      # generate mapping and distance matrix
      distance_matrix, mapping = generate_distance_matrix_map(region_name = region, df_day_district = df_day_district, points = points)
      distance_matrix_name = 'dm_' + region + '_' + day + '_' + district + '_' + scenario
      mapping_name = 'map_' + region + '_' + day + '_' + district + '_' + scenario
      pickle.dump(distance_matrix, open(dir_demand + '/' + "%s.p"%distance_matrix_name, "wb"))
      pickle.dump(mapping, open(dir_demand + '/' + "%s.p"%mapping_name, "wb"))

      print(distance_matrix_name)


dm_Warmsen_Tuesday_31603-06_scenario_low_letter
dm_Warmsen_Tuesday_31603-06_scenario_low_package
dm_Warmsen_Tuesday_31603-06_scenario_low_others
dm_Warmsen_Tuesday_31603-06_scenario_low_all
dm_Warmsen_Tuesday_31603-06_scenario_medium_letter
dm_Warmsen_Tuesday_31603-06_scenario_medium_package
dm_Warmsen_Tuesday_31603-06_scenario_medium_others
dm_Warmsen_Tuesday_31603-06_scenario_medium_all
dm_Warmsen_Tuesday_31603-06_scenario_high_letter
dm_Warmsen_Tuesday_31603-06_scenario_high_package
dm_Warmsen_Tuesday_31603-06_scenario_high_others
dm_Warmsen_Tuesday_31603-06_scenario_high_all
dm_Warmsen_Tuesday_31600-01_scenario_low_letter
dm_Warmsen_Tuesday_31600-01_scenario_low_package
dm_Warmsen_Tuesday_31600-01_scenario_low_others
dm_Warmsen_Tuesday_31600-01_scenario_low_all
dm_Warmsen_Tuesday_31600-01_scenario_medium_letter
dm_Warmsen_Tuesday_31600-01_scenario_medium_package
dm_Warmsen_Tuesday_31600-01_scenario_medium_others
dm_Warmsen_Tuesday_31600-01_scenario_medium_all
dm_Warmsen_Tuesday_316

In [37]:
distance_matrix_name

'dm_Warmsen_Saturday_31600-04_scenario_high_all'

In [40]:
dm = pickle.load(open(dir_demand + "/%s.p"%distance_matrix_name, "rb"))
dm

array([[0.00000e+00, 9.29907e+03, 9.29913e+03, ..., 5.25025e+03,
        5.21760e+03, 5.21727e+03],
       [9.34253e+03, 0.00000e+00, 6.00000e-02, ..., 4.68993e+03,
        4.65728e+03, 4.65695e+03],
       [9.34259e+03, 6.00000e-02, 0.00000e+00, ..., 4.68999e+03,
        4.65734e+03, 4.65701e+03],
       ...,
       [5.24848e+03, 4.71327e+03, 4.71333e+03, ..., 0.00000e+00,
        8.98300e+01, 3.29800e+01],
       [5.21501e+03, 4.67980e+03, 4.67986e+03, ..., 8.98300e+01,
        0.00000e+00, 1.15770e+02],
       [5.21550e+03, 4.68029e+03, 4.68035e+03, ..., 3.29800e+01,
        1.15770e+02, 0.00000e+00]])

In [53]:
type(dm_v2[0][0])

numpy.int64

In [52]:
dm_v2 = np.rint(dm*100).astype(int)
dm_v2.tolist()

[[0,
  929907,
  929913,
  939843,
  940646,
  941367,
  940750,
  475472,
  474355,
  470824,
  470730,
  762740,
  763797,
  751239,
  750182,
  750122,
  744589,
  743472,
  767499,
  765676,
  764660,
  766794,
  766742,
  762017,
  764099,
  743681,
  745716,
  745609,
  743623,
  743518,
  741138,
  934863,
  906360,
  906767,
  906305,
  861779,
  862241,
  862239,
  857647,
  858107,
  858100,
  854816,
  855269,
  855227,
  852082,
  852493,
  471695,
  472277,
  471866,
  471348,
  471113,
  471053,
  469764,
  468284,
  468549,
  468138,
  466791,
  467202,
  620292,
  621009,
  620583,
  619466,
  621108,
  618941,
  618566,
  616399,
  621540,
  620423,
  620428,
  615788,
  614671,
  616313,
  614146,
  615684,
  614567,
  945085,
  944543,
  944038,
  941623,
  941416,
  941288,
  940553,
  272069,
  270952,
  271280,
  272397,
  272412,
  271911,
  273043,
  273207,
  285683,
  286800,
  940001,
  938360,
  936140,
  934808,
  935689,
  934862,
  933981,
  934749,
  933

In [49]:
mapping = pickle.load(open(dir_demand + "/%s.p"%mapping_name, "rb"))
mapping

{0: 0,
 1: 1536,
 2: 1537,
 3: 1539,
 4: 1540,
 5: 1541,
 6: 1542,
 7: 1543,
 8: 1544,
 9: 1545,
 10: 1546,
 11: 1547,
 12: 1548,
 13: 1549,
 14: 1550,
 15: 1551,
 16: 1552,
 17: 1553,
 18: 1554,
 19: 1555,
 20: 1556,
 21: 1557,
 22: 1558,
 23: 1559,
 24: 1560,
 25: 1562,
 26: 1563,
 27: 1564,
 28: 1566,
 29: 1567,
 30: 1568,
 31: 1571,
 32: 1572,
 33: 1573,
 34: 1574,
 35: 1575,
 36: 1576,
 37: 1577,
 38: 1578,
 39: 1579,
 40: 1580,
 41: 1581,
 42: 1582,
 43: 1583,
 44: 1584,
 45: 1585,
 46: 1587,
 47: 1588,
 48: 1589,
 49: 1594,
 50: 1595,
 51: 1596,
 52: 1599,
 53: 1600,
 54: 1601,
 55: 1602,
 56: 1603,
 57: 1604,
 58: 1605,
 59: 1606,
 60: 1607,
 61: 1608,
 62: 1609,
 63: 1610,
 64: 1613,
 65: 1614,
 66: 1615,
 67: 1616,
 68: 1617,
 69: 1618,
 70: 1619,
 71: 1620,
 72: 1621,
 73: 1622,
 74: 1623,
 75: 1625,
 76: 1626,
 77: 1627,
 78: 1630,
 79: 1631,
 80: 1632,
 81: 1633,
 82: 1634,
 83: 1635,
 84: 1636,
 85: 1637,
 86: 1638,
 87: 1639,
 88: 1640,
 89: 1641,
 90: 1642,
 91: 1643,
 