In [1]:
import os
import numpy    as np
import pandas   as pd
import psycopg2 as psy

from datetime import datetime as dt
from random   import randrange
from random   import sample
from random   import choice

  """)


In [2]:
# Check that the input file exists on the path, move the existing file to the archive (ARCH), then create a new input file for office_dim.


In [4]:
# Setup script input parameters

min_obs = 2                      # minimal number of observations in the output dataset
max_obs = 5                      # maximal number of observations in the output dataset (inclusive)

min_street_number = 1            # minimal street number
max_street_number = 25           # maximal street number

min_local_number = 1             # minimal local number
max_local_number = 100           # maximal local number

# randrange() excludes its upper bound, so pass max_obs + 1 to make max_obs reachable.
obs = randrange(min_obs, max_obs + 1)  # number of observations to generate
obs

3

In [5]:
# Project layout, relative to the current user's home directory.
main_path    = "Documents/CallCenterStaffing"
path_to_dict = "Dict"
file_name    = "office_dim.csv"

# Timestamp of this run, captured once.
now = dt.now()

# Resolve every location against <home>/<main_path>.
project_root = os.path.join(os.path.expanduser("~"), main_path)

path_to_file = os.path.join(project_root, "Input", file_name)
path_to_arch = os.path.join(project_root, "Arch/office")
city_dict    = os.path.join(project_root, path_to_dict, "city_dict.csv")
street_dict  = os.path.join(project_root, path_to_dict, "street_dict.csv")

def is_path_not_correct(path):
    """Return True when os.path.exists() reports that `path` does not exist."""
    path_exists = os.path.exists(path)
    return not path_exists

def is_path_correct(path):
    """Return True when os.path.exists() reports that `path` exists."""
    path_exists = os.path.exists(path)
    return path_exists

# Abort early if any required file or directory is missing.
# The checks run in this order and the first missing path raises.
required_paths = (
    (path_to_file, "Input file doesn't exist"),
    (path_to_arch, "Archive directory doesn't exist"),
    (city_dict,    "City dictionary doesn't exist"),
    (street_dict,  "Street dictionary doesn't exist"),
)
for required_path, failure_message in required_paths:
    if not os.path.exists(required_path):
        raise Exception(failure_message)

# Archive file name: "arch_" + file_name with ".csv" replaced by "_YYYY-MM-DD.csv".
time_stamp = "_" + now.date().isoformat()
arch_file_name = 'arch_' + file_name.replace(".csv", time_stamp + ".csv")

# Move the current input file into the archive directory under that name.
archive_target = os.path.join(path_to_arch, arch_file_name)
os.rename(path_to_file, archive_target)

In [6]:
# Connect to the database and fetch three result sets:
#   column_list      - column names of stg.office_dim
#   office_list      - distinct office_key values from pro.office_dim
#   office_hier_list - office_key values from pro.office_dim where office_hier_key IS NULL
# Each result is the raw fetchall() output.

# Pre-bind both handles so the `finally` clause can test them even when
# psy.connect() itself fails (otherwise it would raise NameError).
connection = None
cursor = None

try:
    connection = psy.connect( user = "szymonbocian",
                              password = "",
                              host = "localhost",
                              port = "5432",
                              database = "dwh_call_center")

    cursor = connection.cursor()

    # ORDER BY ordinal_position: the names are later assigned positionally to the
    # generated DataFrame, so they must come back in table-definition order.
    query = ("SELECT column_name FROM INFORMATION_SCHEMA.columns "
             "WHERE table_schema = 'stg' AND table_name = 'office_dim' "
             "ORDER BY ordinal_position;")
    cursor.execute(query)
    column_list = cursor.fetchall()

    query = "SELECT DISTINCT office_key FROM pro.office_dim;"
    cursor.execute(query)
    office_list = cursor.fetchall()

    query = "SELECT office_key FROM pro.office_dim WHERE office_hier_key IS NULL;"
    cursor.execute(query)
    office_hier_list = cursor.fetchall()

except (Exception, psy.Error) as error :
    print ("Error while connecting to PostgreSQL", error)
    # Re-raise: the rest of the notebook cannot run without these query results,
    # and continuing would either fail on undefined names or reuse stale values.
    raise
finally:
    # Close whatever was actually opened.
    if cursor is not None:
        cursor.close()
    if connection is not None:
        connection.close()

# Each row in these query results is indexed with [0] to take its first value,
# turning the three results into flat lists.
column_list      = [row[0] for row in column_list]
office_list      = [row[0] for row in office_list]
office_hier_list = [row[0] for row in office_hier_list]

print(column_list)
print(office_list)
print(office_hier_list)

['old_office_key', 'office_hier_key', 'office_name', 'address_street_name', 'address_street_number', 'address_local_number', 'location_latitude', 'location_longitude', 'office_business_key']
[1]
[1]


In [7]:
# Load the city-name and street-name dictionaries.
# Both files are read the same way: ';' separator, no rows skipped,
# and only the single named column kept.
csv_options = {"sep": ";", "skiprows": 0}

dict_city   = pd.read_csv(city_dict,   usecols=['city_name'],   **csv_options)
dict_street = pd.read_csv(street_dict, usecols=['street_name'], **csv_options)

In [8]:
# Build the synthetic office rows: `obs` rows in total.
df = pd.DataFrame({
    # One existing office key is picked once; each row then gets that key with
    # probability 0.025 and an empty string otherwise.
    "old_office_key" : np.random.choice([choice(office_list), ""], obs, True, [0.025, 0.975])
})

# A single randomly chosen key, broadcast to every row.
df['office_hier_key']       = np.random.choice(office_hier_list)

# sample() draws without replacement, so names are distinct dictionary entries;
# it raises ValueError if a dictionary holds fewer than `obs` entries.
df['office_name']           = sample(["Call Center in " + c + " city" for c in dict_city.city_name], k = obs)
df['address_street_name']   = sample([s + "'s st." for s in dict_street.street_name], k = obs)

# randint() excludes its upper bound, hence the + 1. Rounding a continuous
# uniform draw instead would give the two endpoint values only half the
# probability of every interior value.
df['address_street_number'] = np.random.randint(min_street_number, max_street_number + 1, obs)
df['address_local_number']  = np.random.randint(min_local_number, max_local_number + 1, obs)

# Coordinates rounded to 4 decimal places.
df['location_latitude']     = np.round(np.random.uniform(-90,90,obs),4)
df['location_longitude']    = np.round(np.random.uniform(-180,180,obs),4)

df

Unnamed: 0,old_office_key,office_hier_key,office_name,address_street_name,address_street_number,address_local_number,location_latitude,location_longitude
0,,1,Call Center in Jarocin city,Mickiewicz's st.,12,86,-57.1105,6.9246
1,,1,Call Center in Kohln city,Chopin's st.,7,96,53.5842,165.6507
2,,1,Call Center in Tokio city,Himilsbach's st.,21,38,-72.3618,51.0793


In [9]:
# Create office business key:
#   first 3 characters of the upper-cased office name
# + first 3 characters of the upper-cased street name
# + last 2 characters of "0" + street number
# + last 2 characters of "0" + local number
name_prefix   = df.office_name.map(str).str.upper().str.slice(0, 3)
street_prefix = df.address_street_name.map(str).str.upper().str.slice(0, 3)
street_digits = ('0' + df.address_street_number.astype(str)).str.slice(-2)
local_digits  = ('0' + df.address_local_number.astype(str)).str.slice(-2)

df['office_business_key'] = name_prefix + street_prefix + street_digits + local_digits
df

Unnamed: 0,old_office_key,office_hier_key,office_name,address_street_name,address_street_number,address_local_number,location_latitude,location_longitude,office_business_key
0,,1,Call Center in Jarocin city,Mickiewicz's st.,12,86,-57.1105,6.9246,CALMIC1286
1,,1,Call Center in Kohln city,Chopin's st.,7,96,53.5842,165.6507,CALCHO0796
2,,1,Call Center in Tokio city,Himilsbach's st.,21,38,-72.3618,51.0793,CALHIM2138


In [10]:
# Write the generated rows to path_to_file as CSV.

# Relabel the columns positionally with the names held in column_list.
df.columns = column_list
# Header row is written; the DataFrame index is not.
df.to_csv(path_to_file, header=True, index=False)