In [15]:
import os
import numpy    as np
import pandas   as pd
import psycopg2 as psy

from datetime import datetime as dt
from random   import randrange
from random   import sample
from random   import choice

In [2]:
#''' Check if file exists on path. Move existing file to ARCH. Create new input for client_dim.'''


In [20]:
# Setup script input parameters

min_obs = 5                      # minimal number of observation in output dataset
max_obs = 20                     # maximal number of observation in output dataset

min_age = 18                     # minimal age of client
max_age = 90                     # maximal age of client

obs = randrange(min_obs,max_obs) # number of observation
obs

19

In [32]:
main_path    = "Documents/CallCenterStaffing"
path_to_arch = "Arch/client"
path_to_file = "Input"
path_to_dict = "Dict"
file_name    = "client_dim.csv"
name_dict    = "name_dict.csv"
now          = dt.now()
path_to_file = os.path.join(os.path.expanduser("~"),main_path,path_to_file,file_name)
path_to_arch = os.path.join(os.path.expanduser("~"),main_path,path_to_arch)
name_dict    = os.path.join(os.path.expanduser("~"),main_path,path_to_dict,name_dict)

if not(os.path.exists(path_to_file)):
     raise Exception("Input file doesn't exist")

if not(os.path.isdir(path_to_arch)):
    raise Exception("Archive directory doesn't exist")
        
if not(os.path.isdir(path_to_dict)):
    raise Exception("Dictionary doesn't exist")
        
time_stamp = "_" + str(now)[:10]
arch_file_name = 'arch_' + file_name.replace(".csv",time_stamp + ".csv")
 
os.rename(path_to_file, os.path.join(path_to_arch, arch_file_name))

In [24]:
# connection to database and return list of column column_list
try:
    connection = psy.connect( user = "szymonbocian",
                              password = "",
                              host = "localhost",
                              port = "5432",
                              database = "dwh_call_center")

    cursor = connection.cursor()

    cursor.execute("""
        SELECT column_name 
        FROM INFORMATION_SCHEMA.columns 
        WHERE table_schema = 'stg' AND table_name = 'client_dim';
    """)
    column_list = cursor.fetchall()
    
except (Exception, psy.Error) as error :
    print ("Error while connecting to PostgreSQL", error)
finally:
        if(connection):
            cursor.close()
            connection.close()

column_list = [''.join(c for c in s if c not in '(),') for s in column_list]
print(column_list)

['client_first_name', 'client_second_name', 'client_last_name', 'sex_pl', 'sex_eng', 'sex_shortcut_pl', 'sex_shortcut_eng', 'client_age', 'client_business_key']


In [44]:
# Load dictionary with first_name (second_name), last_name
dict_name = pd.read_csv(name_dict,
                        sep = ";",
                        usecols = ['male_name','female_name','last_name'],
                        skiprows = 0
                       )

/Users/szymonbocian/Documents/CallCenterStaffing/Dict/name_dict.csv


Unnamed: 0,male_name,female_name,last_name
0,Aaron,Ada,Abakanowicz
1,Adam,Adela,Abramczyk
2,Adrian,Adrianna,Adamiec
3,Alan,Agata,Adamiuk
4,Albert,Agnieszka,Adamkiewicz


In [61]:
f = pd.concat([dict_name.male_name,dict_name.female_name])

df = pd.DataFrame({
    "firstName" : sample(tuple(f), k = obs)
})

# df['isSecName'] = np.random.choice([True, False], obs, True, [0.15, 0.85])
# df['isMale']   = df.firstName.isin(A)
# df['isFemale'] = df.firstName.isin(B)

# df['secondName'] = df.apply(lambda l: choice(A) if l['isMale']   and l['isSecName'] else "", axis = 1)
# df['secondName'] = df.apply(lambda l: choice(B) if l['isFemale'] and l['isSecName'] else l['secondName'], axis = 1)

# df['lastName'] = sample(D, k = obs)

# df['sexPl']  = df.apply(lambda l: "Mężczyzna" if l['isMale'] else "Kobieta", axis = 1)
# df['sexEng'] = df.apply(lambda l: "Male"      if l['isMale'] else "Female", axis = 1)

# df['sexShortcutPl']  = df.apply(lambda l: "M" if l['isMale'] else "K", axis = 1)
# df['sexShortcutEng'] = df.apply(lambda l: "M" if l['isMale'] else "F", axis = 1)

# df['clientAge'] = pd.Series(np.round(np.random.uniform(min_age,max_age,obs),0)).astype(int)
df

Unnamed: 0,firstName
0,Martin
1,Paulina
2,Cyprian
3,Selena
4,Norbert
5,Amanda
6,Alan
7,Otylia
8,Maya
9,Arleta


In [117]:
# Create client business key 
fnl = ('00' + (df.firstName.str.len() *8).astype(str))
lnl = ('00' + (df.lastName.str.len() *8).astype(str))
cal = ('00' + df.clientAge.astype(str))

df['clientBusinessKey'] = df.firstName.map(str).str.slice(0,1) + df.lastName.map(str).str.slice(0,1) + [w[-3:] for w in fnl] + [w[-3:] for w in lnl] + [w[-3:] for w in cal] + df.sexShortcutPl

In [118]:
df = df.drop(['isSecName', 'isMale', 'isFemale'], axis = 1)
df

Unnamed: 0,firstName,secondName,lastName,sexPl,sexEng,sexShortcutPl,sexShortcutEng,clientAge,clientBusinessKey
0,Karol,,Olsen,Mężczyzna,Male,M,M,85,KO040040085M
1,Oskar,,Erwin,Mężczyzna,Male,M,M,44,OE040040044M
2,Jeremi,,Glemp,Mężczyzna,Male,M,M,23,JG048040023M
3,Małgorzata,,Lang,Kobieta,Female,K,F,47,ML080032047K
4,Erazm,,Alef,Mężczyzna,Male,M,M,59,EA040032059M
5,Bartłomiej,Maciej,Pferdburg,Mężczyzna,Male,M,M,36,BP080072036M
6,Jadwiga,Celina,Deleng,Kobieta,Female,K,F,32,JD056048032K
7,Halina,,Czarny,Kobieta,Female,K,F,71,HC048048071K


In [119]:
# create csv file with given destination

df.columns = column_list
df.to_csv(path_to_file, index = None, header = True)