# Data Engineering 1: Lab 06 - Solution
---------------

### Task 01: Consistent Hashing
#### Fill the parts marked with Todo for a complete implementation of Consistent Hashing

In [1]:
!pip3 install names



In [7]:
import names
import pandas as pd
import numpy as np
from collections import Counter

n_names = 10000
n_servers = 3

#returns list of size n with random names
def getData(n):
    data = []
    i = 0
    while i < n:
        data.append(names.get_first_name())
        i += 1
    return data

#returns list of hash values for list of data
def getHashs(data):
    hashs = []
    for word in data:
        hashs.append(abs(hash(word)) % (10 ** 8))
    return hashs

#returns list of size m with random server names
def getServers(m):
    data = []
    i = 0
    while i < m:
        data.append('Server_' + names.get_first_name())
        i += 1
    return data

#TodO: create a DataFrame with the list created with the function getdata(100) 
# and a type column filled with the type key
data = getData(n_names)
df = pd.DataFrame(data, columns=['name'])
df['type'] = 'key'

#Todo: add a column to the DataFrame with the hashs created with the getHashs() method
hashs = getHashs(data)
df['hashvalue'] = hashs

#Todo: create m servers with the getServers function and add them to the DataFrame with the type server
servers = getServers(n_servers)
for server in servers:
    df2 = pd.DataFrame({'name': server, 'hashvalue': abs(hash(server)) % (10 ** 8), 'type': 'server'}, index=[df.shape[0]])
    df = pd.concat([df,df2])
    #df = df.append(df2, ignore_index = True)

#Todo: add a column to the DataFrame with the angle on the circle
def getAngles(hashs):
    angles = []
    maxh = max(hashs)
    for hashi in hashs:
        angle = (hashi / maxh) * 360.0
        angles.append(angle)
    return angles
        
angles = getAngles(df['hashvalue'])
df['angle'] = angles

#Todo: add a column to the DataFrame with the corresponding server
def get_closest_server(angle, server_angles, server_names):
    a = angle - server_angles
    a = np.abs((a + 180) % 360 - 180)
    return server_names[a == a.min()][0]
    

def get_server_assotiation(df):
    server_inds = np.array(df.index[df.type != 'key'])
    server_names = np.array(df.name.values[server_inds])
    server_angles = np.array(df.angle.values[df.type != 'key'])
    server_assotiated = [get_closest_server(angle, server_angles, server_names) for angle in df.angle.values] 
    df['server'] = server_assotiated
    return server_assotiated

server_assotiated = get_server_assotiation(df)
df['server'] = server_assotiated

df.sort_values(by=['angle'], inplace=True)
print('Unequal distribution:')
print(Counter(df.server.values))
#Todo: print the final DataFrame
df


Unequal distribution:
Counter({'Server_Ruth': 4100, 'Server_Janice': 3498, 'Server_Carl': 2405})


Unnamed: 0,name,type,hashvalue,angle,server
1576,Charlsie,key,153817,0.553934,Server_Janice
9853,Danny,key,190097,0.684587,Server_Janice
4998,Danny,key,190097,0.684587,Server_Janice
6603,Danny,key,190097,0.684587,Server_Janice
14,Danny,key,190097,0.684587,Server_Janice
...,...,...,...,...,...
2523,Harvey,key,99946433,359.932181,Server_Janice
7143,Harvey,key,99946433,359.932181,Server_Janice
8620,Harvey,key,99946433,359.932181,Server_Janice
7469,Colby,key,99965265,360.000000,Server_Janice


### Task 02: Balanced Consistent Hashing
#### Extend the code from Task 01 to ensure object keys are evenly distributed among servers

In [11]:
import names
import pandas as pd
from collections import Counter

#returns list of size n with random names
def getData(n):
    data = []
    i = 0
    while i < n:
        data.append(names.get_first_name())
        i += 1
    return data

#returns list of hash values for list of data
def getHashs(data, granulat):
    hashs = []
    for word in data:
        hashs.append(abs(hash(word)) % (granulat))
    return hashs

#returns list of size m with random server names
def getServers(m):
    data = []
    i = 0
    while i < m:
        data.append('Server_' + names.get_first_name())
        i += 1
    return data

n_names = 10
n_servers = 3
granulat = 10 ** 5

data = getData(n_names)
df = pd.DataFrame(data, columns=['name'])
df['type'] = 'key'

#Todo: add a column to the DataFrame with the hashs created with the getHashs() method
hashs = getHashs(data, granulat)
df['hashvalue'] = hashs

#Todo: create m servers with the getServers function and add them to the DataFrame with the type server
servers = getServers(n_servers)
for i in range(len(servers)):
    server = servers[i]
    df2 = pd.DataFrame({'name': server, 'hashvalue': int(i*granulat/n_servers + 1), 'type': 'server'}, index=[df.shape[0]])
    df = pd.concat([df,df2])

#Todo: add a column to the DataFrame with the angle on the circle
def getAngles(hashs):
    angles = []
    maxh = max(hashs)
    for hashi in hashs:
        angle = (hashi / maxh) * 360.0
        angles.append(angle)
    return angles
        
angles = getAngles(df['hashvalue'])
df['angle'] = angles

def get_closest_server(angle, server_angles, server_names):
    a = angle - server_angles
    a = np.abs((a + 180) % 360 - 180)
    return server_names[a == a.min()][0]
    

def get_server_assotiation(df):
    server_inds = np.array(df.index[df.type != 'key'])
    server_names = np.array(df.name.values[server_inds])
    server_angles = np.array(df.angle.values[df.type != 'key'])
    server_assotiated = [get_closest_server(angle, server_angles, server_names) for angle in df.angle.values] 
    df['server'] = server_assotiated
    return server_assotiated


server_assotiated = get_server_assotiation(df)
df['server'] = server_assotiated


df.sort_values(by=['angle'], inplace=True)
print('Equal distribution:')
print(Counter(df.server.values))
#Todo: print the final DataFrame
df

Equal distribution:
Counter({'Server_Gilbert': 5, 'Server_Charles': 4, 'Server_Richard': 4})


Unnamed: 0,name,type,hashvalue,angle,server
10,Server_Charles,server,1,0.003617,Server_Charles
6,Jerry,key,7657,27.693142,Server_Charles
1,Sherman,key,13979,50.557978,Server_Charles
11,Server_Richard,server,33334,120.559384,Server_Richard
8,Joan,key,39073,141.315678,Server_Richard
7,Paul,key,43703,158.061042,Server_Richard
2,Rose,key,44601,161.308847,Server_Richard
0,Barbara,key,50283,181.858989,Server_Gilbert
5,Douglas,key,50341,182.068758,Server_Gilbert
4,Patrick,key,60329,218.192449,Server_Gilbert


In [52]:
import names
import pandas as pd
from collections import Counter

#returns list of size n with random names
def getData(n):
    data = []
    i = 0
    while i < n:
        data.append(names.get_first_name())
        i += 1
    return data

#returns list of hash values for list of data
def getHashs(data, granulat):
    hashs = []
    for word in data:
        hashs.append(abs(hash(word)) % (granulat))
    return hashs

#returns list of size m with random server names
def getServers(m):
    data = []
    i = 0
    multiply = 3
    while i < m:
        j = 0
        sname = names.get_first_name()
        while j < multiply:
            data.append('Server_' + str(j) + '_' + sname)
            j += 1
        i += 1
    return data

n_names = 10
n_servers = 3
granulat = 10 ** 5

data = getData(n_names)
df = pd.DataFrame(data, columns=['name'])
df['type'] = 'key'
df['label'] = 'key'

#Todo: add a column to the DataFrame with the hashs created with the getHashs() method
hashs = getHashs(data, granulat)
df['hashvalue'] = hashs

#Todo: create m servers with the getServers function and add them to the DataFrame with the type server
servers = getServers(n_servers)
for i in range(len(servers)):
    server = servers[i]
    sname = server.split('_')[2]
    df2 = pd.DataFrame({'name': server, 'hashvalue': abs(hash(server)) % (10 ** 8), 'type': 'server', 'label': sname}, index=[df.shape[0]])
    df = pd.concat([df,df2])

#Todo: add a column to the DataFrame with the angle on the circle
def getAngles(hashs):
    angles = []
    maxh = max(hashs)
    for hashi in hashs:
        angle = (hashi / maxh) * 360.0
        angles.append(angle)
    return angles
        
angles = getAngles(df['hashvalue'])
df['angle'] = angles

def get_closest_server(angle, server_angles, server_names):
    a = angle - server_angles
    a = np.abs((a + 180) % 360 - 180)
    return server_names[a == a.min()][0]
    

def get_server_assotiation(df):
    server_inds = np.array(df.index[df.type != 'key'])
    server_names = np.array(df.name.values[server_inds])
    server_angles = np.array(df.angle.values[df.type != 'key'])
    server_assotiated = [get_closest_server(angle, server_angles, server_names) for angle in df.angle.values] 
    df['server'] = server_assotiated
    return server_assotiated

server_assotiated = get_server_assotiation(df)
df['server'] = server_assotiated

labels = []
for index, row in df.iterrows():
    labels.append(row['server'].split('_')[2])

print(labels)

df['label'] = labels
df
#df.sort_values(by=['angle'], inplace=True)
#print('Equal distribution:')
#print(Counter(df.label.values))
#df.groupby(['label']).size()
#Todo: print the final DataFrame

['Eugene', 'Eugene', 'Eugene', 'Eugene', 'Eugene', 'Eugene', 'Eugene', 'Eugene', 'Eugene', 'Eugene', 'Harlan', 'Harlan', 'Harlan', 'Eugene', 'Eugene', 'Eugene', 'Andrea', 'Andrea', 'Andrea']


Unnamed: 0,name,type,label,hashvalue,angle,server
0,Benny,key,Eugene,97333,0.437941,Server_1_Eugene
1,Richard,key,Eugene,53587,0.24111,Server_1_Eugene
2,Charles,key,Eugene,44437,0.19994,Server_1_Eugene
3,Gordon,key,Eugene,47739,0.214797,Server_1_Eugene
4,Wendi,key,Eugene,80126,0.360519,Server_1_Eugene
5,Charles,key,Eugene,44437,0.19994,Server_1_Eugene
6,Christy,key,Eugene,50709,0.22816,Server_1_Eugene
7,Judith,key,Eugene,31871,0.143401,Server_1_Eugene
8,Louisa,key,Eugene,76086,0.342342,Server_1_Eugene
9,Brian,key,Eugene,65183,0.293285,Server_1_Eugene


### Task 03: Weighted Balanced Consistent Hashing
#### Extend the code from Task 01 to ensure object keys are evenly distributed among servers but also with a weighting of the single servers.

In [48]:
n_names = 10
n_servers = 3
granulat = 10 ** 5

data = getData(n_names)
df = pd.DataFrame(data, columns=['name'])
df['type'] = 'key'

#Todo: add a column to the DataFrame with the hashs created with the getHashs() method
hashs = getHashs(data, granulat)
df['hashvalue'] = hashs

#Todo: create m servers with the getServers function and add them to the DataFrame with the type server
servers = getServers(n_servers)
for i in range(len(servers)):
    server = servers[i]
    df2 = pd.DataFrame({'name': server, 'hashvalue': int(i*granulat/n_servers + 1), 'type': 'server'}, index=[df.shape[0]])
    df = pd.concat([df,df2])
    
angles = getAngles(df['hashvalue'])
df['angle'] = angles

def get_closest_server(angle, server_angles, server_names):
    a = angle - server_angles
    a = np.abs((a + 180) % 360 - 180)
    
    output = list(server_names[a == a.min()])
    
    weights = 1/a
    weights = weights/weights.sum()
    output.extend([item for item in weights])
    return tuple(output)
    

def get_server_assotiation(df):
    server_inds = np.array(df.index[df.type != 'key'])
    server_names = np.array(df.name.values[server_inds])
    server_angles = np.array(df.angle.values[df.type != 'key'])
    server_assotiated = [get_closest_server(angle, server_angles, server_names) for angle in df.angle.values] 
    columns = ['server']
    columns.extend(list(server_names))
    df1 = pd.DataFrame(server_assotiated, columns = columns, index=df.index)
    df  = pd.concat([df,df1], axis=1)
    return df


df = get_server_assotiation(df)
#df['server'] = server_assotiated

df.sort_values(by=['angle'], inplace=True)
df.replace({np.nan:1})
print('Equal distribution:')
print(Counter(df.server.values))
#Todo: print the final DataFrame
df

Equal distribution:
Counter({'Server_2_Katie': 5, 'Server_1_Katie': 4, 'Server_0_Joe': 4, 'Server_0_Katie': 1, 'Server_1_Joe': 1, 'Server_2_Joe': 1, 'Server_0_Jordan': 1, 'Server_1_Jordan': 1, 'Server_2_Jordan': 1})


  weights = 1/a
  weights = weights/weights.sum()


Unnamed: 0,name,type,hashvalue,angle,server,Server_0_Katie,Server_1_Katie,Server_2_Katie,Server_0_Joe,Server_1_Joe,Server_2_Joe,Server_0_Jordan,Server_1_Jordan,Server_2_Jordan
10,Server_0_Katie,server,1,0.00135,Server_0_Katie,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,Frank,key,24599,33.208608,Server_1_Katie,0.146099,0.411418,0.085427,0.047661,0.03305,0.028842,0.039377,0.062034,0.146093
11,Server_1_Katie,server,33334,45.000844,Server_1_Katie,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Mary,key,49718,67.119216,Server_1_Katie,0.090892,0.275811,0.266617,0.089869,0.054043,0.03864,0.038827,0.054411,0.09089
7,Mary,key,49718,67.119216,Server_1_Katie,0.090892,0.275811,0.266617,0.089869,0.054043,0.03864,0.038827,0.054411,0.09089
6,Teresa,key,51160,69.065914,Server_2_Katie,0.088327,0.25349,0.291398,0.092519,0.05499,0.039121,0.038351,0.05348,0.088325
3,Christopher,key,51635,69.707163,Server_2_Katie,0.087345,0.246432,0.300024,0.093246,0.055202,0.039206,0.038123,0.053078,0.087343
2,Sonia,key,56718,76.569204,Server_2_Katie,0.072557,0.175985,0.413632,0.095077,0.053712,0.037429,0.033353,0.045699,0.072556
12,Server_2_Katie,server,66667,90.000337,Server_2_Katie,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0
9,Angela,key,70826,95.61498,Server_2_Katie,0.037813,0.07143,0.643922,0.091793,0.042844,0.027943,0.020732,0.025711,0.037812
