In [25]:
import ray
from collections import Counter
import time


print("Testrun")

ds = ray.data.read_csv("../sales-data.csv")
print("Input read: ", ds)

"""
Function, welche die Purchases per Customer zählt
"""
@ray.remote
def count_customers(s):
    counter = Counter()
    for record in s.iter_rows():
        counter.update({record["CustomerId"]: 1})
    """
    dictionary = {}

    for record in s.iter_rows():
        if record["CustomerId"] in dictionary:
            dictionary[record["CustomerId"]] = dictionary.get(record["CustomerId"]) + 1
        else:
            dictionary[record["CustomerId"]] = 1
    
    counter = Counter(dictionary)
    """
    return counter

"""Function, welche Counter addiert"""
@ray.remote
def add_counters(fst, snd):
    counter1 = Counter(fst)
    counter2 = Counter(snd)
    return counter1 + counter2


"""
Function, welche die Top 10 Customer findet
"""
@ray.remote
def find_top_10_customers(customers):
    counter = Counter(customers)
    return counter.most_common(10)

start = time.time()

print("Splitting")
split_indices = []
n = 12
for i in range(1,n):
    split_indices.append(i*ds.count()/n)
    
    
print(split_indices)
    
splits = ds.split_at_indices(split_indices)
print("Functioning")
customer_lists = []
for split in splits:
    x = count_customers.remote(split)
    #print(ray.get(x))
    customer_lists.append(x)

while len(customer_lists) > 1:
    customer_lists = customer_lists[2:] + [add_counters.remote(customer_lists[0], customer_lists[1])]

#print(ray.get(customers))
ys = find_top_10_customers.remote(customer_lists[0])
print("Getting")
ys = ray.get(ys)

print(ys)


print(ds.count())

print('duration: ',time.time() - start)

Testrun
Input read:  Dataset(num_blocks=1, num_rows=None, schema={CustomerId: int64, OrderId: int64, ProductId: int64, ProductGroupId: int64, Quantity: double, OrderDate: date32[day]})
Splitting
[440351.9166666667, 880703.8333333334, 1321055.75, 1761407.6666666667, 2201759.5833333335, 2642111.5, 3082463.4166666665, 3522815.3333333335, 3963167.25, 4403519.166666667, 4843871.083333333]
Functioning
Getting
[(5659, 33495), (12836, 32553), (2581, 31195), (12835, 31083), (3851, 30126), (12823, 28934), (9729, 28803), (12788, 28147), (10591, 25964), (47, 24517)]
5284223
duration:  86.36910152435303


In [13]:
import ray
from collections import Counter
import time


print("Testrun")

ds = ray.data.read_csv("../sales-data.csv")
print("Input read: ", ds)

"""
Function, welche die Anzahl an gekauften Produkten pro Monat zählt
"""
@ray.remote
def count_product_month(s):
    counter = Counter()
    for record in s.iter_rows():
        month = record["OrderDate"].strftime("%Y%m")
        counter.update({month: record["Quantity"]})
    """
    dictionary = {}

    for record in s.iter_rows():
        if record["CustomerId"] in dictionary:
            dictionary[record["CustomerId"]] = dictionary.get(record["CustomerId"]) + 1
        else:
            dictionary[record["CustomerId"]] = 1
    
    counter = Counter(dictionary)
    """
    return counter

"""Function, welche Counter addiert"""
@ray.remote
def add_counters(fst, snd):
    counter1 = Counter(fst)
    counter2 = Counter(snd)
    return counter1 + counter2


"""
Function, welche den Monat mit dem Top verkauf findet
"""
@ray.remote
def find_top_month(month):
    counter = Counter(month)
    return counter.most_common(1)

start = time.time()

print("Splitting")
split_indices = []
n = 12
for i in range(1,n):
    split_indices.append(i*ds.count()/n)
    
    
print(split_indices)
    
splits = ds.split_at_indices(split_indices)
print("Functioning")
month_quantity_lists = []
for split in splits:
    x = count_product_month.remote(split)
    #print(ray.get(x))
    month_quantity_lists.append(x)

while len(month_quantity_lists) > 1:
    month_quantity_lists = month_quantity_lists[2:] + [add_counters.remote(month_quantity_lists[0], month_quantity_lists[1])]

ys = find_top_month.remote(month_quantity_lists[0])
print("Getting")
ys = ray.get(ys)

print(ys)


print(ds.count())

print('duration: ',time.time() - start)

Testrun
Input read:  Dataset(num_blocks=1, num_rows=None, schema={CustomerId: int64, OrderId: int64, ProductId: int64, ProductGroupId: int64, Quantity: double, OrderDate: date32[day]})
Splitting
[440351.9166666667, 880703.8333333334, 1321055.75, 1761407.6666666667, 2201759.5833333335, 2642111.5, 3082463.4166666665, 3522815.3333333335, 3963167.25, 4403519.166666667, 4843871.083333333]
Functioning
Getting
[('201711', 75432926.97999977)]
5284223
duration:  125.67712020874023


In [32]:
import ray
from collections import Counter
import time


print("Testrun")

ds = ray.data.read_csv("../testset.csv")
print("Input read: ", ds)

"""
Function, welche die Anzahl an gekauften Produkten pro Jahr zählt
"""
@ray.remote
def count_product_year(s):
    counter = Counter()
    for record in s.iter_rows():
        product_year = (record["ProductId"] , record["OrderDate"].strftime("%Y"))
        counter.update({product_year: int(record["Quantity"])})
    """
    dictionary = {}

    for record in s.iter_rows():
        if record["CustomerId"] in dictionary:
            dictionary[record["CustomerId"]] = dictionary.get(record["CustomerId"]) + 1
        else:
            dictionary[record["CustomerId"]] = 1
    
    counter = Counter(dictionary)
    """
    return counter

"""Function, welche Counter addiert"""
@ray.remote
def add_counters(fst, snd):
    counter1 = Counter(fst)
    counter2 = Counter(snd)
    return counter1 + counter2


"""
Function, welche den Monat mit dem Top verkauf findet
"""
@ray.remote
def find_worst_products_per_year(product_years):
    counter = Counter(product_years)
    years = {}
    for (product, year), count in counter.items():
        if years.get(year, False):
            ct = Counter(years.get(key[1]))
            ct.update(product, count)
            years.update(year, ct)
        else:
            ct = Counter()
            ct.update({product, count})
            years.update({year, ct.items()})
            
    least_bought_products_year = []
    for year in years.items():
        ct = Counter(year)
        least_bought_products_year.append((year, ct.most_common()[:-n-1:-1]))
                                          
    return least_bought_products_year

start = time.time()

print("Splitting")
split_indices = []
n = 12
for i in range(1,n):
    split_indices.append(i*ds.count()/n)
    
    
print(split_indices)
    
splits = ds.split_at_indices(split_indices)
print("Functioning")
year_quantity_lists = []
for split in splits:
    x = count_product_year.remote(split)
    #print(ray.get(x))
    year_quantity_lists.append(x)

while len(month_quantity_lists) > 1:
    year_quantity_lists = year_quantity_lists[2:] + [add_counters.remote(year_quantity_lists[0], year_quantity_lists[1])]

ys = find_worst_products_per_year.remote(year_quantity_lists[0])
print("Getting")
ys = ray.get(ys)

print(ys)


print(ds.count())

print('duration: ',time.time() - start)

Testrun
Input read:  Dataset(num_blocks=1, num_rows=None, schema={CustomerId: int64, OrderId: int64, ProductId: int64, ProductGroupId: int64, Quantity: double, OrderDate: date32[day]})
Splitting
[83333.25, 166666.5, 249999.75, 333333.0, 416666.25, 499999.5, 583332.75, 666666.0, 749999.25, 833332.5, 916665.75]
Functioning
Getting


RayTaskError(TypeError): [36mray::find_worst_products_per_year()[39m (pid=1043, ip=10.0.2.15)
  File "<ipython-input-32-178fbd65fe44>", line 56, in find_worst_products_per_year
TypeError: unhashable type: 'dict_items'