In [7]:
import ray
from collections import Counter
import time


print("Testrun")

ds = ray.data.read_csv("../sales-data.csv")
print("Input read: ", ds)

"""
Function, welche die Purchases per Customer zählt
"""
@ray.remote
def count_customer_orders(s):
    counter = Counter()
    customer_order = None
    for record in s.iter_rows():
        if customer_order == None:
            customer_order = (record["CustomerId"], record["OrderId"])
        elif customer_order[1] != record["OrderId"]:
            counter.update({customer_order[0]: 1})
            customer_order = (record["CustomerId"], record["OrderId"])
        
    counter.update({customer_order[0]: 1})
    """
    dictionary = {}

    for record in s.iter_rows():
        if record["CustomerId"] in dictionary:
            dictionary[record["CustomerId"]] = dictionary.get(record["CustomerId"]) + 1
        else:
            dictionary[record["CustomerId"]] = 1
    
    counter = Counter(dictionary)
    """
    return counter

"""Function, welche Counter addiert"""
@ray.remote
def add_counters(fst, snd):
    counter1 = Counter(fst)
    counter2 = Counter(snd)
    return counter1 + counter2


"""
Function, welche die Top 10 Customer findet
"""
@ray.remote
def find_top_10_customers(customers):
    counter = Counter(customers)
    return counter.most_common(10)

start = time.time()

print("Splitting")
split_indices = []
n = 12
for i in range(1,n):
    split_indices.append(int(i*ds.count()/n))
    
    
print(split_indices)
    
splits = ds.split_at_indices(split_indices)
print("Functioning")
customer_lists = []
for split in splits:
    x = count_customer_orders.remote(split)
    customer_lists.append(x)
    
print(len(customer_lists))

while len(customer_lists) > 1:
    customer_lists = customer_lists[2:] + [add_counters.remote(customer_lists[0], customer_lists[1])]

#print(ray.get(customers))
ys = find_top_10_customers.remote(customer_lists[0])
print("Getting")
ys = ray.get(ys)

print(ys)


print(ds.count())

print('duration: ',time.time() - start)

Testrun
Input read:  Dataset(num_blocks=1, num_rows=None, schema={CustomerId: int64, OrderId: int64, ProductId: int64, ProductGroupId: int64, Quantity: double, OrderDate: date32[day]})
Splitting
[440351, 880703, 1321055, 1761407, 2201759, 2642111, 3082463, 3522815, 3963167, 4403519, 4843871]
Functioning
12
Getting
[(12835, 12656), (15, 11144), (9729, 10336), (10591, 9278), (14, 9117), (5659, 9095), (55, 8914), (2581, 8561), (12823, 8486), (12708, 8424)]
5284223
duration:  86.44616365432739


In [2]:
import ray
from collections import Counter
import time


print("Testrun")

ds = ray.data.read_csv("../sales-data.csv")
print("Input read: ", ds)

"""
Function, welche die Anzahl an gekauften Produkten pro Monat zählt
"""
@ray.remote
def count_product_month(s):
    counter = Counter()
    for record in s.iter_rows():
        month = record["OrderDate"].strftime("%Y%m")
        counter.update({month: record["Quantity"]})
    """
    dictionary = {}

    for record in s.iter_rows():
        if record["CustomerId"] in dictionary:
            dictionary[record["CustomerId"]] = dictionary.get(record["CustomerId"]) + 1
        else:
            dictionary[record["CustomerId"]] = 1
    
    counter = Counter(dictionary)
    """
    return counter

"""Function, welche Counter addiert"""
@ray.remote
def add_counters(fst, snd):
    counter1 = Counter(fst)
    counter2 = Counter(snd)
    return counter1 + counter2


"""
Function, welche den Monat mit den Top Verkäufen findet
"""
@ray.remote
def find_top_month(month):
    counter = Counter(month)
    return counter.most_common(1)

start = time.time()

print("Splitting")
split_indices = []
n = 12
for i in range(1,n):
    split_indices.append(int(i*ds.count()/n))
    
    
print(split_indices)
    
splits = ds.split_at_indices(split_indices)
print("Functioning")
month_quantity_lists = []
for split in splits:
    x = count_product_month.remote(split)
    #print(ray.get(x))
    month_quantity_lists.append(x)

while len(month_quantity_lists) > 1:
    month_quantity_lists = month_quantity_lists[2:] + [add_counters.remote(month_quantity_lists[0], month_quantity_lists[1])]

ys = find_top_month.remote(month_quantity_lists[0])
print("Getting")
ys = ray.get(ys)

print(ys)


print(ds.count())

print('duration: ',time.time() - start)

Testrun
Input read:  Dataset(num_blocks=1, num_rows=None, schema={CustomerId: int64, OrderId: int64, ProductId: int64, ProductGroupId: int64, Quantity: double, OrderDate: date32[day]})
Splitting
[440351, 880703, 1321055, 1761407, 2201759, 2642111, 3082463, 3522815, 3963167, 4403519, 4843871]
Functioning
Getting
[('201711', 75432926.97999977)]
5284223
duration:  130.55493474006653


In [3]:
import ray
from collections import Counter
import time


print("Testrun")

ds = ray.data.read_csv("../sales-data.csv")
print("Input read: ", ds)

"""
Function, welche die Anzahl an gekauften Produkten pro Jahr zählt
"""
@ray.remote
def count_product_year(s):
    counter = Counter()
    for record in s.iter_rows():
        product_year = (record["OrderDate"].strftime("%Y"), record["ProductId"] )
        counter.update({product_year: int(record["Quantity"])})

    return counter

"""Function, welche Counter addiert"""
@ray.remote
def add_counters(fst, snd):
    counter1 = Counter(fst)
    counter2 = Counter(snd)
    return counter1 + counter2


"""
Function, welche die Produkte mit den wenigsten Bestellungen findet
"""
@ray.remote
def find_worst_products_per_year(product_years):
    counter = Counter(product_years)
    least_bought_products_year = []
    currentyear = None
    for (year, product), count in sorted(counter.items()):
        if currentyear == None:
            ct = Counter()
            ct.update((product, count))
            currentyear = (year, ct)
        elif currentyear[0] == year:
            ct = Counter(currentyear [1])
            ct[product] += count
            currentyear = (year, ct)
        else:
            ct = Counter(currentyear[1])
            least_bought_products_year.append((currentyear[0], ct.most_common()[:-n-1:-1]))
            ct = Counter({product, count})
            currentyear = (year, ct)
    
    if currentyear != None:
            ct = Counter(currentyear[1])
            least_bought_products_year.append((currentyear[0], ct.most_common()[:-n-1:-1]))
    
    """
        if years.get(year, False):
            ct = Counter(years.get(key[1]))
            ct.update(product, count)
            years.append(year, ct)
        else:
            ct = Counter()
            ct.update({product, count})
            years.update({year, ct.items()})
            
            
        for year in years.items():
            ct = Counter(year)
            least_bought_products_year.append((year, ct.most_common()[:-n-1:-1]))
    """                          
    return least_bought_products_year

start = time.time()

print("Splitting")
split_indices = []
n = 12
for i in range(1,n):
    split_indices.append(int(i*ds.count()/n))
    
    
print(split_indices)
    
splits = ds.split_at_indices(split_indices)
print("Functioning")
year_quantity_lists = []
for split in splits:
    x = count_product_year.remote(split)
    #print(ray.get(x))
    year_quantity_lists.append(x)

while len(year_quantity_lists) > 1:
    year_quantity_lists = year_quantity_lists[2:] + [add_counters.remote(year_quantity_lists[0], year_quantity_lists[1])]

ys = find_worst_products_per_year.remote(year_quantity_lists[0])
print("Getting")
ys = ray.get(ys)

print(ys)


print(ds.count())

print('duration: ',time.time() - start)

Testrun
Input read:  Dataset(num_blocks=1, num_rows=None, schema={CustomerId: int64, OrderId: int64, ProductId: int64, ProductGroupId: int64, Quantity: double, OrderDate: date32[day]})
Splitting
[440351, 880703, 1321055, 1761407, 2201759, 2642111, 3082463, 3522815, 3963167, 4403519, 4843871]
Functioning
Getting
[('2015', [(53, 1), (48222, 1), (116532, 191)]), ('2016', [(158083, 1), (140594, 1), (140593, 1), (140559, 1), (140533, 1), (140499, 1), (140140, 1), (140138, 1), (139887, 1), (139563, 1), (137258, 1), (131949, 1)]), ('2017', [(171896, 1), (171895, 1), (171879, 1), (171876, 1), (158936, 1), (156619, 1), (156549, 1), (156382, 1), (155896, 1), (155720, 1), (155513, 1), (155512, 1)]), ('2018', [(172253, 1), (172252, 1), (172251, 1), (172245, 1), (172238, 1), (172237, 1), (172233, 1), (172228, 1), (172227, 1), (172226, 1), (172224, 1), (172222, 1)])]
5284223
duration:  1206.540923833847


In [4]:
import ray
from collections import Counter
import time


print("Testrun")

ds = ray.data.read_csv("../sales-data.csv")
print("Input read: ", ds)

"""
Function, welche die Anzahl an Bestellungen für ein Produkt pro Jahr zählt
"""
@ray.remote
def count_productorders_year(s):
    counter = Counter()
    for record in s.iter_rows():
        productorders_year = (record["OrderDate"].strftime("%Y"), record["ProductId"] )
        counter.update({productorders_year: 1})

    return counter

"""Function, welche Counter addiert"""
@ray.remote
def add_counters(fst, snd):
    counter1 = Counter(fst)
    counter2 = Counter(snd)
    return counter1 + counter2


"""
Function, welche die Produkte mit den wenigsten Bestellungen findet
"""
@ray.remote
def find_worst_products_per_year(product_years):
    counter = Counter(product_years)
    least_bought_products_year = []
    currentyear = None
    for (year, product), count in sorted(counter.items()):
        if currentyear == None:
            ct = Counter()
            ct.update((product, count))
            currentyear = (year, ct)
        elif currentyear[0] == year:
            ct = Counter(currentyear[1])
            ct[product] += count
            currentyear = (year, ct)
        else:
            ct = Counter(currentyear[1])
            least_bought_products_year.append((currentyear[0], ct.most_common()[:-n-1:-1]))
            ct = Counter({product, count})
            currentyear = (year, ct)
    
    if currentyear != None:
            ct = Counter(currentyear[1])
            least_bought_products_year.append((currentyear[0], ct.most_common()[:-n-1:-1]))
                   
    return least_bought_products_year

start = time.time()

print("Splitting")
split_indices = []
n = 12
for i in range(1,n):
    split_indices.append(int(i*ds.count()/n))
    
    
print(split_indices)
    
splits = ds.split_at_indices(split_indices)
print("Functioning")
year_order_amount_lists = []
for split in splits:
    x = count_productorders_year.remote(split)
    #print(ray.get(x))
    year_order_amount_lists.append(x)

while len(year_order_amount_lists) > 1:
    year_order_amount_lists = year_order_amount_lists[2:] + [add_counters.remote(year_order_amount_lists[0], year_order_amount_lists[1])]

ys = find_worst_products_per_year.remote(year_order_amount_lists[0])
print("Getting")
ys = ray.get(ys)

print(ys)


print(ds.count())

print('duration: ',time.time() - start)

Testrun
Input read:  Dataset(num_blocks=1, num_rows=None, schema={CustomerId: int64, OrderId: int64, ProductId: int64, ProductGroupId: int64, Quantity: double, OrderDate: date32[day]})
Splitting
[440351, 880703, 1321055, 1761407, 2201759, 2642111, 3082463, 3522815, 3963167, 4403519, 4843871]
Functioning
Getting
[('2015', [(116532, 1), (1, 1), (48222, 1)]), ('2016', [(158083, 1), (155906, 1), (150304, 1), (150303, 1), (150302, 1), (140594, 1), (140593, 1), (140566, 1), (140559, 1), (140558, 1), (140556, 1), (140538, 1)]), ('2017', [(172086, 1), (172024, 1), (172002, 1), (172001, 1), (171999, 1), (171905, 1), (171896, 1), (171895, 1), (171887, 1), (171886, 1), (171885, 1), (171884, 1)]), ('2018', [(172253, 1), (172252, 1), (172251, 1), (172250, 1), (172249, 1), (172248, 1), (172245, 1), (172244, 1), (172243, 1), (172242, 1), (172241, 1), (172238, 1)])]
5284223
duration:  1053.9904515743256


In [5]:
import ray
from collections import Counter
import time
import datetime


print("Testrun")

ds = ray.data.read_csv("../sales-data.csv")
print("Input read: ", ds)

"""
Function, welche die Anzahl an Orders für ein Produkt in der Vorweihnachtszeit zählt
"""
@ray.remote
def count_product_year(s):
    counter = Counter()
    for record in s.iter_rows():
        orderdate = datetime.datetime.strptime(record["OrderDate"].strftime("%Y-%m-%d"), "%Y-%m-%d")
        start = datetime.datetime(day=16,month=11,year=int(orderdate.strftime("%Y")))
        end = datetime.datetime(day=23,month=12,year=int(orderdate.strftime("%Y")))
        if start <= orderdate <= end:    
            product_year = (orderdate.strftime("%Y"), record["ProductId"] )
            counter.update({product_year: 1})

    return counter

"""Function, welche Counter addiert"""
@ray.remote
def add_counters(fst, snd):
    counter1 = Counter(fst)
    counter2 = Counter(snd)
    return counter1 + counter2


"""
Function, welche die Produkte mit den meisten Bestellungen im Jahr findet
Eingabewerte sind alle nur in der Vorweihnachtszeit
"""
@ray.remote
def find_best_products_per_year(product_years):
    counter = Counter(product_years)
    least_bought_products_year = []
    currentyear = None
    for (year, product), count in sorted(counter.items()):
        if currentyear == None:
            ct = Counter()
            ct.update((product, count))
            currentyear = (year, ct)
        elif currentyear[0] == year:
            ct = Counter(currentyear [1])
            ct[product] += count
            currentyear = (year, ct)
        else:
            ct = Counter(currentyear[1])
            least_bought_products_year.append((currentyear[0], ct.most_common(3)))
            ct = Counter({product, count})
            currentyear = (year, ct)
    
    if currentyear != None:
            ct = Counter(currentyear[1])
            least_bought_products_year.append((currentyear[0], ct.most_common(3)))
                   
    return least_bought_products_year

start = time.time()

print("Splitting")
split_indices = []
n = 12
for i in range(1,n):
    split_indices.append(int(i*ds.count()/n))
    
    
print(split_indices)
    
splits = ds.split_at_indices(split_indices)
print("Functioning")
year_quantity_lists = []
for split in splits:
    x = count_product_year.remote(split)
    #print(ray.get(x))
    year_quantity_lists.append(x)

while len(year_quantity_lists) > 1:
    year_quantity_lists = year_quantity_lists[2:] + [add_counters.remote(year_quantity_lists[0], year_quantity_lists[1])]

ys = find_best_products_per_year.remote(year_quantity_lists[0])
print("Getting")
ys = ray.get(ys)

print(ys)


print(ds.count())

print('duration: ',time.time() - start)

Testrun
Input read:  Dataset(num_blocks=1, num_rows=None, schema={CustomerId: int64, OrderId: int64, ProductId: int64, ProductGroupId: int64, Quantity: double, OrderDate: date32[day]})
Splitting
[440351, 880703, 1321055, 1761407, 2201759, 2642111, 3082463, 3522815, 3963167, 4403519, 4843871]
Functioning
Getting
[('2016', [(22307, 818), (21346, 302), (22275, 258)]), ('2017', [(22307, 1199), (21346, 981), (1289, 372)]), ('2018', [(22307, 643), (40, 437), (682, 408)])]
5284223
duration:  359.08482098579407


In [21]:
import ray
from collections import Counter
import time


print("Testrun")

ds = ray.data.read_csv("../sales-data.csv")
print("Input read: ", ds)

"""
Function, welche die Menge an gekauften Produkten pro Jahr zählt
"""
@ray.remote
def count_product_year(s):
    counter = Counter()
    for record in s.iter_rows():
        year = int(record["OrderDate"].strftime("%Y"))
        counter.update({year: int(record["Quantity"])})

    return counter

"""Function, welche Counter addiert"""
@ray.remote
def add_counters(fst, snd):
    counter1 = Counter(fst)
    counter2 = Counter(snd)
    return counter1 + counter2


"""
Function, welche die Wachstumsraten berechnet (Streng gesehen noch -100% zu nehmen)
"""
@ray.remote
def calculate_growthrate(product_amount_year):
    counter = Counter(product_amount_year)
    year_amount = sorted(counter.items())
    old_year = None
    growth_rates = []
    for year, amount in year_amount:
        if old_year == None:
            old_year = (year, amount, "100%", ("previous", None))
        elif old_year[0] != year:
            growth_rates.append(old_year)
            old_year = (year, amount, str(int((amount/old_year[1])*100)) + "%", ("previous", old_year[1]))

    growth_rates.append(old_year)
    return growth_rates

start = time.time()

print("Splitting")
split_indices = []
n = 12
for i in range(1,n):
    split_indices.append(int(i*ds.count()/n))
    
    
print(split_indices)
    
splits = ds.split_at_indices(split_indices)
print("Functioning")
year_quantity_lists = []
for split in splits:
    x = count_product_year.remote(split)
    #print(ray.get(x))
    year_quantity_lists.append(x)

while len(year_quantity_lists) > 1:
    year_quantity_lists = year_quantity_lists[2:] + [add_counters.remote(year_quantity_lists[0], year_quantity_lists[1])]

growthrates = calculate_growthrate.remote(year_quantity_lists[0])
print("Getting")
growthrates = ray.get(growthrates)

print(growthrates)


print(ds.count())

print('duration: ',time.time() - start)

Testrun
Input read:  Dataset(num_blocks=1, num_rows=None, schema={CustomerId: int64, OrderId: int64, ProductId: int64, ProductGroupId: int64, Quantity: double, OrderDate: date32[day]})
Splitting
[440351, 880703, 1321055, 1761407, 2201759, 2642111, 3082463, 3522815, 3963167, 4403519, 4843871]
Functioning
Getting
[(2015, 244, '100%', ('previous', None)), (2016, 32455923, '13301607%', ('previous', 244)), (2017, 800167653, '2465%', ('previous', 32455923)), (2018, 732047719, '91%', ('previous', 800167653))]
5284223
duration:  240.67668509483337
