In [1]:
import ray
ray.init()

2022-01-29 15:22:31,683	INFO services.py:1338 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '127.0.0.1',
 'raylet_ip_address': '127.0.0.1',
 'redis_address': '127.0.0.1:6379',
 'object_store_address': 'tcp://127.0.0.1:55276',
 'raylet_socket_name': 'tcp://127.0.0.1:54799',
 'webui_url': '127.0.0.1:8265',
 'session_dir': 'C:\\Users\\Florian\\AppData\\Local\\Temp\\ray\\session_2022-01-29_15-22-27_879593_10108',
 'metrics_export_port': 60006,
 'node_id': '768d6af8f2beb0d66c1910df05c1dae7bc0592d3ad985ad530ed4712'}

In [2]:
%%time
import ray

""" Read the data """
ds = ray.data.read_csv("../sales-data.csv")

CPU times: total: 62.5 ms
Wall time: 2.79 s


# Top 10 Customers

## Ray Datasets

In [4]:
%%time
import ray
from collections import Counter

""" Customer Top 10 """
@ray.remote
def count_customer_orders(s):
    counter = Counter()
    customer_order = None
    for record in s.iter_rows():
        if customer_order == None:
            customer_order = (record["CustomerId"], record["OrderId"])
        elif customer_order[1] != record["OrderId"]:
            counter.update({customer_order[0]: 1})
            customer_order = (record["CustomerId"], record["OrderId"])
    counter.update({customer_order[0]: 1})
    return counter

@ray.remote
def add_counters(fst, snd):
    counter1 = Counter(fst)
    counter2 = Counter(snd)
    return counter1 + counter2

@ray.remote
def find_top_10_customers(customers):
    counter = Counter(customers)
    return counter.most_common(10)

split_indices = []
n = 12
for i in range(1,n):
    split_indices.append(int(i*ds.count()/n))

splits = ds.split_at_indices(split_indices)
customer_lists = []
for split in splits:
    x = count_customer_orders.remote(split)
    customer_lists.append(x)

while len(customer_lists) > 1:
    customer_lists = customer_lists[2:] + [add_counters.remote(customer_lists[0], customer_lists[1])]

ys = find_top_10_customers.remote(customer_lists[0])
ray.get(ys)

CPU times: total: 172 ms
Wall time: 24 s


[(12835, 12656),
 (15, 11144),
 (9729, 10336),
 (10591, 9278),
 (14, 9117),
 (5659, 9095),
 (55, 8914),
 (2581, 8561),
 (12823, 8486),
 (12708, 8424)]

## Plain Python implementation

In [5]:
%%time
def read_dataset(filename):
    file = open(filename, 'r')
    return file.readlines()

lines = read_dataset("../sales-data.txt")

CPU times: total: 1.2 s
Wall time: 1.83 s


In [8]:
%%time
from collections import Counter

class CustomerCounter:
    def __init__(self):
        self.customers = {}

    def add(self, customer, order):
        if customer in self.customers:
            self.customers[customer].add(order)
        else:
            self.customers[customer] = set(order)

    def get_count(self):
        counted = {}
        for key in self.customers:
            counted[key] = len(self.customers[key])
        return Counter(counted)

def find_top_10_customers(lines):
    customer_counter = CustomerCounter()
    for i in range(1, len(lines)):
        parts = lines[i].split(';')
        customer_counter.add(parts[0], parts[1])
    print(customer_counter.get_count().most_common(10))

find_top_10_customers(lines)

[('12835', 11811), ('15', 11143), ('9729', 10242), ('14', 9117), ('55', 8917), ('12708', 7919), ('69', 7601), ('12823', 7516), ('16', 7473), ('3493', 7459)]
CPU times: total: 4.73 s
Wall time: 4.88 s


## Ray Actors

In [18]:
%%time
from collections import Counter
import ray

@ray.remote
class CustomerCounter:
    def __init__(self):
        self.customers = {}

    def add(self, customer, order):
        if customer in self.customers:
            self.customers[customer].add(order)
        else:
            self.customers[customer] = set(order)

    def get_count(self):
        for key in self.customers:
            self.customers[key] = len(self.customers[key])
        return Counter(self.customers)

    def process_chunk(self, chunk):
        for line in chunk:
            parts = line.split(';')
            self.add(parts[0], parts[1])

    def merge(self, customer_dict):
        for key in customer_dict:
            if key in self.customers:
                self.customers[key].update(customer_dict[key])
            else:
                self.customers[key] = customer_dict[key]

    def as_dict(self):
        return self.customers


def find_top_10_customers(lines):
    chunks = 4
    chunk_size = int(len(lines) / chunks + 1)
    counters = []
    for i in range(chunks):
        customer_counter = CustomerCounter.remote()
        start_index = i * chunk_size
        chunk = lines[start_index:start_index + chunk_size]
        customer_counter.process_chunk.remote(chunk)
        counters.append(customer_counter)
    counter = counters[0]
    for i in range(1, chunks):
        counter.merge.remote(counters[i].as_dict.remote())
    top10 = counter.get_count.remote()
    print(ray.get(top10).most_common(10))

find_top_10_customers(lines)

[('12835', 11813), ('15', 11148), ('9729', 10247), ('14', 9120), ('55', 8919), ('12708', 7921), ('69', 7603), ('12823', 7518), ('16', 7480), ('3493', 7464)]
CPU times: total: 734 ms
Wall time: 5.92 s


## Pandas

In [3]:
%%time
import pandas as pd

df = pd.read_csv("../sales-data.csv", parse_dates=["OrderDate"])

CPU times: total: 3.66 s
Wall time: 4.11 s


In [33]:
%%time
import pandas as pd

"""Top 10 customers"""
df.drop_duplicates(["CustomerId", "OrderId"]).groupby(by=["CustomerId"]).count().sort_values(by="OrderId", ascending=False).head(10)["OrderId"]

CPU times: total: 719 ms
Wall time: 878 ms


CustomerId
12835    11805
15       11141
9729     10239
14        9116
55        8914
12708     7915
69        7599
12823     7512
16        7471
3493      7455
Name: OrderId, dtype: int64

## Modin Pandas on Ray

In [4]:
%%time
import modin.pandas as mpd

mdf = mpd.read_csv("../sales-data.csv", parse_dates=["OrderDate"])


CPU times: total: 562 ms
Wall time: 4.82 s


In [35]:
%%time

""" Top 10 Customers"""
mdf.drop_duplicates(["CustomerId", "OrderId"]).groupby(by=["CustomerId"]).count().sort_values(by="OrderId", ascending=False).head(10)["OrderId"]



CPU times: total: 9.81 s
Wall time: 37.2 s


To request implementation, send an email to feature_requests@modin.org.


CustomerId
12835    11805
15       11141
9729     10239
14        9116
55        8914
12708     7915
69        7599
12823     7512
16        7471
3493      7455
Name: OrderId, dtype: int64

# Bester Monat

## Ray Datasets

In [41]:
%%time
import ray
from collections import Counter

"""
Anzahl an gekauften Produkten pro Monat
"""
@ray.remote
def count_product_month(s):
    counter = Counter()
    for record in s.iter_rows():
        month = record["OrderDate"].strftime("%Y%m")
        counter.update({month: record["Quantity"]})
    return counter

@ray.remote
def add_counters(fst, snd):
    counter1 = Counter(fst)
    counter2 = Counter(snd)
    return counter1 + counter2

@ray.remote
def find_top_month(month):
    counter = Counter(month)
    return counter.most_common(1)

split_indices = []
n = 12
for i in range(1,n):
    split_indices.append(int(i*ds.count()/n))

splits = ds.split_at_indices(split_indices)
month_quantity_lists = []
for split in splits:
    x = count_product_month.remote(split)
    month_quantity_lists.append(x)

while len(month_quantity_lists) > 1:
    month_quantity_lists = month_quantity_lists[2:] + [add_counters.remote(month_quantity_lists[0], month_quantity_lists[1])]

ys = find_top_month.remote(month_quantity_lists[0])
ray.get(ys)

CPU times: total: 500 ms
Wall time: 47.6 s


[('201711', 75432926.97999977)]

## Pandas

In [48]:
%%time

df["OrderMonth"] = df["OrderDate"].dt.to_period('M')
df.groupby(by='OrderMonth').sum().sort_values(by="Quantity", ascending=False).head(10)["Quantity"]

CPU times: total: 875 ms
Wall time: 1.06 s


OrderMonth
2017-11    7.543293e+07
2017-03    7.459200e+07
2017-10    7.223356e+07
2017-06    7.191025e+07
2017-05    6.984081e+07
2018-06    6.936298e+07
2018-10    6.857377e+07
2018-02    6.821900e+07
2017-02    6.666570e+07
2018-03    6.664240e+07
Freq: M, Name: Quantity, dtype: float64

## Modin

In [68]:
%%time

mdf["OrderMonth"] = mdf["OrderDate"].dt.to_period('M')
mdf.groupby(by='OrderMonth').sum().sort_values(by="Quantity", ascending=False).head(10)["Quantity"]

CPU times: total: 93.8 ms
Wall time: 1.33 s




OrderMonth
2017-11    7.543293e+07
2017-03    7.459200e+07
2017-10    7.223356e+07
2017-06    7.191025e+07
2017-05    6.984081e+07
2018-06    6.936298e+07
2018-10    6.857377e+07
2018-02    6.821900e+07
2017-02    6.666570e+07
2018-03    6.664240e+07
Name: Quantity, dtype: float64

# Produkte mit den wenigsten Bestellungen pro Jahr

## Pandas

In [7]:
%%time

""" Produkte die pro Jahr am wenigsten gekauft wurden - nach Anzahl Orders von Produkt """
df["Year"] = df["OrderDate"].dt.to_period("Y")
years = df.sort_values(by="OrderDate")["Year"].unique()

for year in years:
    print("Year " + str(year))
    products_year = df.groupby(by=["Year"]).get_group(year)
    print(products_year.groupby(by=["ProductId"]).count().sort_values(by=["Quantity", "ProductId"]).head(10)["OrderId"])
    print()


Year 2015
ProductId
48222     1
116532    1
Name: OrderId, dtype: int64

Year 2016
ProductId
2     1
9     1
16    1
17    1
18    1
38    1
39    1
43    1
50    1
56    1
Name: OrderId, dtype: int64

Year 2017
ProductId
231     1
429     1
441     1
1014    1
1342    1
1413    1
1621    1
1663    1
1675    1
1709    1
Name: OrderId, dtype: int64

Year 2018
ProductId
204     1
248     1
429     1
469     1
593     1
853     1
935     1
939     1
978     1
1019    1
Name: OrderId, dtype: int64

CPU times: total: 2.98 s
Wall time: 3.25 s


## Modin

In [10]:
%%time

mdf["Year"] = mdf["OrderDate"].dt.to_period("Y")
years = mdf.sort_values(by="OrderDate")["Year"].unique()
for year in years:
    print("Year " + str(year))
    products_year = mdf.groupby(by=["Year"]).get_group(year)
    print(products_year.groupby(by=["ProductId"]).count().sort_values(by=["Quantity", "ProductId"]).head(10)["OrderId"])
    print()



Year 2015




ProductId
48222     1
116532    1
Name: OrderId, dtype: int64

Year 2016
ProductId
2     1
9     1
16    1
17    1
18    1
38    1
39    1
43    1
50    1
56    1
Name: OrderId, dtype: int64

Year 2017
ProductId
231     1
429     1
441     1
1014    1
1342    1
1413    1
1621    1
1663    1
1675    1
1709    1
Name: OrderId, dtype: int64

Year 2018
ProductId
204     1
248     1
429     1
469     1
593     1
853     1
935     1
939     1
978     1
1019    1
Name: OrderId, dtype: int64

CPU times: total: 4.53 s
Wall time: 9 s


[2m[36m(apply_list_of_funcs pid=160480)[0m 

# Produkte, die sich in der Vorweihnachtszeit jedes Jahres am besten verkaufen

## Pandas

In [7]:
%%time
import datetime

df["Year"] = df["OrderDate"].dt.to_period("Y")
years = df.sort_values(by="OrderDate")["Year"].unique()

for year in years:
    print("Year " + str(year))
    products_year = df.groupby(by=["Year"]).get_group(year)
    startdate = datetime.datetime(year.year, 11, 16)
    enddate = datetime.datetime(year.year, 12, 23)

    mask = (products_year["OrderDate"] >= startdate) & (products_year["OrderDate"] <= enddate)
    products_christmas_year = products_year.loc[mask]
    print(products_christmas_year.groupby(by=["ProductId"]).count().sort_values(by="Quantity", ascending=False).head(3))


Year 2015
Empty DataFrame
Columns: [CustomerId, OrderId, ProductGroupId, Quantity, OrderDate, Year]
Index: []
Year 2016
           CustomerId  OrderId  ProductGroupId  Quantity  OrderDate  Year
ProductId                                                                
22307             818      818             818       818        818   818
21346             302      302             302       302        302   302
22275             258      258             258       258        258   258
Year 2017
           CustomerId  OrderId  ProductGroupId  Quantity  OrderDate  Year
ProductId                                                                
22307            1199     1199            1199      1199       1199  1199
21346             981      981             981       981        981   981
1289              372      372             372       372        372   372
Year 2018
           CustomerId  OrderId  ProductGroupId  Quantity  OrderDate  Year
ProductId                                     

## Modin


In [10]:
%%time
import datetime

mdf["Year"] = mdf["OrderDate"].dt.to_period("Y")
years = mdf.sort_values(by="OrderDate")["Year"].unique()

for year in years:
    print("Year " + str(year))
    products_year = mdf.groupby(by=["Year"]).get_group(year)
    startdate = datetime.datetime(year.year, 11, 16)
    enddate = datetime.datetime(year.year, 12, 23)

    mask = (products_year["OrderDate"] >= startdate) & (products_year["OrderDate"] <= enddate)
    products_christmas_year = products_year.loc[mask]
    print(products_christmas_year.groupby(by=["ProductId"]).count().sort_values(by="Quantity", ascending=False).head(3))



Year 2015




Empty DataFrame
Columns: [CustomerId, OrderId, ProductGroupId, Quantity, OrderDate, Year]
Index: []
Year 2016




           CustomerId  OrderId  ProductGroupId  Quantity  OrderDate  Year
ProductId                                                                
22307             818      818             818       818        818   818
21346             302      302             302       302        302   302
22275             258      258             258       258        258   258
Year 2017




           CustomerId  OrderId  ProductGroupId  Quantity  OrderDate  Year
ProductId                                                                
22307            1199     1199            1199      1199       1199  1199
21346             981      981             981       981        981   981
1289              372      372             372       372        372   372
Year 2018




           CustomerId  OrderId  ProductGroupId  Quantity  OrderDate  Year
ProductId                                                                
22307             643      643             643       643        643   643
40                437      437             437       437        437   437
682               408      408             408       408        408   408
CPU times: total: 5.8 s
Wall time: 8.89 s




# Jährliche Wachstumsrate


## Pandas

In [14]:
%%time

""" Jährliche Wachstumsrate gemessen an der Quantity """
df["Year"] = df["OrderDate"].dt.to_period("Y")
years = df.sort_values(by="OrderDate")["Year"].unique()
year_quantities = []

for year in years:
    products_year = df.groupby(by=["Year"]).get_group(year)

    quantities = products_year["Quantity"].sum()
    year_quantities.append((year.year, quantities))

print(year_quantities)
year_growthrate_quantities = []

prev = None
for i in range(len(year_quantities)):
    if prev is None:
        year_growthrate_quantities.append(
            ("Year: " + str(year_quantities[i][0]), "100%", "Sum of quantities: " + str(year_quantities[i][1])))
        prev = year_quantities[i]
    else:
        growthrate = (year_quantities[i][1] / prev[1] - 1) * 100
        year_growthrate_quantities.append(("Year: " + str(year_quantities[i][0]), str(int(growthrate)) + "%",
                                           "Sum of quantities: " + str(year_quantities[i][1])))
        prev = year_quantities[i]

print(year_growthrate_quantities)

[(2015, 244.0), (2016, 32456285.539999995), (2017, 800175941.4219998), (2018, 732054503.7000006)]
[('Year: 2015', '100%', 'Sum of quantities: 244.0'), ('Year: 2016', '13301656%', 'Sum of quantities: 32456285.539999995'), ('Year: 2017', '2365%', 'Sum of quantities: 800175941.4219998'), ('Year: 2018', '-8%', 'Sum of quantities: 732054503.7000006')]
CPU times: total: 2.39 s
Wall time: 2.68 s


## Modin


In [18]:
%%time

""" Jährliche Wachstumsrate gemessen an der Quantity """
mdf["Year"] = mdf["OrderDate"].dt.to_period("Y")
years = mdf.sort_values(by="OrderDate")["Year"].unique()
year_quantities = []

for year in years:
    products_year = mdf.groupby(by=["Year"]).get_group(year)

    quantities = products_year["Quantity"].sum()
    year_quantities.append((year.year, quantities))
    #print(products_year.groupby(by=["ProductId"]).sum().sort_values(by=["Quantity", "ProductId"]).head(10))

print(year_quantities)
year_growthrate_quantities = []

prev = None
for i in range(len(year_quantities)):
    if prev is None:
        year_growthrate_quantities.append(
            ("Year: " + str(year_quantities[i][0]), "100%", "Sum of quantities: " + str(year_quantities[i][1])))
        prev = year_quantities[i]
    else:
        growthrate = (year_quantities[i][1] / prev[1] - 1) * 100
        year_growthrate_quantities.append(("Year: " + str(year_quantities[i][0]), str(int(growthrate)) + "%",
                                           "Sum of quantities: " + str(year_quantities[i][1])))
        prev = year_quantities[i]

print(year_growthrate_quantities)



[(2015, 244.0), (2016, 32456285.54), (2017, 800175941.4219999), (2018, 732054503.7)]
[('Year: 2015', '100%', 'Sum of quantities: 244.0'), ('Year: 2016', '13301656%', 'Sum of quantities: 32456285.54'), ('Year: 2017', '2365%', 'Sum of quantities: 800175941.4219999'), ('Year: 2018', '-8%', 'Sum of quantities: 732054503.7')]
CPU times: total: 4.11 s
Wall time: 6.52 s
