In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import dask.array as da
from datetime import datetime

### Task 1
Wczytaj plik zamowienia.csv do ramki pandas, a następnie w kilku miejscach (ale nie w pierwszych 10 wierszach) wstaw wartość NaN, aby zasymulować wartości brakujące. Zapisz ramkę do pliku zamowienia_missing.csv. Wczytaj teraz plik do ramki Dask i sprawdź jakie typy danych zostały przydzielone. Czy zgadzają się z typami z oryginalnego pliku? Wykonaj dowolne obliczenia na całej ramce Dask, aby wymusić wywołanie .compute(). Czy pojawił się błąd dotyczący niespójności typów danych? Spróbuj uruchomić kilka razy funkcję wczytywania danych do ramki Dask dataframe z różnymi wartościami parametru samples. Dokumentacja dask.dataframe.read_csv(): https://docs.dask.org/en/stable/generated/dask.dataframe.read_csv.html

In [2]:
df = pd.read_csv('../../lab_01/examples/zamowienia.csv', header=0, sep=';')

In [3]:
df.head()

Unnamed: 0,Kraj,Sprzedawca,Data zamowienia,idZamowienia,Utarg
0,Polska,Kowalski,2003-07-16,10248,440.0
1,Polska,Sowiński,2003-07-10,10249,1863.4
2,Niemcy,Peacock,2003-07-12,10250,1552.6
3,Niemcy,Leverling,2003-07-15,10251,654.06
4,Niemcy,Peacock,2003-07-11,10252,3597.9


In [4]:
df.loc[15, 'Utarg'] = np.nan
df.loc[33, 'Kraj'] = np.nan
df.loc[33, 'Utarg'] = np.nan
df.loc[34, 'Utarg'] = np.nan
df.loc[35, 'Utarg'] = np.nan
df.loc[36, 'Utarg'] = np.nan
df.loc[37, 'Utarg'] = np.nan
df.loc[37, 'Kraj'] = np.nan
df.loc[36, 'Kraj'] = np.nan

In [5]:
df.to_csv('data/zamowienia_missing.csv', index=False)

In [6]:
dask_df = dd.read_csv('data/zamowienia_missing.csv')

In [7]:
print(dask_df.dtypes)

Kraj               string[pyarrow]
Sprzedawca         string[pyarrow]
Data zamowienia    string[pyarrow]
idZamowienia                 int64
Utarg                      float64
dtype: object


In [8]:
result = dask_df['Utarg'].sum().compute()
result

np.float64(1221883.17)

In [9]:
dask_df = dd.read_csv('data/zamowienia_missing.csv', sample=52)

In [10]:
print(dask_df.dtypes)

Kraj               string[pyarrow]
Sprzedawca         string[pyarrow]
Data zamowienia    string[pyarrow]
idZamowienia                 int64
Utarg                      float64
dtype: object


### Task 2
Ze strony https://docs.dask.org/en/stable/dashboard.html skonfiguruj plugin Dask dashboard dla Jupyter Lab i przetestuj jego działanie.

### Task 3
Skonfiguruj lokalny klaster (Client) tak, aby nie zaalokował wszystkich zasobów (np. zostaw 8 GB RAM dla systemu hosta + 2 rdzenie). Pobierz dane udostępnione na poprzednich zajęciach (https://huggingface.co/datasets/vargr/private_instagram/tree/main/data) i załaduj do ramki Dask tyle części ile zdołasz w formie bez optymalizacji. Zmierz czas tej operacji.

In [11]:
from dask.distributed import LocalCluster, Client

cluster = LocalCluster(
    n_workers=1,
    threads_per_worker=4,
    memory_limit='6GB',
)

client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 1
Total threads: 4,Total memory: 5.59 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:50542,Workers: 1
Dashboard: http://127.0.0.1:8787/status,Total threads: 4
Started: Just now,Total memory: 5.59 GiB

0,1
Comm: tcp://127.0.0.1:50549,Total threads: 4
Dashboard: http://127.0.0.1:50550/status,Memory: 5.59 GiB
Nanny: tcp://127.0.0.1:50545,
Local directory: C:\Users\Adam\AppData\Local\Temp\dask-scratch-space\worker-zrzvrhpg,Local directory: C:\Users\Adam\AppData\Local\Temp\dask-scratch-space\worker-zrzvrhpg


In [12]:
%%time
dask_df = dd.read_parquet(['../../lab_01/tasks/data/0000.parquet', '../../lab_01/tasks/data/0001.parquet'])

CPU times: total: 62.5 ms
Wall time: 100 ms


### Task 4
Wykonaj kilka operacji na klastrze lokalnym z danymi z zadania 3:
* wyświetl top 10 użytkowników z najwyższą liczbą like'ów,
* pobierz dane tylko za pierwsze półrocze 2019 roku.

Każdorazowo zmierz i wyświetl czas operacji i obserwuj dashboard.

In [13]:
dask_df.head()

Unnamed: 0,sid,sid_profile,post_id,profile_id,date,post_type,description,likes,comments,username,bio,following,followers,num_posts,is_business_account,lang,category
0,28370919,3496776,BXdjjUlgcgq,2237947779,2017-08-06 20:06:57,2,Wreckloose! Deevalley bike park laps on the @i...,80,0,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,travel_&_adventure
1,13623950,3496776,BeyPed5hKj9,2237947779,2018-02-04 19:35:20,1,The dirty south was prime today. Top day with ...,86,2,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,diaries_&_daily_life
2,28370905,3496776,Bunhd1DFVAG,2237947779,2019-03-05 08:03:11,1,Tech Tuesday. Been flat out on the tools. Got ...,168,3,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,science_&_technology
3,28370907,3496776,Bppi85gliQK,2237947779,2018-11-01 20:17:41,1,"On the tools, my favourite wheel builds @stans...",102,2,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,diaries_&_daily_life
4,32170690,3496776,BuDfIyslzfw,2237947779,2019-02-19 08:10:11,1,Solid effort on the bar turn.\nFully turned.\n...,145,2,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,diaries_&_daily_life


In [14]:
%%time
top_users = dask_df.groupby('profile_id')['likes'].sum().nlargest(10).compute()
print(f'Top 10 users: {top_users}')

Top 10 users: profile_id
25025320     29864166
253625977    16457870
10245870     15019135
907025384    13352324
19769622     11709658
9446524      10109849
50417061      9253425
369719055     9104884
6849281       7378589
9429520       6261284
Name: likes, dtype: int64
CPU times: total: 0 ns
Wall time: 857 ms


In [15]:
%%time
filtered_df = dask_df[(dask_df['date'] >= '2019-01-01') & (dask_df['date'] < '2019-07-01')].compute()
filtered_df.head()

CPU times: total: 1.05 s
Wall time: 6.76 s


Unnamed: 0,sid,sid_profile,post_id,profile_id,date,post_type,description,likes,comments,username,bio,following,followers,num_posts,is_business_account,lang,category
2,28370905,3496776,Bunhd1DFVAG,2237947779,2019-03-05 08:03:11,1,Tech Tuesday. Been flat out on the tools. Got ...,168,3,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,science_&_technology
4,32170690,3496776,BuDfIyslzfw,2237947779,2019-02-19 08:10:11,1,Solid effort on the bar turn.\nFully turned.\n...,145,2,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,diaries_&_daily_life
5,14315358,3496776,BxJsMDpA2yH,2237947779,2019-05-07 08:33:51,1,Annual springtime flora picture.\nTurn bars in...,124,2,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,arts_&_culture
6,8304346,3496776,Bt5LFpZlm3z,2237947779,2019-02-15 08:02:35,1,Laps in spring like conditions. Getting these ...,150,3,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,sports
7,14315346,3496776,BxZIzaQhS-o,2237947779,2019-05-13 08:32:30,1,Cheers Scotland 🏴󠁧󠁢󠁳󠁣󠁴󠁿 See you in a few weeks...,166,2,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,sports


### Task 5
Wczytaj te same dane do ramki Dask co w zadaniu 3, ale podaj typy danych, które zostały wybrane w procesie optymalizacji wykonanej w zadaniach z lab 01. Porównaj czas ładowania z zadaniem 3. Wykonaj również te same operacje co w zadaniu 4 i porównaj czas. Śledź wykonanie zadań patrząć na graf wywołań.

In [20]:
dask_df.dtypes

sid                              int64
sid_profile                      int64
post_id                string[pyarrow]
profile_id                       int64
date                   string[pyarrow]
post_type                        int64
description            string[pyarrow]
likes                            int64
comments                         int64
username               string[pyarrow]
bio                    string[pyarrow]
following                        int64
followers                        int64
num_posts                        int64
is_business_account               bool
lang                   string[pyarrow]
category               string[pyarrow]
dtype: object

In [21]:
dask_df['sid'] = dask_df['sid'].astype(np.int32)
dask_df['sid_profile'] = dask_df['sid_profile'].astype(np.int32)
dask_df['post_id'] = dask_df['post_id'].astype('string')
dask_df['profile_id'] = dask_df['profile_id'].astype('string')
dask_df['date'] = dd.to_datetime(dask_df['date'])
dask_df['post_type'] = dask_df['post_type'].astype('category')
dask_df['description'] = dask_df['description'].astype('string')
dask_df['likes'] = dask_df['likes'].astype(np.int32)
dask_df['comments'] = dask_df['comments'].astype(np.int16)
dask_df['username'] = dask_df['username'].astype('category')
dask_df['bio'] = dask_df['bio'].astype('category')
dask_df['following'] = dask_df['following'].astype(np.int32)
dask_df['followers'] = dask_df['followers'].astype(np.int32)
dask_df['num_posts'] = dask_df['num_posts'].astype(np.int32)
dask_df['is_business_account'] = dask_df['is_business_account'].astype('bool')
dask_df['lang'] = dask_df['lang'].astype('category')
dask_df['category'] = dask_df['category'].astype('category')

In [22]:
dask_df.dtypes

sid                             int32
sid_profile                     int32
post_id                string[python]
profile_id             string[python]
date                   datetime64[ns]
post_type                    category
description            string[python]
likes                           int32
comments                        int16
username                     category
bio                          category
following                       int32
followers                       int32
num_posts                       int32
is_business_account              bool
lang                         category
category                     category
dtype: object

In [23]:
dask_df.head()

Unnamed: 0,sid,sid_profile,post_id,profile_id,date,post_type,description,likes,comments,username,bio,following,followers,num_posts,is_business_account,lang,category
0,28370919,3496776,BXdjjUlgcgq,2237947779,2017-08-06 20:06:57,2,Wreckloose! Deevalley bike park laps on the @i...,80,0,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,travel_&_adventure
1,13623950,3496776,BeyPed5hKj9,2237947779,2018-02-04 19:35:20,1,The dirty south was prime today. Top day with ...,86,2,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,diaries_&_daily_life
2,28370905,3496776,Bunhd1DFVAG,2237947779,2019-03-05 08:03:11,1,Tech Tuesday. Been flat out on the tools. Got ...,168,3,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,science_&_technology
3,28370907,3496776,Bppi85gliQK,2237947779,2018-11-01 20:17:41,1,"On the tools, my favourite wheel builds @stans...",102,2,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,diaries_&_daily_life
4,32170690,3496776,BuDfIyslzfw,2237947779,2019-02-19 08:10:11,1,Solid effort on the bar turn.\nFully turned.\n...,145,2,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,diaries_&_daily_life


In [24]:
%%time
top_users = dask_df.groupby('profile_id')['likes'].sum().nlargest(10).compute()
print(f'Top 10 users: {top_users}')

Top 10 users: profile_id
25025320     29864166
253625977    16457870
19769622     11709658
369719055     9104884
9429520       6261284
25236007      6146072
16668152      5829086
627934998     4991864
189379530     4756493
268167552     4693524
Name: likes, dtype: int32
CPU times: total: 62.5 ms
Wall time: 1.56 s


In [25]:
%%time
filtered_df = dask_df[(dask_df['date'] >= '2019-01-01') & (dask_df['date'] < '2019-07-01')].compute()
filtered_df.head()

CPU times: total: 781 ms
Wall time: 8.28 s


Unnamed: 0,sid,sid_profile,post_id,profile_id,date,post_type,description,likes,comments,username,bio,following,followers,num_posts,is_business_account,lang,category
2,28370905,3496776,Bunhd1DFVAG,2237947779,2019-03-05 08:03:11,1,Tech Tuesday. Been flat out on the tools. Got ...,168,3,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,science_&_technology
4,32170690,3496776,BuDfIyslzfw,2237947779,2019-02-19 08:10:11,1,Solid effort on the bar turn.\nFully turned.\n...,145,2,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,diaries_&_daily_life
5,14315358,3496776,BxJsMDpA2yH,2237947779,2019-05-07 08:33:51,1,Annual springtime flora picture.\nTurn bars in...,124,2,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,arts_&_culture
6,8304346,3496776,Bt5LFpZlm3z,2237947779,2019-02-15 08:02:35,1,Laps in spring like conditions. Getting these ...,150,3,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,sports
7,14315346,3496776,BxZIzaQhS-o,2237947779,2019-05-13 08:32:30,1,Cheers Scotland 🏴󠁧󠁢󠁳󠁣󠁴󠁿 See you in a few weeks...,166,2,andylund_,"Professional Bicycle technician, Intense Racin...",520,1204,494,False,en,sports


### Task 6
Podziel tablicę darr z przykładów na inne liczby chunków (eksperymentuj) i wykonaj te same obliczenie (średnia). Dla każdej liczby chunków wypisz czas obliczeń (wykonaj to samo obliczenie minimum 10 razy, aby nieco uwiarygodnić wyniki i uśrednij) i porównaj wyniki. Napisz wniosek o wynikach swoich eksperymentów i automatycznego podziału na chunki. Czy udało Ci się osiągnąć lepszą wydajność niż przy domyślnych ustawieniach?

In [5]:
times = []
for _ in range(10):
    darr = da.random.normal(5, 0.2, size=(20_000, 20_000))
    darr_mean = darr.mean(axis=0)

    start_time = datetime.now()
    darr_mean.compute()
    times.append(datetime.now() - start_time)

avg_time = np.mean(times)
print(f'Average time for default settings {avg_time}')

Average time for default settings 0:00:03.463077


In [6]:
chunk_sizes = [1000, 1024, 2000, 2048, 4000, 4096, 8000, 8192, 16_000, 16_384]
results = {}

for chunk_size in chunk_sizes:
    times = []
    for _ in range(10):
        darr = da.random.normal(5, 0.2, size=(20_000, 20_000), chunks=(chunk_size, chunk_size))
        darr_mean = darr.mean(axis=0)

        start_time = datetime.now()
        darr_mean.compute()
        times.append(datetime.now() - start_time)

    avg_time = np.mean(times)
    results[chunk_size] = avg_time

results_df = pd.DataFrame(list(results.items()), columns=['Chunk Size', 'Average Time (s)'])
results_df

Unnamed: 0,Chunk Size,Average Time (s)
0,1000,0 days 00:00:02.919994
1,1024,0 days 00:00:02.671516
2,2000,0 days 00:00:02.600619
3,2048,0 days 00:00:02.581085
4,4000,0 days 00:00:02.879106
5,4096,0 days 00:00:02.759572
6,8000,0 days 00:00:03.297687
7,8192,0 days 00:00:03.338199
8,16000,0 days 00:00:08.519075
9,16384,0 days 00:00:08.401654
