In [3]:
from sklearn.datasets import make_classification
import pandas as pd
from random import random 
import numpy as np 
import os

from joblib import Parallel, delayed
import multiprocessing
from multiprocessing import  Pool

# Parameters of the synthetic dataset:
n_samples = 2500
n_features = 50
n_informative = 12
n_redundant = 10
n_classes = 2
# Fixed seed so that every run of the notebook benchmarks the same data.
random_state = 42

# make_classification returns a (features, labels) pair; only the feature
# matrix (df[0]) is used below.
df = make_classification(n_samples=n_samples,
                         n_features=n_features,
                         n_informative=n_informative,
                         n_redundant=n_redundant,
                         n_classes=n_classes,
                         random_state=random_state)

# Wrap the feature matrix in a DataFrame with columns named col0 .. col<N-1>.
pandas_df = pd.DataFrame(df[0])
pandas_df.columns = [f"col{x}" for x in range(pandas_df.shape[1])]

pandas_df.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,...,col40,col41,col42,col43,col44,col45,col46,col47,col48,col49
0,-0.009464,-1.460358,0.063422,-11.770144,-0.08253,-0.286216,-0.253751,2.760976,5.026142,-0.084469,...,-0.802004,-0.821326,0.611581,-0.562413,-2.77573,1.792035,-2.192888,-0.862928,-0.142954,-0.190826
1,0.668427,0.859396,0.526045,-5.100497,0.431506,0.054752,-5.666089,2.613562,8.478456,-0.716214,...,0.818855,5.33275,0.292636,-0.056558,-0.844323,0.674941,-2.219548,0.930998,0.092781,0.432353
2,0.715195,1.947947,-0.005915,-0.806689,1.612645,-0.3176,-1.439803,2.683543,3.520398,1.1383,...,-0.883868,1.845544,0.932884,0.847298,0.51895,-2.278868,0.811182,-0.727667,0.695003,0.738831
3,-0.086437,-0.252499,-0.117254,-3.0924,-0.334766,-1.161551,0.280496,-5.294142,0.480085,-0.64597,...,0.995946,-3.117506,0.737141,0.453879,-3.62682,0.389112,0.55012,-1.125876,-0.099176,0.924793
4,-0.179497,4.411676,1.278154,2.71138,-0.895123,-0.22296,3.970051,-4.227705,-0.710402,0.780701,...,-0.877904,-3.307484,0.87367,-1.073481,-1.057268,-0.805906,1.686013,-2.688898,0.844989,-0.605183


In [4]:
#pandas
%%time
pandas_df['col0'] = pandas_df['col0'].apply(lambda x: x + 100.0 + random())
print(pandas_df.head())

         col0      col1      col2       col3      col4      col5      col6  \
0  100.430251 -1.460358  0.063422 -11.770144 -0.082530 -0.286216 -0.253751   
1  101.565343  0.859396  0.526045  -5.100497  0.431506  0.054752 -5.666089   
2  100.916963  1.947947 -0.005915  -0.806689  1.612645 -0.317600 -1.439803   
3  100.314289 -0.252499 -0.117254  -3.092400 -0.334766 -1.161551  0.280496   
4  100.636079  4.411676  1.278154   2.711380 -0.895123 -0.222960  3.970051   

       col7      col8      col9  ...     col40     col41     col42     col43  \
0  2.760976  5.026142 -0.084469  ... -0.802004 -0.821326  0.611581 -0.562413   
1  2.613562  8.478456 -0.716214  ...  0.818855  5.332750  0.292636 -0.056558   
2  2.683543  3.520398  1.138300  ... -0.883868  1.845544  0.932884  0.847298   
3 -5.294142  0.480085 -0.645970  ...  0.995946 -3.117506  0.737141  0.453879   
4 -4.227705 -0.710402  0.780701  ... -0.877904 -3.307484  0.873670 -1.073481   

      col44     col45     col46     col47     col4

In [6]:
# Dask: wrap the pandas frame in a Dask dataframe made of 20 partitions.
import dask.dataframe as ddf

# persist() is called here, outside the timed cell (presumably so that the
# pandas -> Dask conversion is not part of the measured time).
dask_df = ddf.from_pandas(pandas_df, npartitions=20).persist()

In [7]:
%%time 

dask_df['col1'] = dask_df['col1'] + 100.0 + random()
print(dask_df.compute().head())

         col0        col1      col2      col3      col4      col5      col6  \
0  101.875949  100.238201 -0.075026  2.624420 -3.704402 -1.918315  0.089059   
1   99.611722   99.307375 -2.174225 -0.917264 -0.089184 -1.153454 -0.448978   
2  101.205693   99.502391  1.081483  3.601620  4.672138 -1.840928 -2.557463   
3  100.838625  100.585321 -1.073540 -1.551024  0.000054 -1.387267  1.253123   
4   98.084737  102.016249  0.788724 -1.612044 -2.351790 -0.262833 -0.403415   

       col7      col8      col9  ...     col40     col41     col42     col43  \
0  5.133089  0.553818  1.661442  ... -1.144562 -0.562258 -5.645755 -1.282278   
1 -3.372962 -0.721857 -0.046377  ... -0.299749 -0.778446  1.709486 -1.171068   
2 -1.997149 -1.557514  0.423847  ...  1.125930  0.694730 -0.779041 -0.026908   
3  6.175670 -0.758950  0.354613  ...  1.275407  1.308126  1.004263 -1.139492   
4  2.797223 -1.479554 -0.878193  ... -0.225395 -1.175926 -1.584909  0.235515   

      col44     col45     col46      col47  

In [1]:
pip install modin

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import os

# Select the Dask engine before Modin is imported, which is the order shown
# in Modin's documentation.
os.environ["MODIN_ENGINE"] = "dask"

# Import Modin only under its own alias: binding it to `pd` as well would
# silently replace pandas for every later cell that calls pd.<something>.
import modin.pandas as mpd

# Build the Modin frame from the existing pandas frame.
modin_df = mpd.DataFrame(pandas_df)


    from distributed import Client

    client = Client()



In [6]:
%%time 

modin_df['col2'] = modin_df['col2'] + 100.0 + random()
print(modin_df.head())

         col0      col1        col2       col3      col4      col5      col6  \
0  100.430251 -1.460358  100.068096 -11.770144 -0.082530 -0.286216 -0.253751   
1  101.565343  0.859396  100.530718  -5.100497  0.431506  0.054752 -5.666089   
2  100.916963  1.947947   99.998758  -0.806689  1.612645 -0.317600 -1.439803   
3  100.314289 -0.252499   99.887420  -3.092400 -0.334766 -1.161551  0.280496   
4  100.636079  4.411676  101.282827   2.711380 -0.895123 -0.222960  3.970051   

       col7      col8      col9  ...     col40     col41     col42     col43  \
0  2.760976  5.026142 -0.084469  ... -0.802004 -0.821326  0.611581 -0.562413   
1  2.613562  8.478456 -0.716214  ...  0.818855  5.332750  0.292636 -0.056558   
2  2.683543  3.520398  1.138300  ... -0.883868  1.845544  0.932884  0.847298   
3 -5.294142  0.480085 -0.645970  ...  0.995946 -3.117506  0.737141  0.453879   
4 -4.227705 -0.710402  0.780701  ... -0.877904 -3.307484  0.873670 -1.073481   

      col44     col45     col46     co

In [2]:
def parallelize_dataframe(df, func, num_cores=2):
    """Apply ``func`` to row-wise chunks of ``df`` in a pool of processes.

    Args:
        df: DataFrame to process. It is split by row position into
            ``num_cores`` consecutive chunks of near-equal size.
        func: Callable that receives one chunk and returns a pandas object.
            It is sent to worker processes, so it must be picklable.
        num_cores: Number of chunks and of worker processes.

    Returns:
        The per-chunk results concatenated with ``pd.concat`` in chunk order.

    Raises:
        ValueError: If ``num_cores`` is not a positive number of sections.
    """
    # Split the row positions rather than the frame itself: handing a
    # DataFrame to np.array_split goes through DataFrame.swapaxes, which is
    # deprecated in recent pandas releases. The chunk sizes are the same.
    row_groups = np.array_split(np.arange(len(df)), num_cores)
    chunks = [df.iloc[rows] for rows in row_groups]
    # map() blocks until every chunk has been processed, and leaving the
    # ``with`` block shuts the pool down, so no explicit close()/join() is
    # needed.
    with Pool(num_cores) as pl:
        return pd.concat(pl.map(func, chunks))

def simple_transformation(df): 
    """Return a copy of ``df`` whose ``col3`` is shifted by 100 plus noise.

    Every value of ``col3`` becomes ``x + 100.0 + random()``, with a fresh
    ``random()`` draw (a float in [0, 1)) for each row.

    Args:
        df: DataFrame containing a numeric ``col3`` column.

    Returns:
        A new DataFrame with the same columns; only ``col3`` differs.

    Raises:
        KeyError: If ``df`` has no ``col3`` column.
    """
    shifted = df['col3'].apply(lambda x: x + 100.0 + random())
    # assign() builds a new frame instead of overwriting the column on the
    # caller's object, so the input chunk is left untouched and no
    # SettingWithCopyWarning is raised when the chunk is a slice.
    return df.assign(col3=shifted)

In [3]:
%%time

num_cores = multiprocessing.cpu_count()
print(f"nb of cores used {num_cores}")
pool_df = parallelize_dataframe(pandas_df, simple_transformation, num_cores=num_cores)
pool_df.head()

nb of cores used 2
CPU times: user 29.4 ms, sys: 25 ms, total: 54.4 ms
Wall time: 117 ms


Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,...,col40,col41,col42,col43,col44,col45,col46,col47,col48,col49
0,1.038743,-0.702637,-0.075026,102.89863,-3.704402,-1.918315,0.089059,5.133089,0.553818,1.661442,...,-1.144562,-0.562258,-5.645755,-1.282278,0.579737,-2.696446,-7.937043,-11.85236,0.195557,-0.133243
1,-0.763703,-1.633462,-2.174225,99.167273,-0.089184,-1.153454,-0.448978,-3.372962,-0.721857,-0.046377,...,-0.299749,-0.778446,1.709486,-1.171068,0.010046,3.890604,-0.021986,1.332187,-0.060251,0.055416
2,1.031059,-1.438446,1.081483,104.182095,4.672138,-1.840928,-2.557463,-1.997149,-1.557514,0.423847,...,1.12593,0.69473,-0.779041,-0.026908,0.16907,-4.882541,0.426914,2.935354,-0.245684,-4.196912
3,0.375474,-0.355517,-1.07354,99.046013,5.4e-05,-1.387267,1.253123,6.17567,-0.75895,0.354613,...,1.275407,1.308126,1.004263,-1.139492,1.226209,0.314943,1.963744,0.605683,-0.541341,-1.616178
4,-1.920057,1.075412,0.788724,99.237245,-2.35179,-0.262833,-0.403415,2.797223,-1.479554,-0.878193,...,-0.225395,-1.175926,-1.584909,0.235515,2.369038,-1.822743,-0.736763,-3.697294,2.859466,-0.296855


Joblib

In [4]:
%%time 
df2s = np.array_split(pandas_df, num_cores)
results = Parallel(n_jobs=num_cores, prefer="threads")(delayed(simple_transformation)(x) for x in df2s )
df3 = pd.concat(results)
df3.head()

CPU times: user 23 ms, sys: 4.43 ms, total: 27.5 ms
Wall time: 38.3 ms


Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,...,col40,col41,col42,col43,col44,col45,col46,col47,col48,col49
0,1.038743,-0.702637,-0.075026,102.822652,-3.704402,-1.918315,0.089059,5.133089,0.553818,1.661442,...,-1.144562,-0.562258,-5.645755,-1.282278,0.579737,-2.696446,-7.937043,-11.85236,0.195557,-0.133243
1,-0.763703,-1.633462,-2.174225,100.032593,-0.089184,-1.153454,-0.448978,-3.372962,-0.721857,-0.046377,...,-0.299749,-0.778446,1.709486,-1.171068,0.010046,3.890604,-0.021986,1.332187,-0.060251,0.055416
2,1.031059,-1.438446,1.081483,103.793463,4.672138,-1.840928,-2.557463,-1.997149,-1.557514,0.423847,...,1.12593,0.69473,-0.779041,-0.026908,0.16907,-4.882541,0.426914,2.935354,-0.245684,-4.196912
3,0.375474,-0.355517,-1.07354,98.902679,5.4e-05,-1.387267,1.253123,6.17567,-0.75895,0.354613,...,1.275407,1.308126,1.004263,-1.139492,1.226209,0.314943,1.963744,0.605683,-0.541341,-1.616178
4,-1.920057,1.075412,0.788724,99.270163,-2.35179,-0.262833,-0.403415,2.797223,-1.479554,-0.878193,...,-0.225395,-1.175926,-1.584909,0.235515,2.369038,-1.822743,-0.736763,-3.697294,2.859466,-0.296855
