In [1]:
# Parallel Computing

import multiprocessing as mp
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

# Data Ingestion 
import pandas as pd
# Text Processing 
import re 
from nltk.corpus import stopwords
import string

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/dani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
n_workers =  mp.cpu_count()*2

print(f"{n_workers} workers are available")

8 workers are available


In [3]:
%%time

file_name= r"/home/dani/Desktop/US_Accidents.csv"
df = pd.read_csv(file_name)
df =df.sample(frac=0.5, replace=True, random_state=1)

print(f"Shape:{df.shape}\n\nColumn Names:\n{df.columns}\n")

Shape:(1422671, 47)

Column Names:
Index(['ID', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat', 'Start_Lng',
       'End_Lat', 'End_Lng', 'Distance(mi)', 'Description', 'Number', 'Street',
       'Side', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone',
       'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)',
       'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity',
       'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
       'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
       'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')

CPU times: user 1min, sys: 6.14 s, total: 1min 6s
Wall time: 4min 11s


In [4]:
def clean_text(text): 
  # Remove stop words
  stops = stopwords.words("english")
  text = " ".join([word for word in text.split() if word 
  not in stops])
  # Remove Special Characters
  text = text.translate(str.maketrans('', '', string.punctuation))
  return re.sub(' +',' ', text)

### Pandas operation

In [5]:
%%time
tqdm.pandas()
df['Description_P'] = df['Description'].progress_apply(clean_text)

df.head()

  0%|          | 0/1422671 [00:00<?, ?it/s]

CPU times: user 5min 1s, sys: 25.8 s, total: 5min 27s
Wall time: 5min 27s


Unnamed: 0,ID,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Description,...,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Description_P
128037,A-128038,2,2016-08-26 15:00:46,2016-08-26 21:00:46,37.64196,-122.09142,37.644005,-122.092616,0.156,Ramp to CA-92/Jackson St - Accident.,...,False,False,False,False,False,Day,Day,Day,Day,Ramp CA92Jackson St Accident
491755,A-491756,2,2021-07-17 10:32:00,2021-07-17 11:52:05,35.79134,-117.358959,35.788912,-117.360747,0.195,Incident on SEARLES ST near HOUSE 84500 Drive ...,...,False,False,False,False,False,Day,Day,Day,Day,Incident SEARLES ST near HOUSE 84500 Drive cau...
2568076,A-2568077,4,2018-11-02 00:44:01,2018-11-02 01:13:41,41.960677,-72.853934,41.961746,-72.857197,0.183,Closed between Higley Rd and CT-219/Barkhamste...,...,False,False,False,False,False,Night,Night,Night,Night,Closed Higley Rd CT219Barkhamsted Rd Road clos...
491263,A-491264,2,2021-08-14 03:09:00,2021-08-14 03:42:30,34.08302,-117.69505,34.087155,-117.677911,1.022,Slow traffic on San Bernardino Fwy E - I-10 E ...,...,False,False,False,False,False,Night,Night,Night,Night,Slow traffic San Bernardino Fwy E I10 E Monte ...
836489,A-836490,2,2021-12-02 12:45:06,2021-12-02 14:00:08,38.956156,-77.193235,38.952146,-77.196025,0.315,Incident on I-495 SB near MM 43 Expect delays.,...,False,False,False,False,False,Day,Day,Day,Day,Incident I495 SB near MM 43 Expect delays


### JOBlib

In [6]:
def text_parallel_clean(array):
  return Parallel(
      n_jobs=n_workers, backend="multiprocessing")(
          delayed(clean_text)(text) for text in tqdm(array))

In [7]:
%%time
df['Description'] = text_parallel_clean(df['Description'])

  0%|          | 0/1422671 [00:00<?, ?it/s]

CPU times: user 42.8 s, sys: 3.65 s, total: 46.5 s
Wall time: 2min 38s


### Joblib with batches

In [10]:
def proc_batch(batch):
  return [clean_text(text) for text in batch]

def batch_file(array,n_workers):
  file_len = len(array)
  batch_size = round(file_len / (2*n_workers))
  return [
      array[ix:ix + batch_size] for ix in tqdm(range(0, file_len, batch_size))]

batches = batch_file(df['Description_P'],n_workers)


  0%|          | 0/16 [00:00<?, ?it/s]

  array[ix:ix + batch_size] for ix in tqdm(range(0, file_len, batch_size))]


In [11]:
%%time
batch_output = Parallel(n_jobs=n_workers,backend="multiprocessing")(
  delayed(proc_batch) (batch) for batch in tqdm(batches))


df['Description_P'] = [j for i in batch_output for j in i]

  0%|          | 0/16 [00:00<?, ?it/s]

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



CPU times: user 2.63 s, sys: 958 ms, total: 3.59 s
Wall time: 2min 35s


### Concurrent

In [20]:
%%time
from tqdm.contrib.concurrent import process_map
# batch = round(len(df)/(n_workers*3))
batch = 500


df['Description'] = process_map(clean_text,df['Description'], max_workers=n_workers, chunksize=batch)

  0%|          | 0/1422671 [00:00<?, ?it/s]

CPU times: user 7.28 s, sys: 1.41 s, total: 8.69 s
Wall time: 2min 24s
