In [1]:
# Parallel Computing

import multiprocessing as mp
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

# Data Ingestion
import os
import pandas as pd
# Text Processing
import re
from nltk.corpus import stopwords
import string

In [2]:
# Size the worker pool at twice the CPU count reported by multiprocessing.
n_workers = 2 * mp.cpu_count()

print(f"{n_workers} workers are available")

8 workers are available


In [3]:

%%time
file_name= r"C:\Users\canut\Desktop\archive\US_Accidents.csv"
df = pd.read_csv(file_name)

print(f"Shape:{df.shape}\n\nColumn Names:\n{df.columns}\n")

Shape:(2845342, 47)

Column Names:
Index(['ID', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat', 'Start_Lng',
       'End_Lat', 'End_Lng', 'Distance(mi)', 'Description', 'Number', 'Street',
       'Side', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone',
       'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)',
       'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity',
       'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
       'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
       'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')

CPU times: total: 31.8 s
Wall time: 34.8 s


In [9]:
# Replace df with a half-sized sample drawn WITH replacement (replace=True), so
# a row can appear more than once and index labels can repeat; random_state=1
# fixes the draw.
# NOTE(review): df is overwritten, so re-running this cell samples from the
# previous sample rather than from the loaded data.
df =df.sample(frac=0.5, replace=True, random_state=1)

In [10]:
# Lazily-built cache of the NLTK English stop words (filled on first use).
_STOP_WORDS = None
# Deletion table for every character in string.punctuation; built once.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _english_stop_words():
  """Return the NLTK English stop words as a frozenset, loading them once."""
  global _STOP_WORDS
  if _STOP_WORDS is None:
    # stopwords.words() is only called the first time; later calls reuse the
    # frozenset, which also makes each membership test a hash lookup.
    _STOP_WORDS = frozenset(stopwords.words("english"))
  return _STOP_WORDS


def clean_text(text):
  """Strip English stop words and punctuation from ``text``.

  Steps, in order:
    1. Split on whitespace and drop every token that exactly matches an
       NLTK English stop word (the comparison is case-sensitive).
    2. Delete every character listed in ``string.punctuation``.
    3. Collapse runs of spaces into a single space.

  Args:
    text: The string to clean. Any non-string value (for example the NaN
      pandas uses for a missing cell) is returned unchanged instead of
      raising, so one missing value does not abort a long run.

  Returns:
    The cleaned string, or ``text`` itself when it is not a string.
  """
  if not isinstance(text, str):
    return text
  # Remove stop words
  stops = _english_stop_words()
  text = " ".join(word for word in text.split() if word not in stops)
  # Remove Special Characters
  text = text.translate(_PUNCTUATION_TABLE)
  # Deleting a punctuation-only token leaves two adjacent spaces; squeeze them.
  return re.sub(' +', ' ', text)

In [None]:
%time
tqdm.pandas()
df['Description'] = df['Description'].progress_apply(clean_text)

df.head()

In [None]:
def text_parallel_clean(array):
  """Clean every text in ``array`` in parallel, one joblib task per element.

  Uses ``n_workers`` jobs on the "multiprocessing" backend and wraps the
  input in tqdm so that dispatching the tasks shows a progress bar.
  Returns whatever the Parallel call returns for the submitted tasks.
  """
  runner = Parallel(n_jobs=n_workers, backend="multiprocessing")
  tasks = (delayed(clean_text)(text) for text in tqdm(array))
  return runner(tasks)

In [7]:
%%time
# Time the parallel clean of the Description column and write the result
# back over the same column.
df['Description'] = text_parallel_clean(df['Description'])

In [None]:
def proc_batch(batch):
  """Run clean_text over each item of ``batch``; return the results in order."""
  cleaned = []
  for text in batch:
    cleaned.append(clean_text(text))
  return cleaned

In [None]:
def batch_file(array,n_workers):
  """Split ``array`` into at most ``n_workers`` contiguous batches.

  Args:
    array: Any sized object that supports slicing (list, pandas Series, ...).
    n_workers: Number of batches to aim for; must be at least 1.

  Returns:
    A list of slices of ``array`` that, concatenated in order, reproduce
    the input. An empty input gives an empty list.

  Raises:
    ValueError: If ``n_workers`` is smaller than 1.
  """
  if n_workers < 1:
    raise ValueError(f"n_workers must be at least 1, got {n_workers}")
  file_len = len(array)
  # Ceiling division, floored at 1. Rounding to nearest could give a batch
  # size of 0 for short inputs (a zero range step raises ValueError) or round
  # down and produce more batches than workers.
  batch_size = max(1, -(-file_len // n_workers))
  return [
      array[i:i + batch_size] for i in tqdm(range(0, file_len, batch_size))
  ]

# Pre-split the Description column into batches sized from n_workers; the
# result is kept in the notebook-level name `batches` for the cell that
# dispatches them.
batches = batch_file(df['Description'],n_workers)

In [None]:
%%time
batch_output = Parallel(n_jobs=n_workers,backend="multiprocessing")(
  delayed(proc_batch) (batch) for batch in tqdm(batches))

df['Description'] = [j for i in batch_output for j in i]

In [None]:
%%time
# NOTE(review): this import sits mid-notebook instead of in the first cell
# with the other imports.
from tqdm.contrib.concurrent import process_map
# Chunk size handed to process_map: row count divided by worker count,
# rounded to the nearest integer.
batch = round(len(df)/n_workers)

# Apply clean_text to the Description column through process_map with
# n_workers workers and the chunk size above, then overwrite the column.
df['Description'] = process_map(clean_text,df['Description'], max_workers=n_workers, chunksize=batch)