# Multi-Processing_And_Multi-threading in Python for ML

#    - Code - walkthrough , Introduction - Session.

In [1]:
!systeminfo


Host Name:                     DESKTOP-673QBFP
OS Name:                       Microsoft Windows 11 Pro
OS Version:                    10.0.26100 N/A Build 26100
OS Manufacturer:               Microsoft Corporation
OS Configuration:              Standalone Workstation
OS Build Type:                 Multiprocessor Free
Registered Owner:              omprakashbest3@gmail.com
Registered Organization:       N/A
Product ID:                    00330-53801-49489-AAOEM
Original Install Date:         12-08-2025, 08:30:11
System Boot Time:              11-09-2025, 09:52:46
System Manufacturer:           LENOVO
System Model:                  20VD
System Type:                   x64-based PC
Processor(s):                  1 Processor(s) Installed.
                               [01]: Intel64 Family 6 Model 140 Stepping 1 GenuineIntel ~2419 Mhz
BIOS Version:                  LENOVO F8CN54WW(V2.17), 17-03-2023
Windows Directory:             C:\WINDOWS
System Directory:              C:\WINDOWS\system3

# Mean of 100 Million Observations

In [2]:
# Generate random 100MM data points 
import numpy as np
n = 100000000
d = np.random.rand(n) # between range 0 to 1. float64
print(d.shape)
print(type(d.shape))

(100000000,)
<class 'tuple'>


In [3]:
import time 
def mean():

    # Sum using for loops. We can use inbuilt Numpy sum operation for better speed.
    sum = 0
    n = d.size
    for i in range(n):
        sum += d[i]

    # Mean 
    mean = sum/n
    return mean

# Time the execution
start_time = time.time() # time.time function is return current time in seconds.
m = mean() # compute mean of 100MM numbers.
end_time = time.time()
print(end_time - start_time)
print(m)

12.24590253829956
0.4999999117222156


# Multi - Processing Code:-

In [4]:
from multiprocessing import Process, Queue
import math

def mean_MP(s, e, q):
    # Sum using for loops, we can use inbuilt NumPy sum operations for better speed.
    sum = 0
    for i in range(s, e + 1):  
        sum += d[i]

    # Mean 
    mean = sum / (e - s + 1) 
    q.put(mean) # placing the mean in Queue
    return

n1 = math.floor(n/2)

q = Queue() # Queues are thread and process safe, for communicating between processes and thread

p1 = Process(target=mean_MP, args=(0, n1, q)) # core 1
p2 = Process(target=mean_MP, args=(n1 + 1, n - 1, q)) # core 2


# Time the executions

start_time2 = time.time()

p1.start()
p2.start()

p1.join() # wait till p1 finishes
p2.join()

m = 0
while not q.empty():
    m += q.get()

m /= 2

end_time2 = time.time()
print(end_time2 - start_time2)
print(m)

0.12935781478881836
0.0


# Multi - Threaded Core:-


In [5]:
from threading import Thread

means = [0, 0]

def mean_MT(s, e, threadNum):

    # Sum using for loops, we can use inbuilt Numpy sum operations for better speed
    sum = 0
    for i in range(s, e+1):
        sum  += d[i]

    # Mean 
    mean = sum /(e - s + 1)
    means[threadNum] = mean # means is a shared variables between the threads
    return

n1 = math.floor(n/2)

th1 = Thread(target=mean_MT, args=(0, n1, 0)) # third APRAM is the thread number 
th2 = Thread(target=mean_MT, args=(n1 + 1, n-1, 1))

# Time the execution
start_time = time.time()

th1.start() # start the thread 
th2.start()

th1.join() # wait till t1 finishes.
th2.join()

m = (means[0] + means[1]) / 2

end_time = time.time()
print(end_time - start_time)
print(m)

13.484285831451416
0.4999999117223269


# Joblib

# Caching of function output values 

In [6]:
from joblib import Memory
cachedir = './' # the current directory
mem = Memory(cachedir)

import numpy as np
a = np.vander(np.arange(3)).astype(np.float64)
square = mem.cache(np.square) # square is a inbuilt functions in numpy
b = square(a)

In [7]:
c = square(a)
# the above call did not trigger an evaluation, it's already execution once, not going to execution again. for saving time.

# Simple Parallel Programming/computing for Loops

In [8]:
import time 
from math import sqrt # inbuilt functions

def f(i):
    # some computations that take time 
    x = 10000
    p = 1
    for j in range(x):
        for k in range(j):
            p *= k
    return sqrt(i ** 2)

# find sqrt of first n numbers
n = 10 # calling function 10 time

# Time executions:- 
start_time = time.time()

for i in range(n):
    f(i)

end_time = time.time()
print(end_time - start_time)

13.645599126815796


In [9]:
# same functions using joblib..
from joblib import Parallel, delayed

start_time = time.time()
a = Parallel(n_jobs=2)(delayed(f)(i) for i in range(n))

# why we need delayed() 'function':

end_time = time.time()

print(end_time - start_time)

8.762311220169067


In [10]:
# Multi threading also using joblib : GIL is an issue
start_time = time.time()

a =  Parallel(n_jobs=2, prefer="threads")(delayed(f)(i ** 2) for i in range(n))

end_time = time.time()
print(end_time - start_time)

14.146161079406738


In [11]:
# 6 Jobs : means 6 cores or processor
from joblib import Parallel, delayed
start_time = time.time()

a = Parallel(n_jobs=6)(delayed(f)(i ** 2) for i in range(n))

end_time = time.time()
print(end_time - start_time)

4.2985992431640625
