In [1]:
from threading import Thread, Lock, current_thread
import concurrent.futures
import pandas as pd
import numpy as np
import time

### Thread: 
    - An entity within a process that can be scheduled (also known as "lightweight process")
    - A process can spawn multiple threads

#### Advantages:
    - All threads within a process share the same memory
    - lightweight
    - Starting a thread is faster than starting a process
    - Great for I/O-bound tasks


#### Disadvantages:
    - Threading is limited by GIL - only one thread at a time
    - No effect for COU-bound tasks
    - Not interruptable/killable
    - Careful with race conditions

### GIL: (global interpreter lock)
    - A lock that allows only one thread at a time to execute in Python 
    - Needed in CPython because memory management is not thread-safe

In [2]:
df = pd.DataFrame(columns=[*'abc'])
def random_func(arg):
    global df
    time.sleep(0.5)
    rand_int = np.random.randint(low=0, high=arg, size=3)
    rand_df = pd.DataFrame.from_records([rand_int], columns=[*'abc'])
    df = pd.concat([df, rand_df], ignore_index=True)

In [3]:
num_threads = 10
threads = []

for num in range(1, num_threads + 1):
    thread = Thread(target=random_func, args=[num])
    threads.append(thread)

# start threads
for thread in threads:
    thread.start()

# join threads (wait for all threads to complete)
for thread in threads:
    thread.join()

df

Unnamed: 0,a,b,c
0,0,1,0
1,0,0,0
2,4,3,0
3,6,1,1
4,1,0,2
5,0,0,2
6,0,4,2
7,2,0,0
8,4,4,4
9,0,0,8


In [4]:
df = pd.DataFrame(columns=[*'abc'])
def random_func(arg):
    global df
    time.sleep(1)
    rand_int = np.random.randint(low=0, high=arg, size=3)
    rand_df = pd.DataFrame.from_records([rand_int], columns=[*'abc'])
    df = pd.concat([df, rand_df], ignore_index=True)
    

In [5]:
my_list = list(range(100))
start = time.perf_counter()
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    for item in my_list:
        executor.submit(random_func, item)
end = time.perf_counter()
print(end-start)
df.head(5)

10.113954800006468


Unnamed: 0,a,b,c
0,0,0,0
1,0,1,0
2,3,3,2
3,2,2,2
4,1,4,3


In [6]:
my_list = list(range(100))
start = time.perf_counter()
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    executor.map(random_func, my_list)
end = time.perf_counter()
print(end-start)
df.head(5)

10.051498999993782


Unnamed: 0,a,b,c
0,0,0,0
1,0,1,0
2,3,3,2
3,2,2,2
4,1,4,3


### Race conditions

In [7]:
db_value = 0

def increase_db_value():
    global db_value

    local_copy = db_value
    # processing
    local_copy += 1
    time.sleep(0.1)
    db_value = local_copy

print('start value: ', db_value)
thread1 = Thread(target=increase_db_value)
thread2 = Thread(target=increase_db_value)

thread1.start()
thread2.start()

thread1.join()
thread2.join()

print('end value: ', db_value)

start value:  0
end value:  1


#### Using lock to mitigate race conditions

In [8]:

db_value = 0

def increase_db_value(lock):
    global db_value

    lock.acquire()
    local_copy = db_value
    # processing
    local_copy += 1
    time.sleep(0.1)
    db_value = local_copy
    lock.release()

# we can use a context manager instead
def increase_db_value(lock):
    global db_value

    with lock:
        local_copy = db_value
        # processing
        local_copy += 1
        time.sleep(0.1)
        db_value = local_copy

lock = Lock()
print('start value: ', db_value)
thread1 = Thread(target=increase_db_value, args=(lock,))
thread2 = Thread(target=increase_db_value, args=(lock,))

thread1.start()
thread2.start()

thread1.join()
thread2.join()

print('end value: ', db_value)


start value:  0
end value:  2


### Using Queue

In [9]:
from queue import Queue

In [16]:
def worker(q, lock):
    while True:
        value = q.get()
        # processing
        with lock:
            print(f'{current_thread().name}: {value}')
        q.task_done()

q = Queue()
lock = Lock()

num_threads = 5

for i in range(num_threads):
    thread = Thread(target=worker, args=(q, lock))
    # a daemon thread is a background thread that dies when the main thread dies
    # this is how the program is able to end even though there is an infinite loop
    # in the worker function. 
    thread.daemon = True
    thread.start()

for i in range(1, 21):
    q.put(i)

q.join()

Thread-59: 1, 86007.3138789
Thread-59: 2, 86007.3199749
Thread-60: 3, 86007.3212904
Thread-62: 4, 86007.3258112
Thread-58: 5, 86007.3302347
Thread-61: 6, 86007.3331677
Thread-59: 7, 86007.3376488
Thread-60: 8, 86007.3379195
Thread-62: 9, 86007.3382379
Thread-58: 10, 86007.3386254
Thread-61: 11, 86007.3388445
Thread-59: 12, 86007.3391299
Thread-60: 13, 86007.3394554
Thread-62: 14, 86007.3397468
Thread-58: 15, 86007.3401143
Thread-61: 16, 86007.340393
Thread-59: 17, 86007.3405902
Thread-60: 18, 86007.3408806
Thread-62: 19, 86007.3411556
Thread-58: 20, 86007.3414454
