* Visualization of these concepts: https://visualgo.net/en

* https://www.geeksforgeeks.org/ - many implementations

In [None]:
# Selection sort was
# starting from one end or our sequence
# select min or max out of unsorted and place in the sequence
# keep going backwards or forwards
# problem was that min (or max) call would be linear
# min or max call would need to go through all the unsorted values each time
# what if we had a data structure which lets us get the min or max in better than linear time?
# turns out there is such a structure - heap

## Heap sort
* Selection sort where we utilize a better structure for storing max(or min values)
* https://www.cs.usfca.edu/~galles/visualization/Heap.html

* https://www.cs.usfca.edu/~galles/visualization/HeapSort.html

In [None]:
## Heap sort is a selection algorithm
# we go through our iterable and select the min or max value out of the unsorted
# the only improvement is that we use a more efficient data structure for storing the min or max values(we have to pick side)
# the problem was that we had to go through all the unsorted values constantly and look for that min or max
# that's where O(n^2) comes from

In [None]:
## https://www.cs.usfca.edu/~galles/visualization/Heap.html - heap data structure visualization

In [1]:
import heapq #we are using an existing library of heap data structure
def simple_heap_sort(iterable):
    heapq.heapify(iterable) # guaranteed linear time by the library but it is IN PLACE
    # even if heapify was O(n log n) it would still be good for our purpose
    return [heapq.heappop(iterable) for i in range(len(iterable))] # so no IndexErrors 
# heappop call is O(log n) so we get our O(n log n)
# complexity comes from the single [heapq.heappop(iterable) for i in range(len(iterable))]

In [2]:
simple_heap_sort([1,34,6,21,6,1,21,656,6,2,7,0,-33,-2,5])

[-33, -2, 0, 1, 1, 2, 5, 6, 6, 6, 7, 21, 21, 34, 656]

In [None]:
# so our simple heap sort algorithms nicely but in the process of heapify IN PLACE we destroy the original order

In [3]:
import random
r10k = [random.randint(1,1_000_000) for n in range(10_000)]
r100k = [random.randint(1,1_000_000) for n in range(100_000)]
r1m = [random.randint(1,10_000_000) for n in range(1_000_000)]

In [4]:
# we could use already provided heap data structure in many languages
# in Python https://docs.python.org/3/library/heapq.html
# import heapq
def heapsort(iterable):
    h = [] # we are going to put our heap in a new structure
    # so making our heap will be O(n log n) which is fine for our purpose
    for value in iterable:
         heapq.heappush(h, value)
    return [heapq.heappop(h) for i in range(len(h))]

In [5]:
heapsort([1,3,6,21,2,3,67,-3,7])

[-3, 1, 2, 3, 3, 6, 7, 21, 67]

In [6]:
nlist = heapsort(r100k)
nlist[:10]

[12, 14, 19, 20, 21, 23, 26, 32, 37, 63]

In [7]:
r100k[:5] # so our original numbers are still unsorted before running our timeit

[28582, 895931, 842170, 419497, 623143]

In [8]:
%%timeit
heapsort(r10k)

6.25 ms ± 96.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [9]:
%%timeit
sorted(r10k)

1.67 ms ± 171 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [10]:
%%timeit
heapsort(r100k)

120 ms ± 27.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
%%timeit
sorted(r100k)

33.9 ms ± 2.71 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
%%timeit
heapsort(r1m)

2.24 s ± 193 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
%%timeit
sorted(r1m)

392 ms ± 6.87 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
sorted_1m = heapsort(r1m)
sorted_1m[:5], sorted_1m[-5:]

([5, 11, 32, 35, 37], [9999942, 9999950, 9999960, 9999960, 9999973])

In [15]:
%%timeit
heapsort(sorted_1m)

1.37 s ± 245 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%%timeit
sorted(sorted_1m)
# bubble sort in this best case should be close to this...

107 ms ± 5.92 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%%timeit
sorted(r10k) # timsort which is mergesort plus (insertion sort for < 50 items)

1.58 ms ± 6.18 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [None]:
%%timeit
sorted(r100k) # timsort which is mergesort plus (insertion sort for < 50 items)

24.7 ms ± 1.52 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%%timeit
sorted(r1m)

385 ms ± 18.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Python program for implementation of heap Sort - GeeksForGeeks Version
 
# To heapify subtree rooted at index i.
# n is size of heap
 
 
def heapify(arr, n, i):
    largest = i  # Initialize largest as root
    l = 2 * i + 1     # left = 2*i + 1
    r = 2 * i + 2     # right = 2*i + 2
 
    # See if left child of root exists and is
    # greater than root
    if l < n and arr[largest] < arr[l]:
        largest = l
 
    # See if right child of root exists and is
    # greater than root
    if r < n and arr[largest] < arr[r]:
        largest = r
 
    # Change root, if needed
    if largest != i:
        arr[i], arr[largest] = arr[largest], arr[i]  # swap
 
        # Heapify the root.
        heapify(arr, n, largest)
 
# The main function to sort an array of given size
 
 
def heap_sort_geeks(arr):
    n = len(arr)
 
    # Build a maxheap.
    for i in range(n//2 - 1, -1, -1):
        heapify(arr, n, i)
 
    # One by one extract elements
    for i in range(n-1, 0, -1):
        arr[i], arr[0] = arr[0], arr[i]  # swap
        heapify(arr, i, 0)
 
 
# Driver code
arr = [12, 11, 13, 5, 6, 7]
heap_sort_geeks(arr)
n = len(arr)
print("Sorted array is")
for i in range(n):
    print("%d" % arr[i]),
# This code is contributed by Mohit Kumra

Sorted array is
5
6
7
11
12
13


In [None]:
%%timeit
heap_sort_geeks(r10k)

147 ms ± 2.89 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
r10k[:10]

[193, 358, 359, 364, 374, 391, 516, 578, 587, 604]

In [None]:
%%timeit
heap_sort_geeks(r10k)

152 ms ± 6.98 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%%timeit
heap_sort_geeks(r100k)

1.9 s ± 53.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%timeit
heap_sort_geeks(r100k)

1.84 s ± 8.07 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# so one takeway is unless you have a serious need use existing data structures (collections) whenever possible
# your own data structures are likely to be slower
# of course if you come up with an improvement\
# that's probably masters thesis worth or even PhD!
# the low hanging fruit has already been picked but you never know

![Heap sort](https://upload.wikimedia.org/wikipedia/commons/1/1b/Sorting_heapsort_anim.gif)

# Quick Sort

![QuickSort](https://upload.wikimedia.org/wikipedia/commons/6/6a/Sorting_quicksort_anim.gif)

#### Hungarian Dance version - https://www.youtube.com/watch?v=ywWBy6J5gz8

In [None]:
# https://en.wikipedia.org/wiki/Tony_Hoare inventor of quicksort in early 1960s

# So quick sort algorithm
# choose a pivot (some value)
# partion values - those smaller go left, those bigger go right
# then apply quicksort to these subdivisions recursively

#eventually there are nothing to partition and we are done!

In [17]:
def naive_quicksort(it):
    if len(it) <= 1: # it is possible to get an empty list/array
        return it # so this is our base case
    pivot = it[0] # so our pivot is the first tiem random would be even better, 
    # an adversary could generate a worst case data set
    # if my pivot selection is deterministic
    # so partitioning will be 2 linear runs and also will take extra memory unlike optimized quicksort
    left = [n for n in it if n < pivot]  # assuming no duplicates
    right = [n for n in it if n > pivot] # so i am creating new lists/array so not very space efficient
    return naive_quicksort(left) + [pivot] + naive_quicksort(right) 
# so after one sortie pivot is guaranteed to be in the correct place in our sequence/array
# so the Recurrence looks like T(n) = 2T(n/2) + n (well n/2 is not guaranteed so it could be worse)

In [18]:
naive_quicksort([54, 26, 93, 17, 77, 31, 44, 55, 20, -33, 13,-11])

[-33, -11, 13, 17, 20, 26, 31, 44, 54, 55, 77, 93]

In [19]:
r10k = [random.randint(1,1_000_000) for n in range(10_000)]
r100k = [random.randint(1,1_000_000) for n in range(100_000)]
r1m = [random.randint(1,10_000_000) for n in range(1_000_000)]

In [23]:
# sanity check if our date is truly random
r10k[:5], r100k[:5], r1m[:5]

([295029, 120896, 350277, 349363, 744818],
 [295311, 926829, 907424, 907176, 400722],
 [108325, 4830094, 6274241, 451786, 4323015])

In [22]:
%%timeit
naive_quicksort(r10k)

29.6 ms ± 812 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [24]:
%%timeit
sorted(r10k)

1.65 ms ± 186 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [25]:
%%timeit
naive_quicksort(r100k)

388 ms ± 15.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [26]:
%%timeit
naive_quicksort(r1m)

6.59 s ± 561 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


![quick](https://github.com/ValRCS/RTU_Algorithms_DIP321/blob/main/imgs/quicksort.png?raw=1)

$$Complexity: O(nlog(n))$$ $$Worst case : O(n^2)$$

In [27]:
def quickSort(alist):
    quickSortHelper(alist, 0, len(alist) - 1)


def quickSortHelper(alist, first, last):
    if first < last:

        splitpoint = partition(alist, first, last)

        quickSortHelper(alist, first, splitpoint - 1)
        quickSortHelper(alist, splitpoint + 1, last)


def partition(alist, first, last):
    pivotvalue = alist[first]

    leftmark = first + 1
    rightmark = last

    done = False
    while not done:

        while leftmark <= rightmark and alist[leftmark] <= pivotvalue:
            leftmark = leftmark + 1

        while alist[rightmark] >= pivotvalue and rightmark >= leftmark:
            rightmark = rightmark - 1

        if rightmark < leftmark:
            done = True
        else:
            temp = alist[leftmark]
            alist[leftmark] = alist[rightmark]
            alist[rightmark] = temp

    temp = alist[first]
    alist[first] = alist[rightmark]
    alist[rightmark] = temp

    return rightmark


alist = [54, 26, 93, 17, 77, 31, 44, 55, 20]
quickSort(alist)
print(alist)

[17, 20, 26, 31, 44, 54, 55, 77, 93]


In [None]:
# so quicksort reccurence would be
T(n) = 2T(n/2) + n # soT(n/2) would be average case 
# worst case would be with pivots at the wrong end (not middle)
T(n) = T(1) + T(n-1) + n # which lead so quadratic complexity
# this could in real life scenario if you were applying naive quicksort to reversely ordered list

In [None]:
# so we could pick 1st in array/list as pivot
# we could last in array/list as pivot
# we could middle item - less likely to be bad randomly but attacker could still make upa bad dataset
# best would be to choose pivot randomly - but generating random numbers is slowish
# hybrid/compromise
# pick first item, last item, and middle item
# out of those 3 you pick the median value

In [None]:
# Turns out it can be proven (see Cormen- CLRS) that n log n is the best we can do for sorting algorithms which involve
# comparisons

In [None]:
# well those are so called bucket sorts, you have counting sort, radix sort, bucket sort
# these will have O(n component + some k some other component) - k being something to do with data

In [28]:
# so if we know we will only have to sort numbers 0 to 9 then we can make a bucket for these numbers
r_digits = [random.randint(0,9) for _ in range(100)]
r_digits[:15]

[8, 6, 2, 7, 3, 8, 0, 7, 1, 3, 8, 1, 2, 4, 4]

In [34]:
sorted(r_digits)[:15]

[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

In [30]:
# so in this case I will use buckets to store each occurence of digit
buckets = [0 for _ in range(10)]
buckets

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [36]:
def naive_digit_sort(dig_list):
    
    buckets = [0 for _ in range(10)] # again linear making of buckets
    # so linear complexity for this part
    for digit in dig_list:
        buckets[digit] += 1
    # then rebuilding also will be linear
    t = [[i]*n for i, n in enumerate(buckets)] # so list of lists
    flat_list = [item for sublist in t for item in sublist] #thank you Alex Martelli!
    return flat_list

In [40]:
sorted_digits = naive_digit_sort(r_digits)
sorted_digits[:15], sorted_digits[-20:]

([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
 [8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9])

In [None]:
# Python program for counting sort - GeeksForGeeks
 
# The main function that sort the given string arr[] in 
# alphabetical order
def countSort(arr):
 
    # The output character array that will have sorted arr
    output = [0 for i in range(len(arr))]
 
    # Create a count array to store count of inidividul
    # characters and initialize count array as 0
    count = [0 for i in range(256)]
    # so this works because we  are using English ASCII alphabet no Latvian, Russian or Chinese..
    # for extra full Unicode we'd need 100k + buckets
 
    # For storing the resulting answer since the 
    # string is immutable
    ans = ["" for _ in arr]
 
    # Store count of each character
    for i in arr:
        count[ord(i)] += 1
 
    # Change count[i] so that count[i] now contains actual
    # position of this character in output array
    for i in range(256):
        count[i] += count[i-1]
 
    # Build the output character array
    for i in range(len(arr)):
        output[count[ord(arr[i])]-1] = arr[i]
        count[ord(arr[i])] -= 1
 
    # Copy the output array to arr, so that arr now
    # contains sorted characters
    for i in range(len(arr)):
        ans[i] = output[i]
    return ans 
 
# Driver program to test above function
arr = "RTU rocks and sorting is not that bad"
ans = countSort(arr)
print("Sorted character array is % s" %("".join(ans)))
 
# This code is contributed by Nikhil Kumar Singh

Sorted character array is        RTUaaabcddghiiknnnooorrssstttt


In [None]:
ans = countSort("Valdis teaching at RTU")
"".join(ans)

'   RTUVaaacdeghiilnstt'

In [None]:
# https://en.wikipedia.org/wiki/Radix_sort
# Python program for implementation of Radix Sort 
# A function to do counting sort of arr[] according to 
# the digit represented by exp. 
  
def countingSort(arr, exp1): 
  
    n = len(arr) 
  
    # The output array elements that will have sorted arr 
    output = [0] * (n) 
  
    # initialize count array as 0 
    count = [0] * (10) 
  
    # Store count of occurrences in count[] 
    for i in range(0, n): 
        index = (arr[i] / exp1) 
        count[int(index % 10)] += 1
  
    # Change count[i] so that count[i] now contains actual 
    # position of this digit in output array 
    for i in range(1, 10): 
        count[i] += count[i - 1] 
  
    # Build the output array 
    i = n - 1
    while i >= 0: 
        index = (arr[i] / exp1) 
        output[count[int(index % 10)] - 1] = arr[i] 
        count[int(index % 10)] -= 1
        i -= 1
  
    # Copying the output array to arr[], 
    # so that arr now contains sorted numbers 
    i = 0
    for i in range(0, len(arr)): 
        arr[i] = output[i] 
  
# Method to do Radix Sort 
def radixSort(arr): 
  
    # Find the maximum number to know number of digits 
    max1 = max(arr) 
  
    # Do counting sort for every digit. Note that instead 
    # of passing digit number, exp is passed. exp is 10^i 
    # where i is current digit number 
    exp = 1
    while max1 / exp > 0: 
        countingSort(arr, exp) 
        exp *= 10
  
  
# Driver code 
arr = [170, 45, 75, 90, 802, 24, 2, 66] 
  
# Function Call 
radixSort(arr) 
  

print(arr)
  
# This code is contributed by Mohit Kumra 
# Edited by Patrick Gallagher 

[2, 24, 45, 66, 75, 90, 170, 802]


In [None]:
r1k = [random.randint(1,10_000) for n in range(1_000)]
r1k[:5]

[7765, 3180, 945, 557, 4201]

In [None]:
%%timeit
radixSort(r1k)

807 ms ± 22.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
r1k[:5]

[2, 7, 11, 23, 35]

In [None]:
# Python program for counting sort 
# which takes negative numbers as well
# so we will need to move the buckets
 
# The function that sorts the given arr[]
def count_sort(arr):
    max_element = int(max(arr)) # these are linear operations
    min_element = int(min(arr))
    range_of_elements = max_element - min_element + 1
    # Create a count array to store count of individual
    # elements and initialize count array as 0
    count_arr = [0 for _ in range(range_of_elements)]
    output_arr = [0 for _ in range(len(arr))]
 
    # Store count of each character
    for i in range(0, len(arr)):
        count_arr[arr[i]-min_element] += 1
 
    # Change count_arr[i] so that count_arr[i] now contains actual
    # position of this element in output array
    for i in range(1, len(count_arr)):
        count_arr[i] += count_arr[i-1]
 
    # Build the output character array
    for i in range(len(arr)-1, -1, -1):
        output_arr[count_arr[arr[i] - min_element] - 1] = arr[i]
        count_arr[arr[i] - min_element] -= 1
 
    # Copy the output array to arr, so that arr now
    # contains sorted characters
    for i in range(0, len(arr)):
        arr[i] = output_arr[i]
 
    return arr
 
 
# Driver program to test above function
arr = [-5, -10, 0, -3, 8, 5, -1, 10]
ans = count_sort(arr)
print("Sorted character array is " + str(ans))

Sorted character array is [-10, -5, -3, -1, 0, 5, 8, 10]


In [None]:
%%timeit
count_sort(r1k)

4.28 ms ± 80.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
r1k[:5]

[2, 7, 11, 23, 35]

In [None]:
r10k[:5]

[197150, 507406, 183256, 765344, 236740]

In [None]:
%%timeit
count_sort(r10k)

366 ms ± 7.02 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
r10k[:5]

[177, 274, 370, 530, 706]

In [None]:
random.shuffle(r10k)
r10k[:5]

[99241, 79321, 253639, 283493, 497741]

In [None]:
%%timeit
count_sort(r10k)

373 ms ± 20.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
r10k[:5]

[177, 274, 370, 530, 706]

In [None]:
# so Bucket, Counting and Radix sorts can theoretically beat O(n log n), but in practice they are less often used, 
# because they need extra space for the buckets
# and they have larger constants


In [None]:
# in practice current pragmatic and practial algorithm is Timsort
# https://en.wikipedia.org/wiki/Timsort
# which uses insertion sort for small number of items and merge sort for larger collections

In [None]:
# there are some improvements to constants using truth tables
# presorting small amounts of data
# https://github.com/scandum/quadsort

In [None]:
# so stability is another criterion when choosing
# https://www.geeksforgeeks.org/stable-and-unstable-sorting-algorithms/