In [None]:
import numpy as np 
import pandas as pd 

import os
# Walk the Kaggle input directory tree and print the full path of each file.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in files_in_folder):
        print(full_path)


In [None]:
# Competition data directory, relative to the notebook's working directory.
path_in = '../input/santa-2019-revenge-of-the-accountants/'
# Show the entries found in that directory.
print(os.listdir(path=path_in))

In [None]:
#Reference: https://www.kaggle.com/xhlulu/santa-s-2019-faster-cost-function-24-s 
from itertools import product
from time import time
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from numba import njit, prange
import matplotlib.pyplot as plt
def _build_choice_array(data, n_days):
    """Map every (family, day) pair to the rank of that day in the family's wish list.

    Args:
        data: DataFrame with one row per family and columns ``choice_0``
            through ``choice_9`` holding day numbers.
        n_days: Largest day number; the result has ``n_days + 1`` columns so
            that a day number can be used directly as a column index.

    Returns:
        Integer array of shape ``(len(data), n_days + 1)``. Entry ``[i, day]``
        is the position (0-9) of ``day`` among family ``i``'s choices, or -1
        when the family did not list that day.
    """
    preferences = data.loc[:, 'choice_0': 'choice_9'].values
    n_families = data.shape[0]
    rank_of_day = np.full((n_families, n_days + 1), -1)

    # ndenumerate visits cells row by row, so a day listed twice by the same
    # family keeps the rank of its later occurrence.
    for (family, rank), day in np.ndenumerate(preferences):
        rank_of_day[family, day] = rank

    return rank_of_day


def _precompute_accounting(max_day_count, max_diff):
    """Tabulate the per-day accounting cost for every (occupancy, difference) pair.

    Entry ``[count, diff]`` holds::

        max(0, (count - 125) / 400 * count ** (0.5 + diff / 50))

    This is the same expression ``_compute_cost_fast`` evaluates directly for
    the first day with ``diff == 0``.

    The previous version wrapped the assignment in ``for j in range(1, 6)``
    and divided by ``j ** 2``. Each pass overwrote the same cell, so only the
    ``j == 5`` value survived and every entry was 1/25 of the cost used for
    the first day. The table now stores the unweighted cost.

    NOTE(review): if a ``1 / j**2`` look-ahead weighting is wanted, it has to
    be applied where the per-day differences are summed; a single
    ``[count, diff]`` cell cannot encode several look-ahead differences.

    Args:
        max_day_count: Largest occupancy to tabulate (inclusive).
        max_diff: Largest absolute occupancy difference to tabulate (inclusive).

    Returns:
        Float array of shape ``(max_day_count + 1, max_diff + 1)``.
    """
    counts = np.arange(max_day_count + 1, dtype=np.float64)[:, np.newaxis]
    diffs = np.arange(max_diff + 1, dtype=np.float64)[np.newaxis, :]

    cost = (counts - 125.0) / 400.0 * counts ** (0.5 + diffs / 50.0)
    accounting_matrix = np.maximum(cost, 0.0)

    # Row 0 (an empty day) is defined as zero cost, as in the original table.
    accounting_matrix[0, :] = 0.0

    return accounting_matrix


def _precompute_penalties(choice_array_num, family_size):
    """Build the preference-penalty lookup for every (family, day) pair.

    Args:
        choice_array_num: 2-D integer array; entry ``[i, day]`` is the rank
            (0-9) of ``day`` for family ``i`` or -1 when the day was not
            chosen. The -1 selects the last penalty column through negative
            indexing.
        family_size: 1-D integer array with the number of people per family.

    Returns:
        Float array with the shape of ``choice_array_num``; entry ``[i, day]``
        is the penalty for assigning family ``i`` to ``day``.
    """
    # Penalty for rank r and family size n is flat_fee[r] + per_person[r] * n.
    # The eleventh column is the "none of the choices" case.
    flat_fee = np.array([0, 50, 50, 100, 200, 200, 300, 300, 400, 500, 500])
    per_person = np.array([0, 0, 9, 9, 9, 18, 18, 36, 36, 36 + 199, 36 + 398])
    sizes = np.arange(family_size.max() + 1)
    penalties_array = flat_fee + sizes[:, np.newaxis] * per_person

    n_families = family_size.shape[0]
    penalty_matrix = np.zeros(choice_array_num.shape)
    # A single fancy-indexed lookup replaces the per-cell double loop.
    penalty_matrix[:n_families] = penalties_array[
        family_size[:, np.newaxis], choice_array_num[:n_families]
    ]

    return penalty_matrix


@njit
def _compute_cost_fast(prediction, family_size, days_array, 
                       penalty_matrix, accounting_matrix, 
                       MAX_OCCUPANCY, MIN_OCCUPANCY, N_DAYS):
    """Score an assignment of families to days.

    Args:
        prediction: Assigned day for each family; used as an index into
            ``daily_occupancy`` and into the columns of ``penalty_matrix``.
        family_size: Number of people in each family.
        days_array: Days in the order they are accounted for; the first entry
            is the starting day for the accounting cost.
        penalty_matrix: Preference penalty indexed by ``[family, day]``.
        accounting_matrix: Accounting cost indexed by
            ``[today_count, abs(today_count - yesterday_count)]``.
        MAX_OCCUPANCY: Largest allowed number of people on any day.
        MIN_OCCUPANCY: Smallest allowed number of people on any day.
        N_DAYS: Unused here; kept so existing callers keep working.

    Returns:
        Tuple ``(penalty, accounting_cost, daily_occupancy)``. When any day
        (index 1 onwards) is outside ``[MIN_OCCUPANCY, MAX_OCCUPANCY]`` the
        penalty is increased by 100000000 and the accounting cost is reported
        as 0.0.
    """
    N = family_size.shape[0]

    # Index 0 is a placeholder so that day numbers can be used as indices.
    daily_occupancy = np.zeros(len(days_array) + 1, dtype=np.int64)
    penalty = 0.0

    for i in range(N):
        n = family_size[i]
        d = prediction[i]

        daily_occupancy[d] += n
        penalty += penalty_matrix[i, d]

    relevant_occupancy = daily_occupancy[1:]
    incorrect_occupancy = np.any(
        (relevant_occupancy > MAX_OCCUPANCY) | 
        (relevant_occupancy < MIN_OCCUPANCY)
    )

    # Previously the penalty was overwritten with 100000000 unconditionally,
    # which threw away the preference cost and made every assignment look
    # infeasible. Apply it only when the occupancy constraint is violated.
    # Returning early also keeps out-of-range occupancies from being used as
    # indices into accounting_matrix below (njit does not bounds-check).
    if incorrect_occupancy:
        return penalty + 100000000.0, 0.0, daily_occupancy

    # The first accounted day has no previous day, so its difference is 0.
    init_occupancy = daily_occupancy[days_array[0]]
    accounting_cost = max(
        0.0, (init_occupancy - 125.0) / 400.0 * init_occupancy ** 0.5
    )

    # Remaining days: look up the cost from today's count and the absolute
    # change relative to the previously processed day.
    yesterday_count = init_occupancy
    for day in days_array[1:]:
        today_count = daily_occupancy[day]
        diff = abs(today_count - yesterday_count)
        accounting_cost += accounting_matrix[today_count, diff]
        yesterday_count = today_count

    return penalty, accounting_cost, daily_occupancy


def build_cost_function(data, N_DAYS=100, MAX_OCCUPANCY=300, MIN_OCCUPANCY=125):
    """Create a scoring closure bound to the lookup tables built from ``data``.

    Args:
        data: Family DataFrame with an ``n_people`` column and the
            ``choice_0`` .. ``choice_9`` columns.
        N_DAYS: Number of days; days are numbered ``1 .. N_DAYS``.
        MAX_OCCUPANCY: Upper occupancy limit, also used as the size of the
            accounting lookup table.
        MIN_OCCUPANCY: Lower occupancy limit.

    Returns:
        A function ``cost_function(prediction)`` that returns the sum of the
        penalty and the accounting cost reported by ``_compute_cost_fast``.
    """
    people_per_family = data.n_people.values
    # Days are processed from N_DAYS down to 1.
    countdown_days = np.arange(N_DAYS, 0, -1)

    # Everything below is computed once and captured by the closure.
    preference_ranks = _build_choice_array(data, N_DAYS)
    preference_penalties = _precompute_penalties(preference_ranks, people_per_family)
    accounting_table = _precompute_accounting(
        max_day_count=MAX_OCCUPANCY,
        max_diff=MAX_OCCUPANCY,
    )

    def cost_function(prediction):
        """Return penalty plus accounting cost for ``prediction``."""
        outcome = _compute_cost_fast(
            prediction,
            people_per_family,
            countdown_days,
            preference_penalties,
            accounting_table,
            MAX_OCCUPANCY,
            MIN_OCCUPANCY,
            N_DAYS,
        )
        # outcome is (penalty, accounting_cost, daily_occupancy).
        return outcome[0] + outcome[1]

    return cost_function



#reference: https://www.kaggle.com/xhlulu/santa-s-2019-stochastic-product-search
# Load the competition files, build the cost function and score the sample
# submission as the starting point for the search below.
base_path = '/kaggle/input/santa-2019-revenge-of-the-accountants/'
# NOTE(review): sub_path is not referenced anywhere else in the visible code.
sub_path = '/kaggle/vipito/santa-ip'
data = pd.read_csv(base_path + 'family_data.csv', index_col='family_id')
submission = pd.read_csv(base_path + 'sample_submission.csv', index_col='family_id')

# cost_function is read as a module-level name by stochastic_product_search.
cost_function = build_cost_function(data)
original = submission['assigned_day'].values
original_score = cost_function(original)

# IPython magic: benchmark a single evaluation of the cost function.
%timeit cost_function(original)

#Reference: https://www.kaggle.com/xhlulu/santa-s-2019-stochastic-product-search
def stochastic_product_search(top_k, fam_size, original, choice_matrix, 
                              disable_tqdm=False, verbose=10000,
                              n_iter=2000, random_state=2019):
    """Improve an assignment by randomly re-assigning small groups of families.

    Each iteration samples ``fam_size`` families, enumerates every combination
    of their first ``top_k`` choices, and keeps a combination whenever it
    lowers the score of the current best assignment.

    Relies on three module-level names that must exist before the call:
    ``cost_function`` (scores an assignment), ``time_array`` (list that
    receives the elapsed time at the start of every iteration) and
    ``start_time`` (reference timestamp for those measurements).

    Args:
        top_k: Number of leading choices considered per sampled family.
        fam_size: Number of families sampled per iteration.
        original: Starting assignment (array of days, one per family); it is
            copied, not modified.
        choice_matrix: Array of shape ``(n_families, n_choices)`` with the
            preferred days of each family.
        disable_tqdm: Passed to ``tqdm`` to switch the progress bar off.
        verbose: Print the best score every ``verbose`` iterations; a falsy
            value disables these messages.
        n_iter: Number of iterations.
        random_state: Seed passed to ``np.random.seed``.

    Returns:
        The best assignment found.
    """
    best = original.copy()
    best_score = cost_function(best)

    np.random.seed(random_state)

    for i in tqdm(range(n_iter), disable=disable_tqdm):
        # Record elapsed time per iteration for the timing plot.
        time_array.append(time() - start_time)

        fam_indices = np.random.choice(range(choice_matrix.shape[0]), size=fam_size)
        candidate_days = choice_matrix[fam_indices, :top_k].tolist()

        # Iterate the Cartesian product lazily rather than materialising all
        # top_k ** fam_size combinations as an array first.
        for change in product(*candidate_days):
            new = best.copy()
            new[fam_indices] = change

            new_score = cost_function(new)

            if new_score < best_score:
                best_score = new_score
                best = new

        # The original repeated the comparison above after the inner loop.
        # That check could never succeed (an improving candidate has already
        # been accepted) and raised NameError when there were no candidates,
        # so it has been removed.

        if verbose and i % verbose == 0:
            print(f"Iteration #{i}: Best score is {best_score:.2f}")

    print(f"Final best score is {best_score:.2f}")
    return best

# Preferred days of every family, one row per family.
choice_matrix = data.loc[:, 'choice_0': 'choice_9'].values

# time_array and start_time are read inside stochastic_product_search, so they
# must be defined before the call.
time_array = []
start_time = time()
print("Starting Stochastic product search now")
best = stochastic_product_search(
    choice_matrix=choice_matrix, 
    top_k=5,
    fam_size=5, 
    original=original, 
    n_iter=2000,
    disable_tqdm=False,
    verbose=2
)
end_time = time()
total_time = end_time-start_time
# Append the overall duration as the final point of the timing series.
time_array.append(total_time)
# x axis: one position (0, 1, 2, ...) per recorded timestamp.
bars = range(1, len(time_array) + 1)
iterations = np.arange(len(bars))

# Plot elapsed time against the index of the recorded measurement.
plt.plot(iterations, time_array)

plt.title('Stochastic Product Search Time Graph')
plt.xlabel('No. of Iterations')
plt.ylabel('Time (Sec)')

plt.show()
print(f"The stotchastic algorithm took", total_time, "seconds ")

In [None]:
from itertools import product
from time import time
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from numba import njit, prange
import matplotlib.pyplot as plt
# Reload the competition files for the quick-sort experiment.
base_path = '/kaggle/input/santa-2019-revenge-of-the-accountants/'
# NOTE(review): sub_path is not referenced anywhere else in the visible code.
sub_path = '/kaggle/vipito/santa-ip'
data = pd.read_csv(base_path + 'family_data.csv', index_col='family_id')
submission = pd.read_csv(base_path + 'sample_submission.csv', index_col='family_id')
# The sequence to be sorted: the number of people in each family.
# NOTE(review): `.values` may share memory with `data`, in which case the
# in-place sort below also reorders the DataFrame column - confirm.
original = data['n_people'].values
def inplace_quick_sort(S, a, b):
    """Sort ``S[a]`` .. ``S[b]`` (both inclusive) in place with quick-sort.

    The last element of the range is used as the pivot. To keep the recursion
    shallow, only the smaller partition is sorted recursively while the larger
    one is handled by the surrounding loop. The original recursed into both
    partitions, which raised ``RecursionError`` on long already-sorted input.

    Relies on two module-level names that must exist before the call:
    ``time_array`` (list that receives one elapsed-time sample per pass of the
    partition loop) and ``start_time`` (reference timestamp).

    Args:
        S: Mutable, index-addressable sequence whose items support ``<``.
        a: Index of the first element of the range.
        b: Index of the last element of the range.

    Returns:
        None; ``S`` is modified in place.
    """
    while a < b:
        pivot = S[b]
        left = a
        right = b - 1

        while left <= right:
            # Instrumentation for the timing plot.
            time_array.append(time() - start_time)

            # Advance left past items smaller than the pivot.
            while left <= right and S[left] < pivot:
                left += 1
            # Move right back past items larger than the pivot.
            while left <= right and pivot < S[right]:
                right -= 1
            # Both scans stopped on misplaced items: swap them and move on.
            if left <= right:
                S[left], S[right] = S[right], S[left]
                left, right = left + 1, right - 1

        # Put the pivot into its final position.
        S[left], S[b] = S[b], S[left]

        # Recurse into the smaller side and loop on the larger side.
        if left - a < b - left:
            inplace_quick_sort(S, a, left - 1)
            a = left + 1
        else:
            inplace_quick_sort(S, left + 1, b)
            b = left - 1
  
    


# start_time and time_array are read inside inplace_quick_sort, so they must
# be defined before the call.
start_time = time()
time_array = []
print("Quicksort time is starting now")
# Sort the whole array in place.
inplace_quick_sort(original,0,len(original)-1)
end_time = time()
total_time = end_time-start_time
# Append the overall duration as the final point of the timing series.
time_array.append(total_time)
print(f"The QuickSort algorithm took", total_time, "seconds")


# x axis: one position (0, 1, 2, ...) per recorded timestamp.
bars = range(1, len(time_array) + 1)
iterations = np.arange(len(bars))
plt.plot(iterations, time_array)

plt.title('QuickSort Time Graph')
plt.xlabel('No. of Iterations')
plt.ylabel('Time (Sec)')

plt.show()


Reference: Data Structures and Algorithms in Python, page 550-556, chapter 12.3: Quick-Sort


**Quicksort:** Quicksort is a comparison-based sorting algorithm that sorts a sequence using a simple recursive approach (the pivot can be chosen at random, although the implementation above always uses the last element). It is based on the divide-and-conquer paradigm: the sequence is divided into subsequences, each subsequence is sorted recursively, and the sorted subsequences are then combined by concatenation. Quicksort can sort items of any type for which a "less-than" relation is defined. The algorithm consists of the following three steps:
1.	Divide: Only done if the sequence has at least two elements, if the sequence has one or no elements nothing needs to be done. A specific element x from the sequence must be selected, which is called the pivot. Common practice is to select the pivot x to be the last element in the sequence. All of the elements must then be removed from the sequence and put into three sequences: 
L, storing the elements in the sequence less than x
E, Storing the elements in the sequence equal to x
G, storing the elements in the sequence greater than x 
If the elements of the sequence are distinct, then E will only hold the pivot

2.	Conquer: The sequences L and G are recursively sorted.
3.	Combine: Put the elements back into the sequence in order by first inserting the elements of L, then E and finally G

**Boundary Cases**
The time taken by QuickSort can be written as:
T(n) = T(k) + T(n-k-1) + O(n)
Where the first two terms are referring to the recursive calls. k = the number of elements smaller than the pivot.

**Best Case:**
The best case occurs when the partition process always picks the median element as the pivot, so the sequence is split into two halves of (almost) equal size.
The time can be written as T(n) = 2T(n/2) + O(n)
This provides us a time complexity of O(nlogn)

**Worst Case:**
This occurs when the partition picks the greatest or smallest element as a pivot.
For example, if the last element is always chosen as the pivot, the worst case occurs if the array is already sorted.
This can be written as T(n) = T(0) + T(n-1) + O(n)
This provides us a time complexity of O(n^2)

**Average Case:**
We can get an idea of the average case by considering a partition that puts n/10 of the elements in one set and 9n/10 of the elements in the other set.
This can be written as T(n) = T(n/10) + T(9n/10) + O(n)
This provides us a time complexity of O(nLogn)

In this case, the algorithm seems to be running in O(nLogn) time, as indicated by the graph. While the graph initially may look linear, there is a slight curvature around the 2000th iteration, which slightly resembles an O(nLogn) graph. If the algorithm took longer than 0.09 seconds this curvature would likely be more visible. 


Reference: https://www.tutorialsteacher.com/python/python-data-types

A sequence is an ordered collection of similar or different data types. Python has multiple built-in sequence data types, which are:

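String, List and Tuple. The textbook quick-sort algorithm operates on a list; in this notebook it is applied to the NumPy array returned by `data['n_people'].values`, which supports the same index-based access. A list is an ordered collection of one or more data items; the items do not have to be of the same type and are written inside square brackets.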

Stochastic Search Algorithm

Reference: https://www.sciencedirect.com/topics/engineering/stochastic-search-algorithm

Stochastic search algorithms are typically created for problems that have inherent random noise, or for deterministic problems that can be solved by injecting randomness. This algorithm works by sampling a small number of families. The families' top-k choices are considered and the Cartesian product of those choices is found. This gives k^f possible changes that can be made to the current assignment, where f is the number of sampled families. For each proposed change a new assignment is created by updating the current best assignment; if the new score is better than the current best score, both the best score and the best assignment are updated. This is repeated for the number of iterations given by the parameter n_iter. As the algorithm utilises a nested loop, its worst-case scenario will be a running time of O(n^2); however, in the best case the algorithm will run in O(n) time. The algorithm takes 219 seconds to run (note this time does vary slightly each time I commit this code); as it runs 2000 iterations, 219 / 2000 = 0.1095, meaning each iteration takes on average 0.1095 seconds, which indicates it is running in O(n) time. The time graph supports this, with a linear increase with respect to the number of iterations.

Matrix

Reference: https://www.careerride.com/Data-structure-matrix-and-its-uses.aspx

A matrix is used to store homogeneous data in an organized way, in a structure of rows and columns. Python does not have a built-in type for matrices; a list of lists can be treated as a matrix, and in this notebook the matrices are two-dimensional NumPy arrays (for example `choice_matrix`).