# 1. Import & Load & Extract

In [None]:
import numpy as np
import pandas as pd
pd.set_option("max_colwidth", 80)

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import copy
import csv

In [None]:
def encode_symbols(data, symbols):
    """Replace every symbol in a Series of strings with its code.

    Parameters
    ----------
    data : pd.Series
        Strings that contain the symbols to encode.
    symbols : dict
        Mapping ``code -> symbol``; each occurrence of ``symbol`` is
        replaced with ``str(code)``, in the dict's iteration order.

    Returns
    -------
    pd.Series or object
        An encoded copy of ``data``. If ``data`` is not a Series or
        ``symbols`` is not a dict, ``data`` is returned unchanged.
    """
    if not (isinstance(data, pd.Series) and isinstance(symbols, dict)):
        return data

    encoded = data.copy()

    for code, symbol in symbols.items():
        # Symbols are literal text, not patterns: regex=False keeps
        # metacharacters such as "." or "*" from being interpreted.
        encoded = encoded.str.replace(symbol, str(code), regex=False)

    return encoded


def decode_symbols(data, symbols):
    """Replace every code in a Series of strings with its symbol.

    Inverse of the encoding step: each occurrence of ``str(code)`` is
    replaced with the mapped symbol, in the dict's iteration order.

    Parameters
    ----------
    data : pd.Series
        Strings that contain the codes to decode.
    symbols : dict
        Mapping ``code -> symbol``.

    Returns
    -------
    pd.Series or object
        A decoded copy of ``data``. If ``data`` is not a Series or
        ``symbols`` is not a dict, ``data`` is returned unchanged.
    """
    if not (isinstance(data, pd.Series) and isinstance(symbols, dict)):
        return data

    decoded = data.copy()

    for code, symbol in symbols.items():
        # Codes and symbols are literal text: regex=False prevents codes
        # such as "." from matching as patterns and keeps backslashes in
        # the replacement from being read as group references.
        decoded = decoded.str.replace(str(code), symbol, regex=False)

    return decoded


def data_info(data, n_samples=None):
    """Print a short textual summary of a Series of strings.

    The report contains the number of items, the total length of all
    items, the distribution of item lengths and a preview of the values.

    Parameters
    ----------
    data : pd.Series
        Series whose values support the ``.str`` accessor.
    n_samples : int, optional
        When a positive int, preview that many randomly sampled rows
        (fixed ``random_state=42``); otherwise preview ``head()`` followed
        by ``tail()``.

    Returns
    -------
    bool or None
        ``False`` when ``data`` is not a Series, otherwise ``None``.
    """
    if not isinstance(data, pd.Series):
        return False

    print("\n ***** Info about data *****\n")

    # Sum the per-item lengths instead of concatenating every string just
    # to measure the result; missing values are skipped by sum().
    lengths = data.str.len()
    sum_len = int(lengths.sum())

    print("size:    ", data.size)
    print("sum+len: ", sum_len)

    result = lengths.value_counts()
    print("\nlen + value_counts:")
    print(result.to_string())

    print()

    if n_samples and isinstance(n_samples, int) and n_samples > 0:
        print("=== samples({}) ===".format(n_samples))
        result = data.sample(n_samples, random_state=42)
    else:
        print("=== head()+tail() ===")
        # Series.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent.
        result = pd.concat([data.head(), data.tail()])

    print("index - value")
    print(result.to_string())
    print()

In [None]:
# Input CSV locations under ../input/santa-2021/.
path_to_sample = "../input/santa-2021/sample_submission.csv"
path_to_distance = "../input/santa-2021/distance_matrix.csv"
path_to_wildcards = "../input/santa-2021/wildcards.csv"
path_to_permutations = "../input/santa-2021/permutations.csv"

# Only the permutations file is read in this cell.
permutations_data = pd.read_csv(path_to_permutations)
permutations_data.head()

In [None]:
# Digit code -> emoji symbol. This mapping is passed to encode_symbols()
# and decode_symbols() to switch between the two representations.
symbols = {
    0: '🌟',  # wildcard
    1: '🎅',  # first start combination
    2: '🤶',  # second start combination
    3: '🦌',
    4: '🧝',
    5: '🎄',
    6: '🎁',
    7: '🎀',
}

In [None]:
# Encode the 'Permutation' column so every symbol becomes its digit code.
all_combinations = encode_symbols(permutations_data['Permutation'], symbols)
all_combinations.head()

# 2. Solution

In [None]:
def _join_by_slice_len(data, value, slice_len):
    """ \o/ """
    result = value

    while data.str.startswith(result[-slice_len:]).any():
        
        check_slice = data.str.startswith(result[-slice_len:])
        indices_list = data[check_slice].index.to_list()

        cutted_value = data.pop(indices_list[0])
        result = result[:-slice_len] + cutted_value

    return result


def join_by_one(value, data, is_copy=True, search_type='contains'):
    """Pick the first string matching ``value`` and chain followers onto it.

    The first entry of ``data`` that matches ``value`` is popped. The chain
    is then extended via ``_join_by_slice_len`` with an overlap of
    ``len(entry) - 1`` characters, as long as remaining entries start with
    the chain's tail.

    Parameters
    ----------
    value : str
        Text to search for.
    data : pd.Series
        Pool of candidate strings. Used entries are popped from it; pass
        ``is_copy=False`` to let the caller's Series be consumed.
    is_copy : bool, default True
        Work on a copy of ``data``.
    search_type : {'contains', 'startswith'}
        How ``value`` is matched against the entries.

    Returns
    -------
    str, None or False
        The chained string; ``None`` when nothing matches; ``False`` when
        an argument has the wrong type or ``search_type`` is unknown.
    """
    if not (isinstance(value, str) and isinstance(data, pd.Series)):
        return False

    if is_copy == True:
        data = data.copy()

    if search_type not in ("contains", "startswith"):
        return False
    found_mask = getattr(data.str, search_type)(value)

    if not found_mask.any():
        return None

    chain = data.pop(data[found_mask].index[0])

    # Nothing left in the pool to extend the chain with.
    if not data.any():
        return chain

    overlap = len(chain) - 1
    has_follower = data.str.startswith(chain[-overlap:]).any()

    return _join_by_slice_len(data, chain, overlap) if has_follower else chain


## 2.1 Combinations start

In [None]:
# Keep only the encoded permutations that begin with "12" and renumber
# them from 0.
pattern_start = "12"
mask_start = all_combinations.str.startswith(pattern_start)
combs_start = all_combinations[mask_start].copy().reset_index(drop=True)

In [None]:
# Print size, total length, length distribution and a preview.
data_info(combs_start)

## 2.2 Combinations middle

In [None]:
# `str.match` tests the pattern from the start of each string.
#   1st alternative: "12" appears after any run of leading digits.
#   2nd alternative: the string starts with 2, then one of 3-7, then four
#                    digits, then 1.
pattern_middle = r"\d*12\d*|^2[3-7]\d{4}1"
mask_middle = all_combinations.str.match(pattern_middle)

combs_middle = all_combinations[mask_middle].copy().reset_index(drop=True)

In [None]:
# Print size, total length, length distribution and a preview.
data_info(combs_middle)

In [None]:
%%time
# For every start combination, use its last `slice_len` characters as the
# prefix and build a chain from a fresh copy of `combs_middle` (each
# iteration starts from the full pool again).
slice_len = 4
joined_combs_data = []

for x in combs_start:
    joined_combs_data.append(join_by_one(x[-slice_len:],
                                         combs_middle.copy(),
                                         is_copy=False,
                                         search_type='startswith'))

# join_by_one returns None when nothing starts with the prefix, so this
# Series can contain None entries.
joined_combs_middle = pd.Series(joined_combs_data) 

In [None]:
# Print size, total length, length distribution and a preview.
data_info(joined_combs_middle)

## 2.3 Combinations end

In [None]:
# Everything the middle pattern did NOT match, renumbered from 0.
combs_end = all_combinations[~mask_middle].copy().reset_index(drop=True)

In [None]:
# Print size, total length, length distribution and a preview.
data_info(combs_end)

In [None]:
%%time
# Working pool: join_by_one is called with is_copy=False, so every string
# it uses is popped from this Series and the pool shrinks across iterations.
join_combs_end = combs_end.copy()

slice_len = 5
joined_combs_data = []

for value in combs_start:

    # First step
    # Skip once the pool is exhausted.
    if not join_combs_end.any():
        continue        
        
    combs = join_by_one(value[-slice_len:],
                        join_combs_end,
                        is_copy=False,
                        search_type='startswith')
    
    # None means no pool entry starts with this prefix: nothing is appended
    # for this start combination.
    if not combs:
        continue        
        
    # Second step / 7-step loop /
    # Try up to 7 more times to extend the chain, overlapping on its last
    # `slice_len` characters.
    for _ in range(7):
        if join_combs_end.any():
            result = join_by_one(combs[-slice_len:],
                                 join_combs_end,
                                 is_copy=False,
                                 search_type='startswith')
            if result:
                combs = combs[:-slice_len] + result
        else:
            break
            
    joined_combs_data.append(combs)

# Number of pool entries that were never consumed.
print("Size 'join_combs_end' after: {}\n".format(join_combs_end.size))

joined_combs_end = pd.Series(joined_combs_data)

In [None]:
# Print size, total length, length distribution and a preview.
data_info(joined_combs_end)

## 2.4 EDA

In [None]:
def get_table_data(data):
    """Split a Series of strings into one column per character position.

    Column ``P<i>`` holds the ``i``-th character of every string; strings
    shorter than the longest one (and missing values) yield NaN there.

    Parameters
    ----------
    data : pd.Series
        Series of strings.

    Returns
    -------
    pd.DataFrame or object
        The character table, indexed like ``data``. An input without any
        characters (empty Series, only missing values or only empty
        strings) gives a DataFrame with no columns. Anything that is not a
        Series is returned unchanged.
    """
    if not isinstance(data, pd.Series):
        return data

    lengths = data.str.len()
    # str.len() becomes float as soon as one value is missing, and max()
    # is NaN when there is nothing to measure; range() needs a real int.
    max_len = int(lengths.max()) if lengths.notna().any() else 0

    if max_len == 0:
        # pd.concat refuses an empty list of columns.
        return pd.DataFrame(index=data.index)

    columns = [data.str.get(pos).rename("P" + str(pos))
               for pos in range(max_len)]

    return pd.concat(columns, axis=1)


def get_melt_data(data):
    """Reshape a Series of strings into long (value, position) form.

    For every character position ``i`` up to the longest string, one row
    per input item is produced with the character in ``Value`` and the
    label ``P<i>`` in ``Position``. Rows are grouped by position and the
    result is renumbered from 0.

    Parameters
    ----------
    data : pd.Series
        Series of strings.

    Returns
    -------
    pd.DataFrame or object
        Long-form frame with columns ``Value`` and ``Position``. An input
        without any characters gives an empty frame with those columns.
        Anything that is not a Series is returned unchanged.
    """
    if not isinstance(data, pd.Series):
        return data

    melt_value = "Value"
    melt_index = "Position"

    lengths = data.str.len()
    # str.len() becomes float as soon as one value is missing, and max()
    # is NaN when there is nothing to measure; range() needs a real int.
    max_len = int(lengths.max()) if lengths.notna().any() else 0

    if max_len == 0:
        # pd.concat refuses an empty list of frames.
        return pd.DataFrame(columns=[melt_value, melt_index])

    frames = [
        pd.DataFrame({melt_value: data.str.get(pos),
                      melt_index: "P" + str(pos)})
        for pos in range(max_len)
    ]

    return pd.concat(frames, axis=0).reset_index(drop=True)


def plot_melt_data(data, figsize=(12, 8), orient='h'):
    """Draw a stacked histogram of a two-column long-form frame.

    The second column is placed on the categorical axis and the first
    column is used as hue.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with exactly two columns; any other input is ignored.
    figsize : tuple, default (12, 8)
        Size passed to ``plt.figure``.
    orient : {'h', 'v'}
        'h' puts the second column on the y axis, 'v' on the x axis.

    Returns
    -------
    str or None
        A hint string listing the valid options when ``orient`` is not
        recognised, otherwise ``None``.
    """
    if not isinstance(data, pd.DataFrame) or len(data.columns) != 2:
        return None

    hue_column, axis_column = data.columns.to_list()

    allowed = ['h', 'v']
    if orient not in allowed:
        return "orient={}".format(allowed)

    plt.figure(figsize=figsize)

    horizontal = orient == 'h'
    axis_kwarg = {'y' if horizontal else 'x': axis_column}
    sns.histplot(hue=hue_column, data=data,
                 multiple='stack', shrink=.75, **axis_kwarg)

    # Hide the label of the count axis.
    if horizontal:
        plt.xlabel("")
    else:
        plt.ylabel("")

    plt.show()

In [None]:
# Put the three Series side by side; pandas aligns them on their index.
joined_data = {'Start': combs_start,
               'Middle': joined_combs_middle,
               'End': joined_combs_end}

data = pd.DataFrame(joined_data)

In [None]:
# Display the combined Start / Middle / End frame.
data

In [None]:
# One column per character position of the 'Middle' chains.
middle_table = get_table_data(data['Middle'])
middle_table.head()

In [None]:
# Stacked histogram of the characters found at each position of 'Middle'.
middle_melt_data = get_melt_data(data['Middle'])
plot_melt_data(middle_melt_data, orient='v')

In [None]:
# Stacked histogram of the characters found at each position of 'End'.
end_melt_data = get_melt_data(data['End'])
plot_melt_data(end_melt_data, orient='v')

## 2.5 Solution 2440

In [None]:
def join_left(first_combs, second_combs, slice_len, is_copy=False, specific_len=None):
    """Extend each string of ``first_combs`` with one from ``second_combs``.

    For every entry of ``first_combs``, the first entry of ``second_combs``
    whose first ``slice_len`` characters equal the entry's last
    ``slice_len`` characters is popped and appended, with the overlap
    written once. Both Series are modified in place.

    Parameters
    ----------
    first_combs : pd.Series
        Strings to extend (updated in place).
    second_combs : pd.Series
        Pool of continuations (used entries are removed in place).
    slice_len : int
        Number of overlapping characters required.
    is_copy : bool, default False
        Work on copies instead. The copies are not returned, so the
        caller's Series stay untouched and the joined data is discarded.
    specific_len : int, optional
        When truthy, only continuations of exactly this length are used.

    Returns
    -------
    bool
        ``False`` when either input is not a Series, otherwise ``True``.
    """
    if not (isinstance(first_combs, pd.Series)
            and isinstance(second_combs, pd.Series)):
        return False

    if is_copy == True:
        first_combs, second_combs = first_combs.copy(), second_combs.copy()

    not_found = object()

    for left_label, left_value in first_combs.items():
        # First continuation that passes the length filter and overlaps.
        match_label = next(
            (label
             for label, candidate in second_combs.items()
             if not (specific_len and len(candidate) != specific_len)
             and left_value[-slice_len:] == candidate[:slice_len]),
            not_found,
        )

        if match_label is not_found:
            continue

        continuation = second_combs.pop(match_label)
        first_combs[left_label] = left_value[:-slice_len] + continuation

    return True

In [None]:
%%time
# join_left works in place (is_copy defaults to False): x_end entries are
# extended with matching x_middle entries, which are popped from x_middle.
x_end = joined_combs_end.copy()
x_middle = joined_combs_middle.copy()
join_left(x_end, x_middle, 4)

In [None]:
# Prepend "12" to every chain.
x_end = "12" + x_end

In [None]:
# Print size, total length, length distribution and a preview.
data_info(x_end)

In [None]:
# For each 'End' chain, find the first 'Middle' chain that starts with the
# last `slice_len` characters of that 'End' chain, and record the row label
# of the match (None when there is no match).
slice_len = 4
data_result = []
from_combs = data['Middle']
check_combs = data['End']

for value in check_combs.values:
    indices_list = from_combs[from_combs.str.startswith(value[-slice_len:])].index.to_list()
    if indices_list:
        result = indices_list[0]
    else:
        result = None
        
    data_result.append(result)
    
# Two columns: 'end' is the position of the 'End' chain, 'middle' is the
# label of the 'Middle' chain it links to.
intersection_info = pd.Series(data_result, name='middle').reset_index()
intersection_info = intersection_info.rename(columns={'index': 'end'})

intersection_info.head(5)

In [None]:
# Starting from every row, follow the end -> middle links and collect the
# visited labels until the walk comes back to its starting label.
# `inter_info` keeps the labels in visiting order, `inter_info_sorted`
# keeps the same labels sorted.
# NOTE(review): the only exit is a 'middle' value equal to the start label;
# a missing link or a walk that never returns would not terminate cleanly.
# Confirm the links always form closed cycles.
inter_data = intersection_info.copy()
inter_info = []
inter_info_sorted = []
for i, value in inter_data['end'].items():
    i_result = [value]
    x = value
    while True:
        found_value = inter_data.loc[value, 'middle']
        if found_value == x:
            break
        else:
            i_result.append(found_value)            
            value = inter_data.loc[found_value , 'end']
    
    inter_info.append(i_result)
    inter_info_sorted.append(sorted(i_result))

In [None]:
# Drop duplicate walks: two walks are the same when their sorted labels
# match. The first occurrence is kept, in its visiting order.
inter_info_unique = []
inter_info_unique_sorted = []
for i, x_inter in enumerate(inter_info_sorted):
    if x_inter not in inter_info_unique_sorted:
        inter_info_unique_sorted.append(x_inter)
        inter_info_unique.append(inter_info[i])
        
len(inter_info_unique)

In [None]:
# Flatten the unique walks into a single list of row labels.
intersection_position = []
for x_inices in inter_info_unique:
    intersection_position.extend(x_inices)
    
len(intersection_position)

In [None]:
# Build a repeating 0, 1, 2, 0, 1, 2, ... marker list, one marker per
# collected position.
# NOTE(review): int() truncates, so the list is shorter than
# `intersection_position` when its length is not a multiple of `n_line`.
n_line = 3
lines_list = [x for x in range(n_line)]

lines_frequency = len(intersection_position) / n_line   # e.g. 120 / 3 = 40
intersection_lines = lines_list * int(lines_frequency)  # e.g. [0, 1, 2] * 40

len(intersection_lines)

In [None]:
# Pair every collected row label ('Indices') with its line marker ('Lines').
marked_position = pd.DataFrame({'Lines': intersection_lines,
                                'Indices': intersection_position})

marked_position[marked_position['Lines'] == 0].head()

In [None]:
# Split the chains of `x_end` into `n_line` groups using the 'Lines' marks.
# For each group, also collect the start combinations that are not yet
# contained in any chain of that group.
data_dict = {}
n_line = 3
lines = [x for x in range(n_line)]
choise_data = x_end.copy()
start_data = combs_start.copy()

for x_line in lines:
    x_indices = marked_position.loc[marked_position['Lines'] == x_line, 'Indices'].to_list()
    x_data = choise_data[x_indices].copy().reset_index(drop=True)
    x_main_combs = start_data

    # Start combinations missing from every chain of this line.
    selected_main_combs = []
    for value in x_main_combs.values:
        if not x_data.str.contains(value).any():
            selected_main_combs.append(value)

    # 'main': missing start combinations; 'choise': the chains of this line.
    data_dict[x_line] = {
        'main': pd.Series(selected_main_combs),
        'choise': x_data
    }

# `max_len` is not read again in the visible cells; the list below is the
# cell's displayed output (number of missing start combinations per line).
max_len = max([data_dict.get(x_line).get('main').size for x_line in lines])
[data_dict.get(x_line).get('main').size for x_line in lines]

In [None]:
# Display the line ids stored in data_dict.
data_dict.keys()

In [None]:
# Summary of the missing start combinations of line 0 (before joining).
data_info(data_dict.get(0).get('main'))

In [None]:
# Summary of the chains of line 0 (before joining).
data_info(data_dict.get(0).get('choise'))

In [None]:
# Snapshot taken before the in-place join below; it is used later to build
# the saved dataset.
data_dict_copy = copy.deepcopy(data_dict)

In [None]:
%%time

# Append the missing start combinations to the chains of every line.
# join_left works in place, so the Series stored in data_dict are updated:
# 'choise' chains grow and the used 'main' entries are removed.
# Note: this rebinds the module-level names `x_start` and `x_end`.
slice_len = 2

for x_line in data_dict.keys():
    x_start = data_dict.get(x_line).get('main')
    x_end = data_dict.get(x_line).get('choise')
    
    join_left(x_end, x_start, slice_len)

In [None]:
# Summary of the start combinations of line 0 left over after joining.
data_info(data_dict.get(0).get('main'))

In [None]:
# Summary of the chains of line 0 after joining.
data_info(data_dict.get(0).get('choise'))

## 2.6 Check solution

In [None]:
def get_result(data_dict):
    """Concatenate the 'choise' chains of every line into one string.

    Parameters
    ----------
    data_dict : dict
        Mapping ``line -> {'choise': pd.Series, ...}``.

    Returns
    -------
    pd.Series
        Series named 'schedule', indexed by line, holding the summed
        (concatenated) 'choise' Series of each line.
    """
    schedules = {}

    for line, parts in data_dict.items():
        chains = parts.get("choise")
        schedules[line] = chains.sum()

    return pd.Series(schedules, name='schedule')


def base_combinations_correct(combs, result):
    """Check that every combination occurs in EVERY string of ``result``.

    Parameters
    ----------
    combs : iterable of str
        Combinations to look for (each is passed to ``str.contains``).
    result : pd.Series
        Strings to search in.

    Returns
    -------
    bool
        True when each combination is contained in all strings.
    """
    in_every_string = [result.str.contains(comb).all() for comb in combs]

    return pd.Series(in_every_string).all()


def other_combinations_correct(combs, result):
    """Check that every combination occurs in AT LEAST ONE string.

    Parameters
    ----------
    combs : iterable of str
        Combinations to look for (each is passed to ``str.contains``).
    result : pd.Series
        Strings to search in.

    Returns
    -------
    bool
        True when each combination is contained in some string.
    """
    in_some_string = [result.str.contains(comb).any() for comb in combs]

    return pd.Series(in_some_string).all()

In [None]:
# One concatenated schedule string per line.
result = get_result(data_dict)
result

In [None]:
# Every permutation must appear in at least one schedule.
other_combinations_correct(all_combinations, result)

In [None]:
# Every permutation starting with "12" must appear in all schedules.
base_combinations_correct(combs_start, result)

In [None]:
# Length of each schedule string.
result.str.len()

# 3. Save combinations

In [None]:
# Build one table from the pre-join snapshot (data_dict_copy): for every
# line, 'head' is the first 7 characters of each chain, 'body' is the chain
# without its first 2 characters, and 'tail' holds the line's 'main'
# combinations. 'line' is numbered from 1.
all_lines_data = []
for x_line in data_dict_copy.keys():
    x_cols_data = data_dict_copy.get(x_line).get('choise')
    x_col_1 = x_cols_data.str.slice(stop=7)
    x_col_2 = x_cols_data.str.slice(start=2)
    x_col_3 = data_dict_copy.get(x_line).get('main')
    x_line_data = pd.DataFrame({'line': x_line + 1,
                                'head': x_col_1,
                                'body': x_col_2,
                                'tail': x_col_3})
    
    all_lines_data.append(x_line_data)
    
dataset = pd.concat(all_lines_data, axis=0, ignore_index=True)
dataset

In [None]:
# Toggle for writing combinations.csv.
save_dataset = True

if save_dataset:
    dataset_symbols = dataset.copy()

    # Turn the digit codes back into symbols in every column except the
    # numeric 'line' column.
    for col_name in dataset_symbols.columns:
        if col_name == 'line':
            continue
        
        dataset_symbols[col_name] = decode_symbols(dataset_symbols[col_name],
                                                   symbols)
        
    dataset_symbols.to_csv('combinations.csv', index=False)
    
print(save_dataset)

# 4. Save submission

In [None]:
# Toggle for writing submission.csv.
save_submission = True

if save_submission:
    submission = result.copy()
    
    # Turn the digit codes back into symbols before saving.
    submission = decode_symbols(submission, symbols)
    
    # The Series is named 'schedule', which becomes the CSV column header.
    submission.to_csv('submission.csv', index=False)
    
print(save_submission)