# Preparing the data

## Import Dependencies

In [1]:
import pandas as pd
import numpy as np
import gc
from tqdm import tqdm, tqdm_notebook
import time

## Load Data

In [2]:
# Load the train set, the test set and the sample submission, in that order
train_df, test_df, sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/train.csv',
        '../input/test.csv',
        '../input/sample_submission.csv',
    )
)

FileNotFoundError: File b'../input/train.csv' does not exist

In [None]:
# Print the train set's shape, then end the cell with head() so that a
# notebook displays a preview of its first rows
print('train set shape: {}'.format(train_df.shape))
train_df.head()

In [None]:
# Print the test set's shape, then end the cell with head() so that a
# notebook displays a preview of its first rows
print('test set shape: {}'.format(test_df.shape))
test_df.head()

## Filter out the fake samples from the test set

In [None]:
def get_beautiful_test(test):
    """Split the test set into real and fake rows and keep only the real ones.

    A row counts as real ("non ugly") when every value after the first column
    is left unchanged by rounding to 2 decimals; a row with at least one value
    that changes under that rounding counts as fake ("ugly").

    The check is done with a single vectorized comparison over the whole
    frame instead of a Python loop with two ``.iloc`` lookups per row.

    Side effects: prints the number of real and fake rows, and saves the two
    index arrays to 'test_ugly_indexes.npy' and 'test_non_ugly_indexes.npy'
    in the current working directory.

    Args:
        test: DataFrame whose first column is skipped by the check (all
            remaining columns are compared against their rounded values).

    Returns:
        A tuple ``(test, non_ugly_indexes, ugly_indexes)``: the real rows with
        a fresh 0..n-1 index, and the positional indexes (lists of int) of the
        real and of the fake rows in the input frame.
    """
    # Everything except the first column takes part in the rounding check
    features = test.iloc[:, 1:]

    # True for the rows where rounding to 2 decimals changes nothing
    is_real = np.asarray(
        (np.round(features, 2).values == features.values).all(axis=1),
        dtype=bool,
    )

    non_ugly_indexes = np.flatnonzero(is_real).tolist()
    ugly_indexes = np.flatnonzero(~is_real).tolist()

    print('Count of real samples in the test set: {}'.format(len(non_ugly_indexes)))
    print('Count of fake samples in the test set: {}'.format(len(ugly_indexes)))

    # Persist both index lists (np.save appends the '.npy' extension)
    np.save('test_ugly_indexes', np.array(ugly_indexes))
    np.save('test_non_ugly_indexes', np.array(non_ugly_indexes))

    test = test.iloc[non_ugly_indexes].reset_index(drop=True)

    return test, non_ugly_indexes, ugly_indexes

# Get a test set free of fake samples: test_df is replaced by its real rows
# only, and the positional indexes of the real / fake rows are kept as well
test_df, non_ugly_indexes, ugly_indexes = get_beautiful_test(test_df)

## Combine the train and test set into a single dataframe

In [None]:
# Give the test rows a placeholder target: the mean of the train targets
test_df['target'] = train_df['target'].mean()

# Continue the row numbering where the train set ends
test_df.index = list(range(len(train_df), len(train_df) + len(test_df)))

# Put the test columns in the same order as the train columns
test_df = test_df.loc[:, train_df.columns.tolist()]

# Stack the train rows on top of the test rows
df = pd.concat((train_df, test_df))

print('full dataset shape: {}'.format(df.shape))

# Finding the leak columns

Around the middle of the competition, the leaderboard suddenly showed significant improvements, from a top score of around 1.35 to scores below 1.0. This suggested that the dataset contained leaked targets. Not long after, Giba shared the leak with all players in one of his kernels: https://www.kaggle.com/titericz/the-property-by-giba

The magic columns:

In [None]:
# Get the magic columns: the seed set of 40 column names.
# The order of the names matters: find_column_sets compares the values of
# cols[:-1] in one row with the values of cols[1:] in another row.
cols = ['f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1',
        '15ace8c9f', 'fb0f5dbfe', '58e056e12', '20aa07010', '024c577b9',
        'd6bb78916', 'b43a7cfd5', '58232a6fb', '1702b5bf0', '324921c7b',
        '62e59a501', '2ec5b290f', '241f0f867', 'fb49e4212', '66ace2992',
        'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7', '1931ccfdd',
        '703885424', '70feb1494', '491b9ee45', '23310aa6f', 'e176a204a',
        '6619d81fc', '1db387535', 'fc99f9426', '91f701ba2', '0572565c2',
        '190db8488', 'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98']

Using the magic columns, we can create an algorithm that finds the order of the rows. After that, we can use the ordered rows to find more column sets and iteratively find more ordered rows and columns.

In [None]:
def _single_match(vector, matrix):
    """Return the index of the only row of ``matrix`` equal to ``vector``.

    Returns None both when no row matches and when several rows match.
    """
    hits = np.flatnonzero(np.equal(vector, matrix).all(axis=1))
    if hits.size == 1:
        return int(hits[0])
    return None


def _grow_chain(start, mat_a, mat_b, ambiguous_msg=None):
    """Build the ordered chain of indexes that contains ``start``.

    Index ``j`` follows index ``i`` when ``mat_a[i]`` equals ``mat_b[j]``.
    A link is only accepted when it is unique in both directions. The chain
    is first grown forward from ``start`` and then backward from ``start``.

    Args:
        start: index (int) of the first element of the chain.
        mat_a: 2-D array; row ``i`` is the signature that a follower of ``i``
            must show in ``mat_b``.
        mat_b: 2-D array with as many rows as ``mat_a``.
        ambiguous_msg: optional format string printed (with the rejected
            index) when a candidate is refused because it could be linked to
            more than one element.

    Returns:
        The chain as a list of int indexes, in order.
    """
    chain = [start]
    visited = {start}

    # Pass 1 grows forward (signatures of mat_a looked up in mat_b),
    # pass 2 grows backward (the same search with the roles swapped).
    for src, dst, prepend in ((mat_a, mat_b, False), (mat_b, mat_a, True)):
        probe = src[start]
        while True:
            candidate = _single_match(probe, dst)
            if candidate is None:
                # No neighbour, or several equally good ones
                break

            if _single_match(dst[candidate], src) is None:
                # The candidate could be attached to more than one element
                if ambiguous_msg is not None:
                    print(ambiguous_msg.format(candidate))
                break

            if candidate in visited:
                # The chain closed on itself (e.g. an element that uniquely
                # matches itself); going on would never terminate.
                break

            if prepend:
                chain.insert(0, candidate)
            else:
                chain.append(candidate)
            visited.add(candidate)
            probe = src[candidate]

    return chain


def find_column_sets(df, cols, full_set_len=40):
    """Iteratively recover ordered row sequences and ordered column sets.

    Starting from the seed column set ``cols``, the function alternates two
    searches until the number of complete column sets stops growing:

    1. Row search: row ``s`` follows row ``r`` when the values of ``r`` in
       every known column set without its last column equal the values of
       ``s`` in the same sets without their first column (a lag of 1).
    2. Column search: using the consecutive row pairs of every row sequence
       longer than 2, column ``c2`` follows column ``c1`` when the values of
       ``c1`` in the leading rows equal the values of ``c2`` in the
       following rows. Columns 'ID' and 'target' are ignored.

    Rows are addressed by position in ``df`` (0..len(df)-1), so the result
    does not depend on the index labels of ``df``. Progress is printed every
    100 row sets and every 10 column sets.

    Args:
        df: DataFrame holding the 'ID' and 'target' columns plus the feature
            columns to order.
        cols: ordered list of column names used as the seed column set.
        full_set_len: number of columns of a complete column set
            (defaults to 40).

    Returns:
        A tuple ``(lag_row_dic, final_col_set_list, final_col_set_list2)``:
        ``lag_row_dic`` maps 'lag_1' to the list of row sequences found in
        the last pass (lists of int positions); ``final_col_set_list`` holds
        the column sets with at least 2 columns and ``final_col_set_list2``
        the complete ones (exactly ``full_set_len`` columns).
    """
    # Only a lag of 1 is searched; the dictionary is keyed by lag
    lag = 1
    lag_row_dic = {}

    # Visit the rows holding the most non-zero seed values first.
    # Positions (not index labels) are used, since the rows index numpy arrays.
    nonzero_counts = pd.DataFrame(
        {'count': df[cols].astype(bool).sum(axis=1).values}
    )
    row_order = nonzero_counts.sort_values('count', ascending=False).index.tolist()

    # Every column except the identifier and the target can join a column set
    col_list = [col for col in df.columns if col != 'target' and col != 'ID']

    col_set_list = [cols]
    previous_column_count = 0

    while True:
        # ---- Row search ---------------------------------------------------
        # Two views of the known column sets, offset by the lag
        cols_a = [col for col_set in col_set_list for col in col_set[:-lag]]
        cols_b = [col for col_set in col_set_list for col in col_set[lag:]]
        rows_lead = df[cols_a].values
        rows_follow = df[cols_b].values

        pending_rows = set(row_order)
        row_set_list = []

        for start_row in row_order:
            if start_row not in pending_rows:
                # Already part of a sequence found earlier in this pass
                continue

            row_set = _grow_chain(
                start_row, rows_lead, rows_follow,
                ambiguous_msg='New row {} could not be added - double',
            )
            pending_rows.difference_update(row_set)
            row_set_list.append(row_set)

            if len(row_set_list) % 100 == 0:
                print('\n{}'.format(time.strftime('%H:%M')))
                print('Completed {} set!'.format(len(row_set_list)))
                print('{} rows remaining!'.format(len(pending_rows)))

        lag_row_dic['lag_{}'.format(lag)] = row_set_list

        # ---- Column search ------------------------------------------------
        # Only sequences of more than 2 rows are trusted to order the columns
        long_row_sets = [row_set for row_set in row_set_list if len(row_set) > 2]
        rows_a = [row for row_set in long_row_sets for row in row_set[:-1]]
        rows_b = [row for row_set in long_row_sets for row in row_set[1:]]

        # One row per column, so that columns are matched like rows above
        features = df[col_list].values
        cols_lead = features[np.asarray(rows_a, dtype=np.intp)].T
        cols_follow = features[np.asarray(rows_b, dtype=np.intp)].T

        pending_cols = set(range(len(col_list)))
        col_set_list = []
        col_set_count = 0

        for start_col in range(len(col_list)):
            if start_col not in pending_cols:
                continue

            col_chain = _grow_chain(start_col, cols_lead, cols_follow)
            pending_cols.difference_update(col_chain)
            col_set = [col_list[col_index] for col_index in col_chain]

            # Count the full column sets
            if len(col_set) == full_set_len:
                col_set_count += 1

            col_set_list.append(col_set)

            if len(col_set_list) % 10 == 0:
                print('\n{}'.format(time.strftime('%H:%M')))
                print('\nCompleted {} set!'.format(len(col_set_list)))
                print('{} columns remaining!'.format(len(pending_cols)))
                print('{} full sets found!'.format(col_set_count))

        # Sets of at least 2 columns, and the complete sets among them
        final_col_set_list = [
            col_set for col_set in col_set_list if len(col_set) > 1
        ]
        final_col_set_list2 = [
            col_set for col_set in col_set_list if len(col_set) == full_set_len
        ]

        # Run another pass with the new column sets only while the number of
        # complete sets keeps growing
        new_column_set_count = len(final_col_set_list2)
        if new_column_set_count <= previous_column_count:
            break
        previous_column_count = new_column_set_count

    return lag_row_dic, final_col_set_list, final_col_set_list2

# Run the search on the combined train + test frame, seeded with the magic
# columns; keep the row sequences, the column sets of at least 2 columns and
# the complete 40-column sets
lag_row_dic, final_col_set_list, final_col_set_list2 = find_column_sets(df, cols)