In [33]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import pickle

In [34]:
# Linear lattice size that is handed to create_full_numerical_representation.
L = 4
# Directory that holds the plaquett_lattice_bin_<n>.csv configuration files.
BASE_PATH = '~/repos/z3stringnet/'

In [35]:
# Load the bin-0 configuration file found under BASE_PATH.
df = pd.read_csv(os.path.join(BASE_PATH, 'plaquett_lattice_bin_0.csv'))

In [36]:
# Show the loaded plaquette table (columns x, y and the link letters N, E, S, W).
df

Unnamed: 0,x,y,N,E,S,W
0,0,0,W,N,B,N
1,1,0,W,S,B,N
2,2,0,B,S,E,S
3,3,0,W,N,B,S
4,1,1,E,N,W,N
5,2,1,W,B,B,N
6,3,1,B,N,W,B
7,0,1,B,N,W,N
8,0,2,E,B,B,N
9,1,2,B,B,E,B


# Link Encodings

It is worth trying some different encodings. I can see some benefits
and problems with the different approaches and will probably just
need to work through them by trying them. Here are some notes and
thoughts.

* Right now I believe the best thing to do is to physically represent each
  link at a "location" in a matrix. Zeros will "represent" the locations of
  the vertices and the centers of the plaquettes.
* If you do the above you still have some options for labeling the links.
  - You can use 1, 2, 3, 4, 5 to represent blank, up (N), down (S), right (E),
    left (W) respectively. In this case horizontal links can only take values
    1, 4, 5. Vertical links can only take values 1, 2, 3.
  - You could also use values 1, 2, 3 for blank, up/right, down/left,
    and the context of the position in the matrix, which determines whether
    it is a horizontal link or a vertical link, would distinguish between
    up and right, and between down and left.
* Of the above I think the second one would be nicer, but I'm not sure whether
  it could confuse the encoder about something important. Going to start with
  the first sub-bullet above and, after I get something working, try the second.

In [37]:
def string_to_number_directions(r, column):
    """Translate the link letter stored in ``r[column]`` into its integer code.

    Encoding: 'B' (blank) -> 1, 'N' -> 2, 'S' -> 3, 'E' -> 4, 'W' -> 5.

    Arguments:
        r: row-like object supporting ``r[column]`` (e.g. a DataFrame row).
        column: key of the entry to translate.

    Returns:
        The integer code, or ``None`` when the value matches none of the
        five letters.
    """
    letter_codes = (('B', 1), ('N', 2), ('S', 3), ('E', 4), ('W', 5))
    link_letter = r[column]
    for letter, code in letter_codes:
        if link_letter == letter:
            return code
    # Unrecognised values fall through, exactly like a missing ``else`` branch.
    return None

def apply_string_to_number_all_directions(df_in):
    """Return a copy of ``df_in`` with one numeric column per link direction.

    For the letter columns 'N', 'E', 'S' and 'W' the companion columns
    'n_number', 'e_number', 's_number' and 'w_number' are filled row by row
    using ``string_to_number_directions``. ``df_in`` itself is left untouched
    because all writes go to a copy.

    Arguments:
        df_in (DataFrame): plaquette table containing the columns N, E, S, W.

    Returns:
        DataFrame: the copy carrying the four additional ``*_number`` columns.
    """
    encoded = df_in.copy()
    column_pairs = (
        ('n_number', 'N'),
        ('e_number', 'E'),
        ('s_number', 'S'),
        ('w_number', 'W'),
    )
    for target_col, source_col in column_pairs:
        # Bind source_col as a default so each lambda keeps its own column.
        encoded[target_col] = encoded.apply(
            lambda r, c=source_col: string_to_number_directions(r, c), axis=1
        )
    return encoded

In [38]:
# Add the numeric n_number / e_number / s_number / w_number columns to the loaded frame.
df = apply_string_to_number_all_directions(df)

In [39]:
# Index by lattice coordinates so a plaquette can be looked up as df.loc[x, y].
# NOTE: in-place and not idempotent -- once 'x'/'y' have moved into the index,
# running this cell a second time raises KeyError.
df.set_index(['x', 'y'], inplace=True)

In [40]:
def determine_lattice_size(df_in):
    """Infer the linear size of a square lattice from its plaquette coordinates.

    The coordinates may live either in columns named 'x' and 'y' or in index
    levels of the same names (i.e. before or after ``set_index(['x', 'y'])``).
    Coordinates are assumed to be zero-based (0 .. size-1), as in the sample
    data, so the size is the largest coordinate plus one.

    Arguments:
        df_in (DataFrame): plaquette table of a single configuration.

    Returns:
        int: number of plaquettes along one side of the lattice.

    Raises:
        ValueError: if the frame is empty or the x and y extents differ.
        KeyError: if 'x' or 'y' is neither a column nor an index level.
    """
    def _coordinates(name):
        # Accept the coordinate as a regular column or as an index level.
        if name in df_in.columns:
            return df_in[name].values
        return df_in.index.get_level_values(name).values

    if len(df_in) == 0:
        raise ValueError('Cannot determine the lattice size of an empty frame')

    # Assume square lattice
    max_x = _coordinates('x').max()
    max_y = _coordinates('y').max()
    if max_x != max_y:
        raise ValueError('Expecting x==y dimensions')
    # Largest zero-based coordinate is size - 1.
    return int(max_x) + 1

In [41]:
def check_if_exists(cur_val, proposed_val, v=False):
    """Verify that a matrix slot can accept ``proposed_val``.

    A slot holding 0 counts as unset and accepts anything. A slot that is
    already set must hold exactly the proposed value.

    Arguments:
        cur_val: value currently stored in the slot.
        proposed_val: value about to be written to the slot.
        v (bool): when true, print both values before checking.

    Raises:
        ValueError: if the slot is set and differs from ``proposed_val``.
    """
    if v:
        debug_lines = (
            '  In check_if_exists\n',
            f'    cur_val {cur_val}',
            f'    proposed_val {proposed_val}',
        )
        for line in debug_lines:
            print(line)

    slot_is_unset = cur_val == 0
    if not slot_is_unset and cur_val != proposed_val:
        raise ValueError("Discovered inconsistancy in representation.")

In [42]:
def create_full_numerical_representation(df_in, l, v=False):
    """
    Creates a matrix of numbers that can be interpreted by a CNN auto encoder.

    Each plaquette contributes its four links (south, north, west, east) to a
    ``2l x 2l`` matrix. A link shared by two neighbouring plaquettes is written
    by both of them; before every write ``check_if_exists`` verifies that the
    slot is either still 0 or already holds the same value, which checks the
    plaquette representation for consistency.

    Arguments:
        df_in (DataFrame): plaquette representation of one configuration. It
            is looked up with ``.loc[x, y]``, so it has to be indexed by
            (x, y), and it needs the columns ``n_number``, ``e_number``,
            ``s_number`` and ``w_number`` (as added by
            ``apply_string_to_number_all_directions``).
        l (int): number of plaquettes along one side of the lattice.
        v (bool): when true, print the indices and the matrix after every
            plaquette.

    Returns:
        A ``2l x 2l`` numpy array of floats. Link slots hold the link codes
        read from the ``*_number`` columns; every slot that is never written
        keeps its initial 0.

    Raises:
        ValueError: raised by ``check_if_exists`` when a slot was already set
            to a different value.
    """
    df_working = df_in.copy()
    
    # l is supplied by the caller (the determine_lattice_size call is disabled).
    # Will return this matrix
    m = np.zeros([2 * l, 2 * l])
    for i in range(l):
        for j in range(l):
            # j is the x coordinate, i the y coordinate of the plaquette.
            cur_row = df_working.loc[j, i]
        
            # For all entries we check for consistency between the plaquettes,
            # e.g. the bottom (top) of the previous row of plaquettes against the
            # top (bottom) of the current row. These need to be the same; if they
            # are not, there is either a problem with the way the plaquettes are
            # written to file or with the algorithm generating the configurations.
            #
            # Row indices are negative, i.e. counted from the end of the matrix,
            # so y = 0 ends up in the last rows.
            horz_index_x = j * 2 + 1
            horz_index_y = -(i * 2) - 1
            vert_index_x = j * 2
            vert_index_y = -(i * 2 + 1) - 1
            if v:
                print(f'i (y): {i}')
                print(f'j (x): {j}')
                print(f'horz_index_x {horz_index_x}')
                print(f'horz_index_y {horz_index_y}')
                print(f'vert_index_x {vert_index_x}')
                print(f'vert_index_y {vert_index_y}')

            # horizontal: south link first, then the north link two rows further
            # on. The modulo makes the north link of the last plaquette row
            # (i = l - 1) land on the row used by the south links of i = 0.
            check_if_exists(m[horz_index_y, horz_index_x], cur_row['s_number'], v=v)
            m[horz_index_y, horz_index_x] = cur_row['s_number']
            check_if_exists(m[-((-horz_index_y + 2) % (2 * l)), horz_index_x], cur_row['n_number'], v=v)
            m[-((-horz_index_y + 2) % (2 * l)), horz_index_x] = cur_row['n_number']

            # vertical: west link first, then the east link two columns further
            # on. The modulo makes the east link of the last plaquette column
            # (j = l - 1) land on column 0, used by the west links of j = 0.
            check_if_exists(m[vert_index_y, vert_index_x], cur_row['w_number'], v=v)
            m[vert_index_y, vert_index_x] = cur_row['w_number']
            check_if_exists(m[vert_index_y, (vert_index_x + 2) % (2 * l)], cur_row['e_number'], v=v)
            m[vert_index_y, (vert_index_x + 2) % (2 * l)] = cur_row['e_number']
            if v:
                print('current m:\n')
                print(m)
    return m

In [43]:
# Encode the loaded configuration; pass v=True to print every intermediate step.
create_full_numerical_representation(df, L, v=False)

array([[2., 0., 2., 0., 1., 0., 2., 0.],
       [0., 4., 0., 1., 0., 1., 0., 4.],
       [2., 0., 1., 0., 1., 0., 3., 0.],
       [0., 1., 0., 4., 0., 5., 0., 1.],
       [2., 0., 2., 0., 2., 0., 1., 0.],
       [0., 5., 0., 5., 0., 1., 0., 5.],
       [2., 0., 2., 0., 3., 0., 3., 0.],
       [0., 1., 0., 1., 0., 4., 0., 1.]])

In [54]:
# Encode every saved configuration: bins 0, 100, ..., 999800 (9999 files).
matrix_list = []
for i in tqdm(range(0, 999900, 100)):
    # Per file: load, add the numeric link columns, index by (x, y),
    # then build the 2L x 2L matrix.
    current_df = pd.read_csv(os.path.join(BASE_PATH, f'plaquett_lattice_bin_{i}.csv'))
    current_df = apply_string_to_number_all_directions(current_df)
    current_df.set_index(['x', 'y'], inplace=True)
    current_matrix = create_full_numerical_representation(current_df, L)
    matrix_list.append(current_matrix)

100%|██████████| 9999/9999 [01:33<00:00, 106.59it/s]


In [55]:
# Number of configurations that were encoded.
len(matrix_list)

9999

In [56]:
# Stack the per-configuration matrices into one array (first axis = configuration).
all_data = np.array(matrix_list)

In [57]:
# Persist the stacked array; the relative path resolves against the current
# working directory of the kernel.
with open('pickle_data.pkl', 'wb') as f:
    pickle.dump(all_data, f)