### Import stuff

In [None]:
import os
import numpy as np
import pandas as pd
from torchvision.io import read_image
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt

### Write a custom Dataset

In [None]:
# Lookup table used for normalization: each character that may appear in the
# ASCII-art files is mapped to an evenly spaced value between -1 and 1.
val_arr = [chr(32), chr(35), chr(37), chr(42), chr(43), chr(45), chr(46), chr(58), chr(61), chr(64)]
lin = np.linspace(-1, 1, num=len(val_arr))

# Pair characters and values directly; the loop variable must not be called
# `lin`, otherwise the array is overwritten by its last element.
norm_dic = dict(zip(val_arr, lin))
norm_dic

def to_number(x, normalize=True):
    """Convert one ASCII-art character to a number.

    Args:
        x: A single-character string, or a missing value (``None`` or NaN).
        normalize: If true, look ``x`` up in the module-level ``norm_dic``;
            otherwise return its code point via ``ord``.

    Returns:
        The ``norm_dic`` entry or the integer code point of ``x``. A missing
        value yields ``-1.`` when normalizing and ``32`` (the code point of a
        space) otherwise.

    Raises:
        KeyError: If ``normalize`` is true and ``x`` is not a key of
            ``norm_dic``.
    """
    # Non-string input is only meaningful as a "missing" marker (None / NaN)
    is_missing = not isinstance(x, str) and (x is None or np.isnan(x))
    if is_missing:
        return -1. if normalize else 32
    return norm_dic[x] if normalize else ord(x)

class AsciiToDataset(Dataset):
    """Dataset that reads ASCII-art text files into fixed-size numeric arrays.

    The annotations file is loaded with ``pd.read_csv``; column 0 is used as
    the file name relative to ``img_dir`` and column 1 as the label.
    """

    def __init__(self, annotations_file, img_dir, transform=None, target_transform=None, height=40, width=80):
        """Store the annotation table and the conversion settings.

        Args:
            annotations_file: Path of the CSV file read with ``pd.read_csv``.
            img_dir: Directory that contains the text files.
            transform: Optional callable applied to the array of each sample.
            target_transform: Optional callable applied to each label.
            height: Number of rows of the array built per sample.
            width: Number of columns of the array built per sample.
        """
        self.img_labels = pd.read_csv(annotations_file)
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform
        self.height = height
        self.width = width

    def __len__(self):
        """Return the number of rows in the annotation table."""
        return len(self.img_labels)

    def __getitem__(self, idx, normalize=True):
        """Load sample ``idx`` and return it as ``(array, label)``.

        Args:
            idx: Row position in the annotation table.
            normalize: Forwarded to ``to_number``; also selects a float
                array (true) or an int array (false).

        Returns:
            A tuple of the ``height`` x ``width`` array (after ``transform``,
            if one was given) and the label (after ``target_transform``, if
            one was given). Cells not covered by the file keep the value 0.

        Raises:
            IndexError: If a character of the file lies outside the
                ``height`` x ``width`` grid.
        """
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])

        # Size comes from the constructor arguments rather than a fixed 40x80
        arr = np.zeros([self.height, self.width], dtype="float" if normalize else "int")

        # Walk the file line by line; the newline itself is not stored
        with open(img_path) as f:
            for row, line in enumerate(f):
                for col, ch in enumerate(line):
                    if ch != "\n":
                        arr[row][col] = to_number(ch, normalize)

        label = self.img_labels.iloc[idx, 1]

        # Apply the optional user-supplied transforms
        if self.transform is not None:
            arr = self.transform(arr)
        if self.target_transform is not None:
            label = self.target_transform(label)
        return arr, label

### Test the dataset

In [None]:
# Locations of the annotation CSV and of the folder with the ASCII-art files
annotation_file = "dataset/ascii.csv"
img_dir = "dataset/w80_h40_ascii"

# Build the dataset and wrap it in a loader; shuffling stays off so the
# batches are the same on every run while debugging
test_dataset = AsciiToDataset(annotations_file=annotation_file, img_dir=img_dir)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

# Fetch the first batch
train_features, train_labels = next(iter(test_dataloader))

In [None]:
# Shape of the first sample in the batch
print(train_features[0].shape)
# First row of the tenth sample in the batch
print(train_features[9][0])

### Sanity checks

In [None]:
# Keep the single file under its own name so that the directory path stored
# in `img_dir` is not overwritten by this check
sample_path = "dataset/w80_h40_ascii/109-ascii-art.txt"

# Show the raw lines of one file for comparison with the converted arrays
with open(sample_path) as f:
    lines = f.readlines()
lines

### Data Analysis

In [None]:
def to_int(x):
    """Return the code point of a character.

    Args:
        x: A single-character string, or a missing value (``None`` or NaN).

    Returns:
        ``ord(x)`` for a string; ``32`` (the code point of a space) when
        ``x`` is ``None`` or NaN.
    """
    # Non-string input is only meaningful as a "missing" marker (None / NaN)
    if not isinstance(x, str) and (x is None or np.isnan(x)):
        return 32
    return ord(x)


def txt_to_numpy(img_path, height=40, width=80):
    """Read an ASCII-art text file into a 2-D integer array of code points.

    Args:
        img_path: Path of the text file to read.
        height: Number of rows of the returned array.
        width: Number of columns of the returned array.

    Returns:
        An integer array of shape ``(height, width)``. Each character is
        converted with ``to_int``; cells not covered by the file stay 0.

    Raises:
        IndexError: If a character of the file lies outside the
            ``height`` x ``width`` grid.
    """
    arr = np.zeros([height, width], dtype="int")

    # Row/column positions come straight from enumerate; the newline that
    # ends each line is not stored
    with open(img_path) as f:
        for row, line in enumerate(f):
            for col, ch in enumerate(line):
                if ch != "\n":
                    arr[row][col] = to_int(ch)

    return arr

In [None]:
# contains 25840 images

img_dir = "dataset/w80_h40_ascii"

# Running totals: number of files seen, and how often each character code
# appears across all of them
count = 0
occurances = {}

for entry in os.listdir(img_dir):
    filepath = os.path.join(img_dir, entry)
    # anything that is not a regular file is ignored
    if not os.path.isfile(filepath):
        continue

    count += 1
    arr = txt_to_numpy(filepath)
    unique, counts = np.unique(arr, return_counts=True)

    # add this file's per-code counts to the overall tally
    for code, n_seen in zip(unique, counts):
        occurances[code] = occurances.get(code, 0) + n_seen

count
occurances

### Occurrences of different ASCII chars in all images

```
{32: 4541142,  = space
 35: 7109820,  = #
 37: 5135616,  = %
 42: 9916954,  = *
 43: 11377591, = +
 45: 10650962, = -
 46: 6516918,  = .
 58: 9094245,  = :
 61: 12265363, = =
 64: 6079389}  = @
```

### Plot data

In [None]:
# Character counts over all images (code point -> number of occurrences)
dic = {
    32: 4541142,
    35: 7109820,
    37: 5135616,
    42: 9916954,
    43: 11377591,
    45: 10650962,
    46: 6516918,
    58: 9094245,
    61: 12265363,
    64: 6079389,
}

number_of_pics = 25840
sum_of_chars = 82688000 # = num_of_pics*40*80

# Start from a zero count for every code from 32 to 63, then overlay the
# measured counts, so each code from 32 to 64 ends up with its own bar
padded = dict.fromkeys(range(32, 64), 0)
padded.update(dic)
dic = padded
xtick = dict(dic)

# One bar per code; the codes themselves serve as tick labels
bar_positions = range(len(dic))
plt.bar(bar_positions, list(dic.values()))
plt.xticks(range(33), xtick, rotation=90);

### Idea:
Map the character codes (which range from 32 to 64) to evenly spaced values between -1 and 1.
The best option might be `np.linspace`:

In [None]:
# Preview: ten evenly spaced values from -1 to 1, both endpoints included
np.linspace(-1, 1, num=10)

In [None]:
# Map each integer character code to an evenly spaced value between -1 and 1.
# NOTE: this rebinds `norm_dic` with integer keys, whereas `to_number` looks
# characters up by string key.
val_arr = [32, 35, 37, 42, 43, 45, 46, 58, 61, 64]
lin = np.linspace(-1, 1, num=len(val_arr))

# Pair codes and values directly; the loop variable must not be called `lin`,
# otherwise the array is overwritten by its last element.
norm_dic = dict(zip(val_arr, lin))
norm_dic