In [45]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from matplotlib import colors as c
import pandas as pd

# Face Data Preprocessing

In [46]:
# Parse face label lines ("0" or "1", one per line) into integers
def get_face_labels(label_file):
    """Convert the lines of a face label file into a list of 0/1 integers.

    Parameters
    ----------
    label_file : iterable of str
        Lines of the label file, with or without their line terminator.

    Returns
    -------
    list of int
        1 for every line containing "1", 0 for every line containing "0".
        Lines holding anything else are skipped.
    """
    face_label = []
    for l in label_file:
        # Drop only the line terminator before comparing, so that a final line
        # without a trailing newline (or a CRLF-terminated line) is not
        # silently discarded.
        value = l.rstrip('\r\n')
        if value == '1':
            face_label.append(1)
        elif value == '0':
            face_label.append(0)
    return face_label


In [47]:
# Convert raw face image text into per-image binary pixel grids

def get_face_array(face_lines, face_label):
    """Turn the text lines of a face image file into nested 0/1 lists.

    Every '#' character becomes 1, newline characters are dropped and any
    other character becomes 0. The resulting rows are then split into
    len(face_label) consecutive groups (one group per label) and returned
    as plain nested Python lists.
    """
    binary_rows = [
        [1 if ch == '#' else 0 for ch in line if ch != '\n']
        for line in face_lines
    ]
    # One chunk of rows per label
    per_image = np.array_split(binary_rows, len(face_label))
    return np.array(per_image).tolist()

In [48]:
def get_face_features(face_array, image_shape=(70, 60), grid_shape=(10, 10)):
    """Count the "on" pixels of every image inside a regular grid of cells.

    Each image is cut into grid_shape[0] x grid_shape[1] equally sized cells
    and the number of pixels equal to 1 in each cell becomes one feature.
    With the defaults this is a 10x10 grid of 7x6-pixel cells over the
    top-left 70x60 pixels, i.e. 100 features per image.

    Parameters
    ----------
    face_array : sequence of 2-D sequences of int
        Images as nested lists (or an array); only the top-left
        image_shape region of each image is used.
    image_shape : tuple of int, optional
        (height, width) of the region to analyse.
    grid_shape : tuple of int, optional
        (rows, columns) of the feature grid.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n_images, rows * columns), cells in row-major
        order. An empty input yields an array of shape (0, rows * columns) so
        that column indexing still works downstream.

    Raises
    ------
    ValueError
        If image_shape is not an exact multiple of grid_shape.
    IndexError
        If the images are smaller than image_shape.
    """
    height, width = image_shape
    grid_rows, grid_cols = grid_shape
    if height % grid_rows or width % grid_cols:
        raise ValueError("image_shape must be an exact multiple of grid_shape")
    cell_h = height // grid_rows
    cell_w = width // grid_cols

    if len(face_array) == 0:
        return np.zeros((0, grid_rows * grid_cols))

    # Boolean mask of "on" pixels, restricted to the analysed region
    pixels = np.asarray(face_array)[:, :height, :width] == 1
    if pixels.shape[1:] != (height, width):
        raise IndexError("images are smaller than image_shape")

    # Split both image axes into (cell index, offset inside cell) and sum the
    # offsets away: one count per cell, without a Python loop over pixels.
    counts = pixels.reshape(-1, grid_rows, cell_h, grid_cols, cell_w).sum(axis=(2, 4))
    return counts.reshape(counts.shape[0], -1).astype(float)


### Training Data

In [49]:
# Load the training labels and the training image text, one list entry per line
with open('./data/facedata/facedatatrainlabels') as f:
    label_file = list(f)

with open('./data/facedata/facedatatrain') as f:
    face_lines = list(f)


In [50]:
# labels -> per-image pixel grids -> per-cell pixel counts
face_label = get_face_labels(label_file=label_file)
face_array = get_face_array(face_lines=face_lines, face_label=face_label)
face_features = get_face_features(face_array=face_array)

In [51]:
# Training table: raw image, label, then one G<i> column per feature
face_df = pd.DataFrame({'Image_data':face_array,'Label':face_label})
# Attach every feature column with a single concat. Inserting the columns one
# at a time fragments the frame and triggers pandas' PerformanceWarning once
# more than 100 blocks exist. The column count comes from the feature array
# itself rather than a hard-coded 100.
face_df = pd.concat(
    [
        face_df,
        pd.DataFrame(
            face_features,
            columns=['G'+str(i) for i in range(face_features.shape[1])],
        ),
    ],
    axis=1,
)
face_df.to_csv('./data/facedata/facetrainwithfeatures.csv', index=False)

### Validation Data

In [52]:
# Load the validation labels and the validation image text, one list entry per line
with open('./data/facedata/facedatavalidationlabels') as f:
    label_file = list(f)

with open('./data/facedata/facedatavalidation') as f:
    face_lines = list(f)

In [53]:
# labels -> per-image pixel grids -> per-cell pixel counts
face_label = get_face_labels(label_file=label_file)
face_array = get_face_array(face_lines=face_lines, face_label=face_label)
face_features = get_face_features(face_array=face_array)

In [54]:
# Validation table: raw image, label, then one G<i> column per feature
face_df = pd.DataFrame({'Image_data':face_array,'Label':face_label})
# Attach every feature column with a single concat. Inserting the columns one
# at a time fragments the frame and triggers pandas' PerformanceWarning once
# more than 100 blocks exist. The column count comes from the feature array
# itself rather than a hard-coded 100.
face_df = pd.concat(
    [
        face_df,
        pd.DataFrame(
            face_features,
            columns=['G'+str(i) for i in range(face_features.shape[1])],
        ),
    ],
    axis=1,
)
face_df.to_csv('./data/facedata/facevalidationwithfeatures.csv', index=False)

### Test Data

In [55]:
# Load the test labels and the test image text, one list entry per line

with open('./data/facedata/facedatatestlabels') as f:
    label_file = list(f)

with open('./data/facedata/facedatatest') as f:
    face_lines = list(f)

In [56]:
# labels -> per-image pixel grids -> per-cell pixel counts
face_label = get_face_labels(label_file=label_file)
face_array = get_face_array(face_lines=face_lines, face_label=face_label)
face_features = get_face_features(face_array=face_array)

In [57]:
# Test table: raw image, label, then one G<i> column per feature
face_df = pd.DataFrame({'Image_data':face_array,'Label':face_label})
# Attach every feature column with a single concat. Inserting the columns one
# at a time fragments the frame and triggers pandas' PerformanceWarning once
# more than 100 blocks exist. The column count comes from the feature array
# itself rather than a hard-coded 100.
face_df = pd.concat(
    [
        face_df,
        pd.DataFrame(
            face_features,
            columns=['G'+str(i) for i in range(face_features.shape[1])],
        ),
    ],
    axis=1,
)
face_df.to_csv('./data/facedata/facetestwithfeatures.csv', index=False)

# Digit Data Preprocessing

In [58]:
def display_image(data):
    """Render a 2-D grid of 0/1 values as a black-and-white image.

    Parameters
    ----------
    data : 2-D array-like
        Values to draw with a two-colour map (black, then white).

    The image is drawn with square cells, no tick marks, and the y axis
    inverted so that the first row of `data` appears at the top.
    """
    cMap = c.ListedColormap(['black', 'white'])
    # Work on one explicit Axes. Calling plt.axes() with no arguments adds a
    # NEW Axes to the figure, so the aspect/inversion settings would land on
    # an empty Axes drawn over the mesh instead of on the mesh itself.
    ax = plt.gca()
    ax.pcolormesh(data, cmap=cMap)
    ax.set_aspect('equal')
    ax.set_xticks([])
    ax.set_yticks([])
    ax.invert_yaxis()
    plt.show()

In [59]:
def convert_text_to_array(text):
    """Convert the text lines of one image into rows of 0/1 pixels.

    Parameters
    ----------
    text : sequence of str
        The lines of a single image.

    Returns
    -------
    list of list of int
        One list per line: a space becomes 0, a newline is dropped and any
        other character becomes 1. At most len(text[0]) characters of each
        line are read.
    """
    if len(text) == 0:
        return []

    # The first line sets how many characters are examined per row.
    width = len(text[0])
    temp_data = []
    for line in text:
        temp = []
        # Slicing instead of indexing up to `width`: a line shorter than the
        # first one (e.g. a final line without a trailing newline) no longer
        # raises IndexError.
        for ch in line[:width]:
            if ch == " ":
                temp.append(0)
            elif ch == "\n":
                continue
            else:
                temp.append(1)
        temp_data.append(temp)
    return temp_data

In [60]:
def Subgrid(i, j, k, l, grid):
    """Return rows i..j-1 of `grid`, each cut down to columns k..l-1."""
    window = []
    for line in grid[i:j]:
        window.append(line[k:l])
    return window

In [61]:
def generate_features(df, inner=4, outer=7):
    """Count the 1-pixels in every cell of an outer x outer grid, per image.

    The image is read from position 1 of each `df.itertuples()` record, i.e.
    the first column after the index. It is divided into `outer` x `outer`
    square cells of side `inner`.

    Returns a list of outer**2 lists. Entry `r * outer + c` holds, for every
    row of `df` in order, the number of 1 values in the cell at grid row `r`
    and grid column `c`.
    """
    cell_counts = [[] for _ in range(outer ** 2)]

    for record in df.itertuples():
        image = record[1]
        for block_row in range(outer):
            top = block_row * inner
            bottom = (block_row + 1) * inner
            for block_col in range(outer):
                left = block_col * inner
                right = (block_col + 1) * inner
                patch = Subgrid(top, bottom, left, right, image)
                ones = sum(line.count(1) for line in patch)
                cell_counts[block_row * outer + block_col].append(ones)

    return cell_counts

### Train Data

In [62]:
# Raw training digit images as text, one list entry per line
with open("./data/digitdata/trainingimages") as f:
    lines = list(f)

In [63]:
# Every image occupies 28 consecutive lines; a trailing partial image is ignored
train_data = [
    convert_text_to_array(lines[28 * n:28 * n + 28])
    for n in range(len(lines) // 28)
]

In [64]:
with open("./data/digitdata/traininglabels") as f:
    lables = list(f)
# The label is the first character of each line (kept as a string)
train_labels = [line[0] for line in lables]

In [65]:
# Pair every training image with its label and store the raw table
train_df = pd.DataFrame(data={'Image_data': train_data, 'Label': train_labels})
train_df.to_csv(path_or_buf="./data/digitdata/traindata.csv", index=False)

In [66]:
# 7x7 grid of 4x4-pixel cells -> 49 pixel-count lists for the training images
grid = generate_features(df=train_df, inner=4, outer=7)

In [67]:
# One column per grid cell: G0, G1, ...
for cell_index, cell_values in enumerate(grid):
    train_df['G' + str(cell_index)] = cell_values


In [68]:
# Persist the training images, labels and grid features together
train_df.to_csv(path_or_buf="./data/digitdata/fortyninetrainfeatures.csv", index=False)

### Validation Data

In [69]:
# Raw validation digit images as text, one list entry per line
with open("./data/digitdata/validationimages") as f:
    lines = list(f)

In [70]:
# Every image occupies 28 consecutive lines; a trailing partial image is ignored
validation_data = [
    convert_text_to_array(lines[28 * n:28 * n + 28])
    for n in range(len(lines) // 28)
]

In [71]:
with open("./data/digitdata/validationlabels") as f:
    lables = list(f)
# The label is the first character of each line (kept as a string)
validation_labels = [line[0] for line in lables]

In [72]:
# Pair every validation image with its label and store the raw table
validation_df = pd.DataFrame(data={'Image_data': validation_data, 'Label': validation_labels})
validation_df.to_csv(path_or_buf="./data/digitdata/validationdata.csv", index=False)

In [73]:
# 7x7 grid of 4x4-pixel cells -> 49 pixel-count lists for the validation images
grid = generate_features(df=validation_df, inner=4, outer=7)

In [74]:
# One column per grid cell: G0, G1, ...
for cell_index, cell_values in enumerate(grid):
    validation_df['G' + str(cell_index)] = cell_values


In [75]:
# Persist the validation images, labels and grid features together
validation_df.to_csv(path_or_buf="./data/digitdata/fortyninevalidationfeatures.csv", index=False)

### Test Data

In [76]:
# Raw test digit images as text, one list entry per line
with open("./data/digitdata/testimages") as f:
    lines = list(f)

In [77]:
# Every image occupies 28 consecutive lines; a trailing partial image is ignored
test_data = [
    convert_text_to_array(lines[28 * n:28 * n + 28])
    for n in range(len(lines) // 28)
]

In [78]:
with open("./data/digitdata/testlabels") as f:
    lables = list(f)
# The label is the first character of each line (kept as a string)
test_labels = [line[0] for line in lables]

In [79]:
# Pair every test image with its label and store the raw table
test_df = pd.DataFrame(data={'Image_data': test_data, 'Label': test_labels})
test_df.to_csv(path_or_buf="./data/digitdata/testdata.csv", index=False)

In [80]:
# 7x7 grid of 4x4-pixel cells -> 49 pixel-count lists for the test images
grid = generate_features(df=test_df, inner=4, outer=7)

In [81]:
# One column per grid cell: G0, G1, ...
for cell_index, cell_values in enumerate(grid):
    test_df['G' + str(cell_index)] = cell_values


In [82]:
# Persist the test images, labels and grid features together
test_df.to_csv(path_or_buf="./data/digitdata/fortyninetestfeatures.csv", index=False)