In [1]:
#https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/udacity/1_notmnist.ipynb

import timeit
script_start_time = timeit.default_timer()

import imageio
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
#from sklearn.linear_model import LogisticRegression
from six.moves import cPickle as pickle
#from six.moves.urllib.request import urlretrieve
from urllib.request import urlretrieve

# ********** Logging settings

import logging

# Script-wide logger: every record is written both to 'logfile.log' and to stderr.
logger = logging.getLogger('notMNIST.dataSetting')

# logging.getLogger() returns the same object for the same name, so when this
# code is executed again in the same interpreter (e.g. re-running a notebook
# cell) the handlers from the previous run are still attached. Only attach
# handlers when none are present; otherwise every message would be emitted
# once per past execution.
if not logger.handlers:
  formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

  file_log_handler = logging.FileHandler('logfile.log')
  file_log_handler.setFormatter(formatter)
  logger.addHandler(file_log_handler)

  stderr_log_handler = logging.StreamHandler()
  stderr_log_handler.setFormatter(formatter)
  logger.addHandler(stderr_log_handler)

logger.setLevel('DEBUG')

def logInfo(*args):
  """Log all positional arguments, joined by single spaces, at INFO level."""
  message = concatenate(args)
  logger.info(message)

def logDebug(*args):
  """Log all positional arguments, joined by single spaces, at DEBUG level."""
  message = concatenate(args)
  logger.debug(message)
  
def logError(*args):
  """Log all positional arguments, joined by single spaces, at ERROR level."""
  message = concatenate(args)
  logger.error(message)

def concatenate(args):
  """Return the ``str()`` form of every item in ``args`` joined by one space."""
  return ' '.join(map(str, args))

# ********** End of Logging settings

# ************* DOWNLOAD

# Announce the download phase and start its timer.
logInfo('DOWNLOADING ...')
start_time = timeit.default_timer()

# Base URL; maybe_download appends the archive file name to it.
url = 'https://commondatastorage.googleapis.com/books1000/'
# Last percentage handled by download_progress_hook (None until its first report).
last_percent_reported = None
# Directory used for the downloaded archives, the extraction target and the final pickle.
data_root = '.' # Change me to store data elsewhere

def download_progress_hook(count, blockSize, totalSize):
  """Report download progress on stdout; used as the ``urlretrieve`` reporthook.

  Writes the percentage whenever it is a multiple of 5 and a dot for every
  other percentage change. Mostly intended for users with slow connections.

  Args:
    count: number of blocks transferred so far.
    blockSize: size of one block in bytes.
    totalSize: total size of the file in bytes. A non-positive value means
      the size is unknown; nothing is reported in that case.
  """
  global last_percent_reported

  # Without a usable total there is no percentage to compute: a total of 0
  # would divide by zero and a negative total would print negative progress.
  if totalSize <= 0:
    return

  # The last block is usually only partly filled, so count * blockSize can
  # overshoot the real size; never report more than 100%.
  percent = min(int(count * blockSize * 100 / totalSize), 100)

  if last_percent_reported != percent:
    if percent % 5 == 0:
      sys.stdout.write("%s%%" % percent)
    else:
      sys.stdout.write(".")
    sys.stdout.flush()

    last_percent_reported = percent
        
def maybe_download(filename, expected_bytes, force=False):
  """Fetch ``filename`` into ``data_root`` unless it is already there, then verify its size.

  Args:
    filename: name of the file, appended to the module-level ``url``.
    expected_bytes: exact size in bytes the local file must have.
    force: when true, download even if the local file already exists.

  Returns:
    The local path of the verified file.

  Raises:
    Exception: if the size of the local file differs from ``expected_bytes``.
  """
  target_path = os.path.join(data_root, filename)
  must_fetch = force or not os.path.exists(target_path)
  if must_fetch:
    logInfo('Attempting to download:', filename)
    urlretrieve(url + filename, target_path, reporthook=download_progress_hook)
    logInfo('\nDownload Complete!')

  actual_bytes = os.stat(target_path).st_size
  if actual_bytes != expected_bytes:
    raise Exception(
      'Failed to verify ' + target_path + '. Can you get to it with a browser?')

  logInfo('Found and verified', target_path)
  return target_path

# Fetch both archives (or reuse the local copies); the second argument is the
# exact byte size each file must have.
train_filename = maybe_download('notMNIST_large.tar.gz', 247336696)
test_filename = maybe_download('notMNIST_small.tar.gz', 8458043)


logInfo('Maybe downloaded ({:f} sec).'.format(timeit.default_timer() - start_time))

# ************* EXTRACT

# Announce the extraction phase and restart the phase timer.
logInfo('EXTRACTING ...')
start_time = timeit.default_timer()


# Number of class folders maybe_extract requires inside each extracted archive.
num_classes = 10
# Seed NumPy's global random generator so the random draws made later in the
# script are repeatable.
np.random.seed(133)

def maybe_extract(filename, force=False):
  """Extract a ``.tar.gz`` archive into ``data_root`` and list its class folders.

  Extraction is skipped when the target directory (the archive path without
  its two extensions) already exists, unless ``force`` is true.

  Args:
    filename: path of the ``.tar.gz`` archive.
    force: when true, extract even if the target directory already exists.

  Returns:
    The sorted list of sub-directory paths found in the extracted root.

  Raises:
    Exception: if the number of sub-directories differs from ``num_classes``.
  """
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
  if os.path.isdir(root) and not force:
    # You may override by setting force=True.
    logInfo('%s already present - Skipping extraction of %s.' % (root, filename))
  else:
    logInfo('Extracting data for %s. This may take a while. Please wait.' % root)
    # The context manager closes the archive even when extraction fails
    # half-way (corrupt archive, disk full, ...).
    with tarfile.open(filename) as tar:
      sys.stdout.flush()
      # NOTE(review): extractall() trusts the member paths stored in the
      # archive; only use it on archives from a trusted source.
      tar.extractall(data_root)
  data_folders = [
    os.path.join(root, d) for d in sorted(os.listdir(root))
    if os.path.isdir(os.path.join(root, d))]
  if len(data_folders) != num_classes:
    raise Exception(
      'Expected %d folders, one per class. Found %d instead.' % (
        num_classes, len(data_folders)))
  logInfo(data_folders)
  return data_folders
  
# Unpack both archives (maybe_extract skips this when the folder already
# exists) and collect the per-class folder paths.
train_folders = maybe_extract(train_filename)
test_folders = maybe_extract(test_filename)


logInfo('Maybe extracted ({:f} sec).'.format(timeit.default_timer() - start_time))

# ************* PICKLE

# Announce the pickling phase and restart the phase timer.
logInfo('PICKLING ...')
start_time = timeit.default_timer()



# Image constants used by load_letter when validating and normalizing images.
image_size = 28  # Pixel width and height.
pixel_depth = 255.0  # Number of levels per pixel.

def load_letter(folder, min_num_images):
  """Read every image of one class folder into a single float32 array.

  Each image is converted to float and normalized as
  ``(pixel - pixel_depth / 2) / pixel_depth``. Files whose reading raises
  IOError or ValueError are logged and skipped.

  Args:
    folder: directory containing the images of one letter.
    min_num_images: minimum number of successfully loaded images required.

  Returns:
    Array of shape (loaded_images, image_size, image_size), dtype float32.

  Raises:
    Exception: if an image does not have shape (image_size, image_size), or
      if fewer than ``min_num_images`` images were loaded.
  """
  file_names = os.listdir(folder)
  # Allocate for the worst case (every file readable); trimmed after the loop.
  dataset = np.ndarray(shape=(len(file_names), image_size, image_size),
                       dtype=np.float32)
  logInfo(folder)

  expected_shape = (image_size, image_size)
  half_depth = pixel_depth / 2
  loaded = 0
  for file_name in file_names:
    image_path = os.path.join(folder, file_name)
    try:
      pixels = imageio.imread(image_path).astype(float)
      normalized = (pixels - half_depth) / pixel_depth
      if normalized.shape != expected_shape:
        raise Exception('Unexpected image shape: %s' % str(normalized.shape))
      dataset[loaded, :, :] = normalized
      loaded += 1
    except (IOError, ValueError) as e:
      logInfo('Could not read:', image_path, ':', e, '- it\'s ok, skipping.')

  # Keep only the rows that were actually filled.
  dataset = dataset[:loaded, :, :]
  if loaded < min_num_images:
    raise Exception('Many fewer images than expected: %d < %d' %
                    (loaded, min_num_images))

  logInfo('Full dataset tensor:', dataset.shape)
  logInfo('Mean:', np.mean(dataset))
  logInfo('Standard deviation:', np.std(dataset))
  return dataset
        
def maybe_pickle(data_folders, min_num_images_per_class, force=False):
  """Pickle the images of each class folder to ``<folder>.pickle``.

  A folder whose pickle already exists is skipped unless ``force`` is true.
  A failure while writing a pickle is logged, not raised, and the file name
  is still included in the result.

  Args:
    data_folders: iterable of class folder paths.
    min_num_images_per_class: forwarded to ``load_letter`` as the minimum
      number of images each class must provide.
    force: when true, rebuild pickles that already exist.

  Returns:
    List of pickle file names, one per folder, in input order.
  """
  pickle_paths = []
  for class_folder in data_folders:
    pickle_path = class_folder + '.pickle'
    pickle_paths.append(pickle_path)

    if not force and os.path.exists(pickle_path):
      # You may override by setting force=True.
      logInfo('%s already present - Skipping pickling.' % pickle_path)
      continue

    logInfo('Pickling %s.' % pickle_path)
    letter_images = load_letter(class_folder, min_num_images_per_class)
    try:
      with open(pickle_path, 'wb') as out:
        pickle.dump(letter_images, out, pickle.HIGHEST_PROTOCOL)
    except Exception as e:
      logError('Unable to save data to', pickle_path, ':', e)

  return pickle_paths

# Build (or reuse) one pickle per class folder; the second argument is the
# minimum number of loadable images required for each class.
train_datasets = maybe_pickle(train_folders, 45000)
test_datasets = maybe_pickle(test_folders, 1800)

logInfo('Maybe pickled ({:f} sec).'.format(timeit.default_timer() - start_time))

# *************** data check

# Announce the data-check phase and restart the phase timer.
logInfo('DATA CHECKING ...')
start_time = timeit.default_timer()

def check_datasets(pickle_file):
  """Log per-image statistics of one pickled letter set and display a sample.

  Logs the minimum and maximum of the per-image means and standard
  deviations, then shows one randomly chosen image with matplotlib.

  Args:
    pickle_file: path of a pickle holding a stack of images.

  Raises:
    Exception: any error raised while loading or processing is logged and
      re-raised.
  """
  try:
    with open(pickle_file, 'rb') as f:
      letter_set = pickle.load(f)

    # Reduce over the two image axes so that there is one value per image.
    per_image_axes = (1, 2)
    means = np.mean(letter_set, axis=per_image_axes)
    stds = np.std(letter_set, axis=per_image_axes)

    logInfo('Mean for ', pickle_file, ' : min ', np.min(means), ' max ', np.max(means))
    logInfo('Standard deviation for ', pickle_file, ' : min ', np.min(stds), ' max ', np.max(stds))

    # Display a random image.
    # The values of the sample image should probably be adjusted according to
    # https://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.imshow
    # "MxN arrays are mapped to colors based on the norm (mapping scalar to
    # scalar) and the cmap (mapping the normed scalar to a color)"
    random_index = np.random.randint(len(letter_set))
    plt.figure()
    plt.imshow(letter_set[random_index, :, :])
    plt.show()

    # Plot means and standard deviations for all letters in the file:
    #meansAndStds = np.vstack([means, stds])
    #plt.plot(range(0, len(letter_set)), meansAndStds.T, linewidth=2)
    #plt.show()

    # Write means and standard deviations for all letters in the file:
    #for letter in letter_set:
      #logInfo('Mean:', np.mean(letter))
      #logInfo('Standard deviation:', np.std(letter))

  except Exception as e:
    logError('Unable to process data from', pickle_file, ':', e)
    raise
    

#check_datasets(train_datasets[0])
#check_datasets(test_datasets[0])

def check_dataset_shape(datasets, dataset_type):
  """Log the array shape stored in each pickle file.

  Args:
    datasets: iterable of pickle file paths.
    dataset_type: label prefix used in the log line (e.g. "train").
  """
  for index, pickle_path in enumerate(datasets):
    with open(pickle_path, 'rb') as handle:
      letters = pickle.load(handle)
    logInfo(dataset_type + "_datasets[" + str(index) + "] shape:", letters.shape)

#check_dataset_shape(train_datasets, "train")
#check_dataset_shape(test_datasets, "test")

# Report how long the data-check phase took.
logInfo('Maybe data checked ({:f} sec).'.format(timeit.default_timer() - start_time))

# **************  MERGE DATASETS

# Announce the merge phase and restart the phase timer.
logInfo('MERGING DATASETS ...')
start_time = timeit.default_timer()


def make_arrays(nb_rows, img_size):
  """Allocate uninitialized image and label buffers.

  Args:
    nb_rows: number of samples; a falsy value (0, None) means "no buffers".
    img_size: width and height of each square image.

  Returns:
    ``(dataset, labels)`` where dataset is float32 with shape
    (nb_rows, img_size, img_size) and labels is int32 with shape (nb_rows,),
    or ``(None, None)`` when ``nb_rows`` is falsy. The array contents are not
    initialized.
  """
  if not nb_rows:
    return None, None
  images = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
  targets = np.ndarray(nb_rows, dtype=np.int32)
  return images, targets

def merge_datasets(pickle_files, train_size, valid_size=0):
  """Build class-balanced training (and optional validation) sets.

  Every pickle file holds the images of one class; its position in
  ``pickle_files`` is used as the label. Each class is shuffled, then its
  first ``valid_size // num_classes`` images go to the validation set and the
  next ``train_size // num_classes`` images go to the training set.

  Args:
    pickle_files: list of per-class pickle file paths.
    train_size: requested total number of training samples.
    valid_size: requested total number of validation samples (0 = none).

  Returns:
    ``(valid_dataset, valid_labels, train_dataset, train_labels)``. The
    validation entries are None when ``valid_size`` is 0. When a requested
    size is not a multiple of the number of classes, the corresponding arrays
    are trimmed to the largest multiple that fits, so that every returned row
    holds real data.

  Raises:
    Exception: any error raised while loading or slicing a class is logged
      and re-raised.
  """
  num_classes = len(pickle_files)
  valid_dataset, valid_labels = make_arrays(valid_size, image_size)
  train_dataset, train_labels = make_arrays(train_size, image_size)
  vsize_per_class = valid_size // num_classes
  tsize_per_class = train_size // num_classes

  start_v, start_t = 0, 0
  end_v, end_t = vsize_per_class, tsize_per_class
  end_l = vsize_per_class+tsize_per_class
  for label, pickle_file in enumerate(pickle_files):
    try:
      with open(pickle_file, 'rb') as f:
        letter_set = pickle.load(f)
        # let's shuffle the letters to have random validation and training set
        np.random.shuffle(letter_set)
        if valid_dataset is not None:
          valid_letter = letter_set[:vsize_per_class, :, :]
          valid_dataset[start_v:end_v, :, :] = valid_letter
          valid_labels[start_v:end_v] = label
          start_v += vsize_per_class
          end_v += vsize_per_class

        # Training samples come right after the ones used for validation.
        train_letter = letter_set[vsize_per_class:end_l, :, :]
        train_dataset[start_t:end_t, :, :] = train_letter
        train_labels[start_t:end_t] = label
        start_t += tsize_per_class
        end_t += tsize_per_class
    except Exception as e:
      logError('Unable to process data from', pickle_file, ':', e)
      raise

  # The buffers are allocated uninitialized with the requested sizes, but only
  # size_per_class * num_classes rows are written. Drop the unwritten tail so
  # no garbage samples/labels are returned when a size is not a multiple of
  # the number of classes.
  filled_v = vsize_per_class * num_classes
  if valid_dataset is not None and filled_v < valid_size:
    valid_dataset = valid_dataset[:filled_v]
    valid_labels = valid_labels[:filled_v]
  filled_t = tsize_per_class * num_classes
  if train_dataset is not None and filled_t < train_size:
    train_dataset = train_dataset[:filled_t]
    train_labels = train_labels[:filled_t]

  return valid_dataset, valid_labels, train_dataset, train_labels
            
# Initial values
# train_size = 200000
# valid_size = 10000
# test_size = 10000

# Acceptable values according to the "rule of 30". To be reviewed: should the
# rule be applied to the training, validation or test set?
train_size = 100000
valid_size = 30000
test_size = 10000

# train_size = 5000
# valid_size = 1000
# test_size = 1000

# Training and validation samples both come from the large archive; the test
# samples come from the small one.
valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
  train_datasets, train_size, valid_size)
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

# For each split, log its shape and the range of the per-image mean and
# standard deviation (reduction over the two image axes).
logInfo('Training:', train_dataset.shape, train_labels.shape)
training_means = np.mean(train_dataset, axis=tuple(range(1, 3)))
training_stds = np.std(train_dataset, axis=tuple(range(1, 3)))
logInfo('Training Mean: min ', np.min(training_means), ' max ', np.max(training_means))
logInfo('Training Standard deviation: min ', np.min(training_stds), ' max ', np.max(training_stds))

logInfo('Validation:', valid_dataset.shape, valid_labels.shape)
valid_means = np.mean(valid_dataset, axis=tuple(range(1, 3)))
valid_stds = np.std(valid_dataset, axis=tuple(range(1, 3)))
logInfo('Validation Mean: min ', np.min(valid_means), ' max ', np.max(valid_means))
logInfo('Validation Standard deviation: min ', np.min(valid_stds), ' max ', np.max(valid_stds))

logInfo('Testing:', test_dataset.shape, test_labels.shape)
test_means = np.mean(test_dataset, axis=tuple(range(1, 3)))
test_stds = np.std(test_dataset, axis=tuple(range(1, 3)))
# These two lines report the test split, so they are labelled as such.
logInfo('Testing Mean: min ', np.min(test_means), ' max ', np.max(test_means))
logInfo('Testing Standard deviation: min ', np.min(test_stds), ' max ', np.max(test_stds))

logInfo('Dataset merged ({:f} sec).'.format(timeit.default_timer() - start_time))

# ******* RANDOMIZE

# Announce the randomization phase and restart the phase timer.
logInfo('RANDOMIZING ...')
start_time = timeit.default_timer()

def randomize(dataset, labels):
  """Return copies of ``dataset`` and ``labels`` shuffled with one shared permutation.

  Args:
    dataset: 3-D array indexed by sample on its first axis.
    labels: 1-D array with one label per sample.

  Returns:
    ``(shuffled_dataset, shuffled_labels)``; row i of both results comes from
    the same original sample.
  """
  order = np.random.permutation(labels.shape[0])
  return dataset[order, :, :], labels[order]
# Shuffle every split; within a split, images and labels are reordered with
# the same permutation so they stay aligned.
train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)

logInfo('Randomized ({:f} sec).'.format(timeit.default_timer() - start_time))

# *************** data check after randomizing

# Announce the post-randomization check phase and restart the phase timer.
logInfo('DATA CHECKING AFTER RANDOMIZING...')
start_time = timeit.default_timer()

def check_datasets_after_rand(dataset, labels, dataset_type):
  """Log the shapes of a split and display one random sample with its letter.

  Args:
    dataset: 3-D array of images indexed by sample on the first axis.
    labels: 1-D array of integer labels; label k is shown as chr(65 + k).
    dataset_type: label prefix used in the log lines (e.g. "train").
  """
  logInfo(dataset_type + '_dataset shape: ', dataset.shape)
  logInfo(dataset_type + '_labels shape: ', labels.shape)

  # Display a random image.
  # The values of the sample image should probably be adjusted according to
  # https://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.imshow
  # "MxN arrays are mapped to colors based on the norm (mapping scalar to
  # scalar) and the cmap (mapping the normed scalar to a color)"
  chosen = np.random.randint(len(labels))

  letter = chr(65 + labels[chosen])
  logInfo('Displaying label: ', letter)

  plt.figure()
  plt.imshow(dataset[chosen, :, :])
  plt.show()


# check_datasets_after_rand(train_dataset, train_labels , "train")
# check_datasets_after_rand(valid_dataset, valid_labels , "valid")
# check_datasets_after_rand(test_dataset, test_labels , "test")

def from_3d_to_1d(x):
    """View each 2-D item of a 3-D array as one opaque ``np.void`` element.

    The array is flattened to (items, rows * cols) and made contiguous; every
    row is then reinterpreted as a single void scalar spanning the bytes of
    that row, so whole images can be compared as single values.

    Args:
        x: 3-D array.

    Returns:
        1-D array of length ``x.shape[0]`` with a void dtype whose item size
        is the byte size of one flattened item.
    """
    flat = x.reshape((x.shape[0], x.shape[1] * x.shape[2]))
    flat = np.ascontiguousarray(flat)
    row_bytes = flat.dtype.itemsize * flat.shape[1]
    row_dtype = np.dtype((np.void, row_bytes))
    return flat.view(row_dtype).ravel()


def count_overlapping_items(x, y):
  """Count the items of ``x`` that also occur, byte for byte, in ``y``.

  Both arrays are converted with ``from_3d_to_1d`` so that every 2-D item is
  compared as a single value.

  Args:
    x: 3-D array whose items are looked up.
    y: 3-D array that is searched.

  Returns:
    Number of items of ``x`` (duplicates counted individually) that have an
    identical item in ``y``.
  """
  # np.isin replaces np.in1d, which is deprecated as of NumPy 2.0; for 1-D
  # input both return the same boolean membership mask.
  found_in_y = np.isin(from_3d_to_1d(x), from_3d_to_1d(y))
  return int(np.count_nonzero(found_in_y))
  

# logInfo('Overlapping examples in train_dataset', count_overlapping_items(train_dataset, train_dataset))
# logInfo('Overlapping examples in valid_dataset', count_overlapping_items(valid_dataset, valid_dataset))
# logInfo('Overlapping examples in test_dataset', count_overlapping_items(test_dataset, test_dataset))

# logInfo('Examples in training found in validation', count_overlapping_items(train_dataset, valid_dataset))
# logInfo('Examples in validation found in training', count_overlapping_items(valid_dataset, train_dataset))
# logInfo('Examples in training found in test', count_overlapping_items(train_dataset, test_dataset))
# logInfo('Examples in test found in training', count_overlapping_items(test_dataset, train_dataset))
# logInfo('Examples in validation found in test', count_overlapping_items(valid_dataset, test_dataset))
# logInfo('Examples in test found in validation', count_overlapping_items(test_dataset, valid_dataset))


logInfo('Maybe data checked after randomization ({:f} sec).'.format(timeit.default_timer() - start_time))

# ******* SAVE IN ONE FILE

# Announce the save phase and restart the phase timer.
logInfo('SAVING IN ONE FILE ...')
start_time = timeit.default_timer()


# All six arrays are stored in one dictionary in a single pickle file.
pickle_file = os.path.join(data_root, 'notMNIST.pickle')

try:
  # The context manager closes the file even when pickling fails, so no open
  # handle is left behind on error.
  with open(pickle_file, 'wb') as f:
    save = {
      'train_dataset': train_dataset,
      'train_labels': train_labels,
      'valid_dataset': valid_dataset,
      'valid_labels': valid_labels,
      'test_dataset': test_dataset,
      'test_labels': test_labels,
      }
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
  logError('Unable to save data to', pickle_file, ':', e)
  raise

# Log the size in bytes of the file that was just written.
statinfo = os.stat(pickle_file)
logInfo('Compressed pickle size:', statinfo.st_size)


logInfo('Saved in one file ({:f} sec).'.format(timeit.default_timer() - start_time))


# Total run time, measured from the timer started at the top of the script.
logInfo('Script execution time: {:f} sec.'.format(timeit.default_timer() - script_start_time))





2018-03-04 00:14:13,150 - notMNIST.dataSetting - INFO - DOWNLOADING ...
2018-03-04 00:14:13,152 - notMNIST.dataSetting - INFO - Found and verified ./notMNIST_large.tar.gz
2018-03-04 00:14:13,154 - notMNIST.dataSetting - INFO - Found and verified ./notMNIST_small.tar.gz
2018-03-04 00:14:13,156 - notMNIST.dataSetting - INFO - Maybe downloaded (0.004346 sec).
2018-03-04 00:14:13,157 - notMNIST.dataSetting - INFO - EXTRACTING ...
2018-03-04 00:14:13,159 - notMNIST.dataSetting - INFO - ./notMNIST_large already present - Skipping extraction of ./notMNIST_large.tar.gz.
2018-03-04 00:14:13,161 - notMNIST.dataSetting - INFO - ['./notMNIST_large/A', './notMNIST_large/B', './notMNIST_large/C', './notMNIST_large/D', './notMNIST_large/E', './notMNIST_large/F', './notMNIST_large/G', './notMNIST_large/H', './notMNIST_large/I', './notMNIST_large/J']
2018-03-04 00:14:13,162 - notMNIST.dataSetting - INFO - ./notMNIST_small already present - Skipping extraction of ./notMNIST_small.tar.gz.
2018-03-04 00:1