# Deep Learning Project

In the first step the setup for the project will be done. The necessary data is download from the offical SVHN dataset distribution side and is saved as pickle file.

In [1]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os, struct
import array
import sys
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
import scipy.io as sio
%matplotlib inline



In [2]:
#Code block for downloading the dataset from the SVHN site.
url = 'http://ufldl.stanford.edu/housenumbers/'
last_percent_reported = None

def download_progress_hook(count, blockSize, totalSize):
  """A hook to report the progress of a download. This is mostly intended for users with
  slow internet connections. Reports every 1% change in download progress.
  """
  global last_percent_reported
  percent = int(count * blockSize * 100 / totalSize)

  if last_percent_reported != percent:
    if percent % 5 == 0:
      sys.stdout.write("%s%%" % percent)
      sys.stdout.flush()
    else:
      sys.stdout.write(".")
      sys.stdout.flush()
      
    last_percent_reported = percent
        
def maybe_download(filename, expected_bytes, force=False):
  """Download a file if not present, and make sure it's the right size."""
  if force or not os.path.exists(filename):
    print('Attempting to download:', filename) 
    filename, _ = urlretrieve(url + filename, filename, reporthook=download_progress_hook)
    print('\nDownload Complete!')
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', filename)
  else:
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

In [3]:
"""
In this code block the file names of the SVHN download files are specified
and will be download. Also the size of the files will be checked to be 
correctly.
"""
train_filename = 'train.tar.gz'
test_filename = 'test.tar.gz'
extra_filename = 'extra.tar.gz'
train_32x32_filename = 'train_32x32.mat'
test_32x32_filename = 'test_32x32.mat'

maybe_download(train_filename, 404141560)
maybe_download(test_filename, 276555967)
maybe_download(extra_filename, 1955489752)
maybe_download(train_32x32_filename, 182040794)
maybe_download(test_32x32_filename, 64275384)
print("Data setup finished")

Attempting to download: train.tar.gz
0%....5%....10%....15%....20%....25%....30%....35%....40%....45%....50%....55%....60%....65%....70%....75%....80%....85%....90%....95%....100%
Download Complete!
Found and verified train.tar.gz
Attempting to download: test.tar.gz
0%....5%....10%....15%....20%....25%....30%....35%....40%....45%....50%....55%....60%....65%....70%....75%....80%....85%....90%....95%....100%
Download Complete!
Found and verified test.tar.gz
Attempting to download: extra.tar.gz
0%....5%....10%....15%....20%....25%....30%....35%....40%....45%....50%....55%....60%....65%....70%....75%....80%....85%....90%....95%....100%
Download Complete!
Found and verified extra.tar.gz
Attempting to download: train_32x32.mat
0%....5%....10%....15%....20%....25%....30%....35%....40%....45%....50%....55%....60%....65%....70%....75%....80%....85%....90%....95%....100%
Download Complete!
Found and verified train_32x32.mat
Attempting to download: test_32x32.mat
0%....5%....10%....15%....20%....

In [4]:
num_classes = 10
np.random.seed(133)


"""
This method unzips the download archive files.
"""
def maybe_extract(filename, force=False):
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
  if os.path.isdir(root) and not force:
    # You may override by setting force=True.
    print('%s already present - Skipping extraction of %s.' % (root, filename))
  else:
    print('Extracting data for %s. This may take a while. Please wait.' % root)
    tar = tarfile.open(filename)
    sys.stdout.flush()
    tar.extractall()
    tar.close()
  if not os.path.exists(root+'/digitstruct.mat'):
    print('digitstruct.mat / labeling of the dataset is missing')

  
maybe_extract(train_filename)
maybe_extract(test_filename)
maybe_extract(extra_filename)

Extracting data for train. This may take a while. Please wait.
digitstruct.mat / labeling of the dataset is missing
Extracting data for test. This may take a while. Please wait.
digitstruct.mat / labeling of the dataset is missing
Extracting data for extra. This may take a while. Please wait.
digitstruct.mat / labeling of the dataset is missing


In [5]:
"""
The downloaded mats will be checked to be correctly.
"""
def checkMat(mat):
    data = sio.loadmat(mat)
    Y = data['y']
    
    if not len(np.unique(Y) == num_classes):
        print('something with the num classes is broken')
    else:
        print('Everything okay with file ' +mat)
    

checkMat(train_32x32_filename)
checkMat(test_32x32_filename)


Everything okay with file train_32x32.mat
Everything okay with file test_32x32.mat
