In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports and toolboxes needed for workflow
# System imports
import os
import sys

# Mathematical imports
import numpy as np
import math

# To make this notebook's output stable across runs
np.random.seed(42)

# Data structure imports
import csv
from collections import Counter # https://docs.python.org/2/library/collections.html
import re
import pandas as pd
from pandas import set_option
set_option('display.max_columns',400)


# Machine Learning and Statistical imports
import scipy
from sklearn.cluster import KMeans


# Computer Vision imports
import cv2
import PIL
from PIL import Image
from skimage.color import rgb2lab, deltaE_cie76


# Visualization imports
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib import image as img
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
plt.rcParams['figure.figsize'] = (100,100)

import seaborn as sns

from bokeh.io import push_notebook, show, output_notebook
from bokeh.models import ColumnDataSource, ColorBar
from bokeh.palettes import Spectral6
from bokeh.transform import linear_cmap
from bokeh.plotting import figure
output_notebook()

# Where to save the figures
PROJECT_ROOT_DIR = "."
IMAGE_ID = "EDA_Images"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "Images", IMAGE_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)


# Ignore useless warnings 
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore', DeprecationWarning)


In [2]:
%%time
mydir='/Users/peterflaming/Documents/GitHub/COMPS/Raw Data/'

def createFileList(myDir, format='.csv'):
    filelist = []
    print('\nCurrent Working Directory:\n', myDir, end='\n')
    
    for root, dirs, files in os.walk(myDir, topdown=False):
        for file in files:
            if file.endswith(format):
                fullName = os.path.join(root, file)
                filelist.append(fullName)
        print('\nList of SIC Data:\n', filelist, end='\n')
    return filelist

filelist = createFileList(mydir)


Current Working Directory:
 /Users/peterflaming/Documents/GitHub/COMPS/Raw Data/

List of SIC Data:
 ['/Users/peterflaming/Documents/GitHub/COMPS/Raw Data/SIC_8_advancesearch19810233385e1a613f38c47.csv', '/Users/peterflaming/Documents/GitHub/COMPS/Raw Data/SIC_3_advancesearch16953607965e1a5cf27817a.csv', '/Users/peterflaming/Documents/GitHub/COMPS/Raw Data/SIC_5_advancesearch8208422685e1a5e4281b83.csv', '/Users/peterflaming/Documents/GitHub/COMPS/Raw Data/SIC_6_advancesearch15668837465e1a60171bf79.csv', '/Users/peterflaming/Documents/GitHub/COMPS/Raw Data/~$SIC_0_advancesearch16784737975e1a5b2c8cbeb.csv', '/Users/peterflaming/Documents/GitHub/COMPS/Raw Data/SIC_1_advancesearch12906045365e1a5bb125324.csv', '/Users/peterflaming/Documents/GitHub/COMPS/Raw Data/SIC_0_advancesearch16784737975e1a5b2c8cbeb.csv', '/Users/peterflaming/Documents/GitHub/COMPS/Raw Data/SIC_4_advancesearch14170920915e1a5dbd6cddc.csv', '/Users/peterflaming/Documents/GitHub/COMPS/Raw Data/SIC_7_advancesearch54685958

In [3]:
%%time
# Sort and view the list of images
filelist.sort()
filelist

CPU times: user 13 µs, sys: 0 ns, total: 13 µs
Wall time: 16 µs


['/Users/peterflaming/Documents/GitHub/COMPS/Raw Data/SIC_0_advancesearch16784737975e1a5b2c8cbeb.csv',
 '/Users/peterflaming/Documents/GitHub/COMPS/Raw Data/SIC_1_advancesearch12906045365e1a5bb125324.csv',
 '/Users/peterflaming/Documents/GitHub/COMPS/Raw Data/SIC_2_advancesearch15616274825e1a5c405bce4.csv',
 '/Users/peterflaming/Documents/GitHub/COMPS/Raw Data/SIC_3_advancesearch16953607965e1a5cf27817a.csv',
 '/Users/peterflaming/Documents/GitHub/COMPS/Raw Data/SIC_4_advancesearch14170920915e1a5dbd6cddc.csv',
 '/Users/peterflaming/Documents/GitHub/COMPS/Raw Data/SIC_5_advancesearch8208422685e1a5e4281b83.csv',
 '/Users/peterflaming/Documents/GitHub/COMPS/Raw Data/SIC_6_advancesearch15668837465e1a60171bf79.csv',
 '/Users/peterflaming/Documents/GitHub/COMPS/Raw Data/SIC_7_advancesearch5468595875e1a60a2066b1.csv',
 '/Users/peterflaming/Documents/GitHub/COMPS/Raw Data/SIC_8_advancesearch19810233385e1a613f38c47.csv',
 '/Users/peterflaming/Documents/GitHub/COMPS/Raw Data/SIC_9_advancesearch62