In [39]:
# Imports
from PIL import Image
import numpy as np
import pandas as pd
import os
import re
from datetime import datetime

In [42]:
# Goal: gather the name of every .tif image stored in the max-temperature folder.
# The folder is enumerated with os.listdir; the images themselves are opened
# with PIL in a later cell.

# The data directory is set by hand
max_directory_in_str = "/Users/sabhyachhabria/Desktop/GitHub/Weather-Instablity-Analysis/data/maxTempsData"
directory = os.fsencode(max_directory_in_str)

# Decode each directory entry to str, then keep only the names ending in ".tif".
# The order is whatever os.listdir returns (not chronological).
decoded_names = map(os.fsdecode, os.listdir(max_directory_in_str))
maxImageFiles = [name for name in decoded_names if name.endswith(".tif")]

In [43]:
# maxImageFiles is in directory-listing order, not chronological order.
# To sort the files by date we first need to pull the date out of each filename.
# Quick look at one arbitrary entry: characters 18 through 25 hold the date.
sample_filename = maxImageFiles[100]
print(sample_filename[18:26])

20090916


In [44]:
# Build maxDates: one datetime.date per entry of maxImageFiles, in the same order.
#
# Filenames look like "us.tmax_nohads_ll_20081020_float.tif". Instead of relying
# on fixed character offsets (18 through 26), search the whole filename for a
# standalone run of exactly 8 digits, so a different prefix length cannot
# silently produce a wrong date.
maxDates = []
for filename in maxImageFiles:
    # Lookarounds reject longer digit runs, so only an isolated YYYYMMDD matches
    match = re.search(r'(?<!\d)\d{8}(?!\d)', filename)
    if match is None:
        # Fail loudly with the offending name: skipping the file would break the
        # 1-1 pairing between maxImageFiles and maxDates.
        raise ValueError("No YYYYMMDD date found in filename: " + filename)
    # strptime raises ValueError if the 8 digits are not a valid calendar date
    d = datetime.strptime(match.group(), '%Y%m%d').date()
    maxDates.append(d)

In [51]:
# Sanity check: print the first five filenames next to their parsed dates to
# confirm the two lists line up one-to-one.
for i in range(5):
    print(f"{maxImageFiles[i]} --> {maxDates[i]}")

us.tmax_nohads_ll_20081020_float.tif --> 2008-10-20
us.tmax_nohads_ll_20060131_float.tif --> 2006-01-31
us.tmax_nohads_ll_20070517_float.tif --> 2007-05-17
us.tmax_nohads_ll_20060307_float.tif --> 2006-03-07
us.tmax_nohads_ll_20081216_float.tif --> 2008-12-16


In [56]:
# Combine the filenames and their dates into a single pandas DataFrame.
# Both lists are converted to NumPy arrays first.
np_maxImageFiles = np.asarray(maxImageFiles)
np_maxDates = np.asarray(maxDates)

# One column per array; row i pairs file i with its parsed date
max_file_columns = {
    'FileNames': np_maxImageFiles,
    'Dates': np_maxDates,
}
df_maxFiles = pd.DataFrame(max_file_columns)
df_maxFiles.head()

Unnamed: 0,FileNames,Dates
0,us.tmax_nohads_ll_20081020_float.tif,2008-10-20
1,us.tmax_nohads_ll_20060131_float.tif,2006-01-31
2,us.tmax_nohads_ll_20070517_float.tif,2007-05-17
3,us.tmax_nohads_ll_20060307_float.tif,2006-03-07
4,us.tmax_nohads_ll_20081216_float.tif,2008-12-16


In [65]:
# Put the rows in chronological order, then discard the old index so the
# DataFrame is numbered from 0 again.
df_maxFiles = df_maxFiles.sort_values(by='Dates').reset_index(drop=True)
df_maxFiles.head(10)

Unnamed: 0,FileNames,Dates
0,us.tmax_nohads_ll_20060101_float.tif,2006-01-01
1,us.tmax_nohads_ll_20060102_float.tif,2006-01-02
2,us.tmax_nohads_ll_20060103_float.tif,2006-01-03
3,us.tmax_nohads_ll_20060104_float.tif,2006-01-04
4,us.tmax_nohads_ll_20060105_float.tif,2006-01-05
5,us.tmax_nohads_ll_20060106_float.tif,2006-01-06
6,us.tmax_nohads_ll_20060107_float.tif,2006-01-07
7,us.tmax_nohads_ll_20060108_float.tif,2006-01-08
8,us.tmax_nohads_ll_20060109_float.tif,2006-01-09
9,us.tmax_nohads_ll_20060110_float.tif,2006-01-10


In [79]:
# The Dates column is no longer strictly needed now that the filenames are
# sorted, but it is kept in case it is useful later.
# Next step: open every image, in DataFrame (chronological) order.
# The filenames are bare names, so the working directory is switched to the
# folder that holds the images before opening them.
os.chdir("/Users/sabhyachhabria/Desktop/GitHub/Weather-Instablity-Analysis/data/maxTempsData/")

# NOTE(review): Image.open is presumably lazy and may keep one file handle open
# per image -- confirm the OS open-file limit is not an issue for this many files.
maxTIFImages = [Image.open(tif_name) for tif_name in df_maxFiles['FileNames']]

In [82]:
# Attach the opened images as a new 'Images' column. The Series gets a default
# 0..n-1 index, which pandas aligns with the DataFrame's index on assignment.
image_column = pd.Series(maxTIFImages)
df_maxFiles['Images'] = image_column

In [89]:
# Now we want to create a numpy array for each image with the pixel values
# Things may get a little complicated here