In [33]:
import sys
import os
import pandas as pd
import numpy as np
import math
import exiftool
import glob
import re
import base64
import sqlalchemy
from sqlalchemy.dialects import postgresql
from ast import literal_eval

from sqlalchemy import create_engine
# Connection URL is filled in from the PGUSER / PGPASSWORD environment variables.
db_url = 'postgresql://%(PGUSER)s:%(PGPASSWORD)s@localhost:5432/work' % os.environ
engine = create_engine(db_url)

# Widen the pandas display limits so wide/long frames are shown without wrapping.
for option_name, option_value in (
    ('display.expand_frame_repr', False),
    ('display.max_rows', 2000),
    ('display.max_columns', 2000),
):
    pd.set_option(option_name, option_value)

# Data import from CSVs to Postgres

In [34]:
# Metadata tags that may carry a free-text comment or description for a file.
# Columns named here are always kept, regardless of how sparsely they are filled.
COMMENT_COLS = {
    "EXIF:ImageDescription",
    "EXIF:UserComment",
    "EXIF:XPComment",
    "File:Comment",
    "IPTC:Caption-Abstract",
    "QuickTime:Description",
    "XMP:Description",
    "XMP:UserComment",
}

Read the metadata files created by the `local_photos.py` process.

In [35]:
# Load every metadata CSV and stack them into a single raw frame.
files = glob.glob("data/image-metadata-*")
if not files:
    # Fail with a clear message rather than pd.concat's "No objects to concatenate".
    raise FileNotFoundError("no files match data/image-metadata-*")
dfs = [pd.read_csv(f, low_memory=False) for f in files]
df_raw = pd.concat(dfs, ignore_index=True, sort=False)

# Non-null count per column; used below to decide which columns to keep.
df_raw_summary = df_raw.notnull().sum()
row_count = df_raw.shape[0]

keep_cols = []
# Iterate df_raw_summary (the name `df_summary` is never defined in this notebook)
# and use Series.items(), since Series.iteritems() was removed in pandas 2.0.
for k, v in df_raw_summary.items():
    cnt = float(v)
    # Keep a column when it is non-null in more than 10% of rows and is not a
    # warning/unnamed column, or it is one of COMMENT_COLS, or it is a
    # date/width/height/size column.
    # NOTE(review): re.match only matches at the start of the name, so tags with a
    # group prefix (e.g. "EXIF:...") never hit these patterns - confirm whether
    # re.search was intended.
    if (cnt/row_count > 0.1 and not re.match(r'(warning|unname)', k.lower())) \
            or (k in COMMENT_COLS) \
            or re.match(r'(date|width|height|size)', k.lower()):
        keep_cols.append(k)


Create a data frame suitable for import into a database, filtered to the selected columns.

In [36]:
# Start from the kept columns and rename them to lower-case names in which every
# run of characters outside a-z / 0-9 becomes a single underscore.
df_import = df_raw[keep_cols].copy()
df_import.columns = (
    df_import.columns
    .str.lower()
    .str.replace('[^a-z0-9]+', '_', regex=True)
)

# Order the columns alphabetically, then append the combined path column last.
ordered_cols = sorted(df_import.columns)
df_import = df_import[ordered_cols].copy()
df_import['file_path'] = df_import['file_directory'] + '/' + df_import['file_filename']

# Strip backslashes from every string value in the frame.
df_import = df_import.replace(to_replace= r'\\', value= '', regex=True)

# Thumbnail columns whose name ends the match on a digit hold Python-literal
# sequences stored as text: parse them and record an explicit Postgres array type.
dtypes = {}
thumb_cols = [name for name in df_import.columns if re.match(r'thumb.*[0-9]', name)]
for name in thumb_cols:
    df_import[name] = df_import[name].apply(literal_eval)
    dtypes[name] = postgresql.ARRAY(sqlalchemy.types.SMALLINT)

Create a small version of the postgres table in a database. We will subsequently use a csv and bulk `COPY` for better speed and debuggability.

In [37]:
# Write only the first ten rows so the table and its column types get created;
# `dtypes` supplies the explicit types for the array columns.
# if_exists="fail" makes this raise when photo_file_new already exists.
df_import.head(10).to_sql(
    name='photo_file_new',
    con=engine,
    if_exists="fail",
    index=False,
    dtype=dtypes,
    method="multi",
)

10

Create a copy of the postgres-typed dataframe so that we can export to a csv that is ready for a bulk copy to postgres.

In [38]:
# Reshape the data for CSV loading: work on a copy so df_import stays untouched.
df_export = df_import.copy()
thumb_cols = [name for name in df_export.columns if re.match(r'thumb.*[0-9]', name)]
for name in thumb_cols:
    # Render each sequence as a brace-wrapped, comma-separated string, e.g. {1,2,3}.
    joined = df_export[name].apply(lambda values: ','.join(map(str, values)))
    df_export[name] = ('{' + joined + '}').astype(str)

# Tab-separated, no header, no index, missing values written as empty strings.
df_export.to_csv("photo_file_new.txt", sep='\t', header=False, na_rep='', index=False)

Import into postgres by running this command on the command line, outside of the notebook.

`cat photo_file_new.txt | psql -U postgres -d work -c "COPY photo_file_new from STDIN WITH NULL ''"`

# Bring Google Photos Metadata into Postgres

In [None]:
# lines=True: the file is read as one JSON object per line.
google_photos_path = "google-photos.jsonline"
df_goog = pd.read_json(google_photos_path, lines=True)

In [None]:
# Pull the fields of interest out of the nested mediaMetadata dicts into
# top-level columns, then drop the nested column itself.
media = df_goog["mediaMetadata"]

df_goog['creation_time'] = pd.to_datetime(media.apply(lambda meta: meta['creationTime']))

# width / height are required keys and are converted to int.
for dim in ('width', 'height'):
    df_goog[dim] = media.apply(lambda meta, dim=dim: int(meta[dim]))

# Camera details live under the optional "photo" sub-dict; missing values become None.
for column, key in (('camera_make', 'cameraMake'), ('camera_model', 'cameraModel')):
    df_goog[column] = media.apply(lambda meta, key=key: meta.get("photo", {}).get(key))

df_goog.drop(columns="mediaMetadata", inplace=True)

In [None]:
# Create and fill the photo_google table; if_exists="fail" raises when it already exists.
# `index` is left at its default, so the frame's index is written as a column too.
df_goog.to_sql(
    name='photo_google',
    con=engine,
    if_exists="fail",
    method="multi",
)

## Cleanup, analysis, research

In [None]:
# looking for longitude columns (any column whose name contains 'long')
# NOTE(review): the original referenced `df_final`, which this notebook never
# defines; df_import is assumed to be the intended frame - confirm.
[col for col in df_import.columns if 'long' in col.lower()]

In [None]:
# Inspect a single record, flipped so each field is shown on its own line.
df_import.iloc[[10589]].T

In [None]:
# Decode one base64-encoded thumbnail and write it out as a JPEG for inspection.
# NOTE(review): the original read from `df`, which this notebook never defines;
# df_import is assumed to be the intended frame - confirm.
img = base64.b64decode(df_import['thumb_color'][1])
# The context manager closes the file even if the write raises.
with open("img_color.jpg", "wb") as fw:
    fw.write(img)