In [49]:
from bs4 import BeautifulSoup
import re
from PIL import Image
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
from unidecode import unidecode

In [50]:
def building_dataframe(htmlfile):
    '''This function extracts all of the relevant data from each html file iteratively.
    It constructs a pandas table for each building, with its jpg files as the row indexes'''
    
    file1 = open(htmlfile, "r")
    soup = BeautifulSoup(file1, 'html.parser')
        
    jpglist =[str((link.get('href')).split('/')[-1]) for link in soup.find_all('a') if ("flickr" and "jpg") in (link.get('href'))]
    # This finds filenames for all of the associated jpgs
    if jpglist == []:
        return 
    
    architect_val = [(link.get('href')) for link in soup.find_all('a') if ("firms/") in (link.get('href'))][1].split('/')[-1]
    date_val = str(soup.find('table', id='building_info_tbl').find('td', text='Date').findNext('td'))
    style_val = [(link.get('href')) for link in soup.find_all('a') if ("styles/") in (link.get('href'))][1].split('/')[-1]
    type_val = [(link.get('href')) for link in soup.find_all('a') if ("types/") in (link.get('href'))][1].split('/')[-1]
    buildingname_val = htmlfile.split('/')[-1].split('.')[0]
    latitude_val = soup.find(string=re.compile("Latitude"))
    longitude_val = soup.find(string=re.compile("Longitude"))
    city_val = [(link.get('href')) for link in soup.find_all('a') if ("locations/city") in (link.get('href'))][0].split('/')[-1].split('-')[0]
    country_val = [(link.get('href')) for link in soup.find_all('a') if ("locations/country") in (link.get('href'))][0].split('/')[-1]
    tag_vals = list(set([(link.get('href')).split('/')[-1] for link in soup.find_all('a') if ("/tags/" and "/name/") in (link.get('href'))]))
    # This generates links to all of the tags for each building

    columns = ['jpg_name','building_name', 'building_date', 'building_city', 'building_country', 'building_latitude', 'building_longitude', 'building_architect', 'building_style', 'building_type']
    building_df = pd.DataFrame(columns=columns)
    building_df.jpg_name = jpglist

    tag_df = pd.DataFrame(1, index=np.arange(len(building_df)), columns=['feature_' + a for a in tag_vals])
    building_df = pd.concat([building_df, tag_df], axis=1)
    
    building_df.building_architect = architect_val
    building_df.building_country = country_val
    building_df.building_city = city_val
    building_df.building_date = date_val.strip('<td valign="top">').strip('</')
    building_df.building_name = buildingname_val
    building_df.building_latitude = latitude_val
    building_df.building_longitude = longitude_val
    building_df.building_style = style_val
    building_df.building_type = type_val
    building_df = building_df[pd.notnull(building_df.jpg_name)].set_index('jpg_name')
    return building_df

def lat_clean(inputstr):
    return float(inputstr.strip('\n').strip().strip('\tLatitude(').strip('),'))

def lon_clean(inputstr):
    return float(inputstr.strip('\n').strip().strip('\tLongitude(').strip('),'))

def BC_to_minus(date_string):
    ''' This function checks for the presence of 'BC' in a date, and converts it to a minus sign'''
    minus_string = ''
    if 'BC' in str(date_string):
        minus_string = '-'  
    date_draft = minus_string + str(date_string)
    return date_draft
    
def date_clean(date_string):
    ''' This function checks for range dates (ie. 200BC - 800AD) and converts them to ints (-200 - 800)
    and then averages them. If the date is not a compound date, then it checks for BC, and returns a single
    number ie. -800 or 1200.'''
    if '-' in date_string:
        date_string = (int(BC_to_minus(date_string.split('-')[0]).strip(' BC').strip(' AD')) +  \
        int(BC_to_minus(date_string.split('-')[1]).strip(' BC').strip(' AD')))/2
    date_string = str(BC_to_minus(date_string)).strip(' BC').strip(' AD').replace(" ", "")
    return date_string

def style_replace(textstr):
    '''Replaces the names of styles to avoid searching for special characters'''
    if 'covenesc' in textstr:
        textstr = 'brancovenesc'
    elif 'mud' in textstr:
        textstr = 'mudejar'
    else:
        textstr = textstr
    return textstr

def letter_replace(string):  
    '''Replaces certain non ascii characters with their ascii approximations'''
    replace_dict = {'\xc3\xa3':'a', '\xc3\xb5':'o', '\xc5\x8d':'o', '\xc3\x9f':'ss', 
                '\xc3\xad':'i', '\xc3\xa1':'a', '\xc3\xa8':'e', '\xc4\xb1':'i', '\xda':'u', '\xed':'i',
                '\xc5':'a', '\xe9':'e', '\xe2':'a'}
    for k in replace_dict:  
        string = string.replace(k,replace_dict[k])  
    return string

def tidy_main_table(raw_concat_input):
    ''' After the whole database of buildings has been assembled into a single pandas table, this function
    cleans up the values for some of the columns'''
    architecture_csv = raw_concat_input

    #Tidy Lon and Lat
    architecture_csv.building_latitude = architecture_csv.building_latitude.apply(lat_clean)
    architecture_csv.building_longitude = architecture_csv.building_longitude.apply(lon_clean)
    architecture_csv = architecture_csv.dropna(subset = ['building_latitude', 'building_longitude'])

    #Drop duplicate images and drop rows which don't have associated photos in the file
    architecture_csv = architecture_csv.reset_index().drop_duplicates(subset='jpg_name', keep='last').set_index('jpg_name')
    architecture_csv = architecture_csv.drop(list(set(architecture_csv.index) - set(photo_names)))

    #Sorts out subcategories for building types and styles (turns 2 cols into 4)
    styles_db = pd.DataFrame(architecture_csv.building_style.str.split('---',1).tolist(), columns = ['building_main_style','building_sub_style'], index = architecture_csv.index)
    types_db = pd.DataFrame(architecture_csv.building_type.str.split('---',1).tolist(), columns = ['building_main_type','building_sub_type'], index = architecture_csv.index)
    architecture_csv = pd.concat([architecture_csv, styles_db, types_db], axis=1).drop(['building_type', 'building_style'], 1)

    #Replace feature_ column NAs with 0s
    feature_columns = [a for a in list(architecture_csv.columns) if a.startswith('feature_')]
    architecture_csv[feature_columns] = architecture_csv[feature_columns].fillna(0)

    #Extract dates and switch them to numeric
    architecture_csv.building_date = pd.to_numeric(architecture_csv.building_date.apply(date_clean))
    
    #Remove special accented characters from column names and building_name and building_city
    architecture_csv.columns = [unidecode(text) for text in list(architecture_csv.columns)]
    architecture_csv.building_name = architecture_csv.building_name.apply(letter_replace).apply(unidecode)
    architecture_csv.building_main_style = architecture_csv.building_main_style.apply(style_replace)
    
    return architecture_csv    

In [51]:
#Directories and file lists
photo_dir = '/home/sam/Documents/Architecture/photos/'
html_dir = '/home/sam/Documents/Architecture/buildings/'
html_files = [join(html_dir, f) for f in listdir(html_dir) if isfile(join(html_dir, f))]
photo_files = [join(photo_dir, f) for f in listdir(photo_dir) if isfile(join(photo_dir, f))]
photo_names = [f for f in listdir(photo_dir) if isfile(join(photo_dir, f))]
len(html_files)

7657

In [52]:
# Assemble the databases for individual html files, and the concat them into one main db, then tidy
architecture_csv = tidy_main_table(pd.concat([(building_dataframe(i)) for i in html_files]))

In [53]:
architecture_csv

Unnamed: 0_level_0,building_architect,building_city,building_country,building_date,building_latitude,building_longitude,building_name,feature_abusir-necropolis,feature_acrylic,feature_addition,...,feature_zeche-zollverein,feature_zero-energy,feature_zig-zag,feature_ziggurat,feature_zinc,feature_Oya-stone,building_main_style,building_sub_style,building_main_type,building_sub_type
jpg_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3689713225_537b89d9b4.jpg,,bari,italy,-1200.0,41.194099,16.487419,dolmen-of-bisceglie,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,vernacular,italian,religious,monument
2895586914_fe794fc4ca.jpg,,bari,italy,-1200.0,41.194099,16.487419,dolmen-of-bisceglie,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,vernacular,italian,religious,monument
3690519530_e1b73f4f55.jpg,,bari,italy,-1200.0,41.194099,16.487419,dolmen-of-bisceglie,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,vernacular,italian,religious,monument
4368203413_92c3399cda_z.jpg,,madrid,spain,1926.0,40.442684,-3.699627,moseo-geominero-madrid,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,neoclassical,,public,museum
3625883327_c23d151149_z.jpg,,madrid,spain,1926.0,40.442684,-3.699627,moseo-geominero-madrid,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,neoclassical,,public,museum
16626841870_66ef40bbb4_b.jpg,,madrid,spain,1926.0,40.442684,-3.699627,moseo-geominero-madrid,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,neoclassical,,public,museum
4385858902_d6e9e7c654.jpg,,tampa,united_states,1968.0,27.962130,-82.504959,futuro-house-tampa,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,modern,futurist,residential,single-family
4981284997_04d7c38fea.jpg,,tampa,united_states,1968.0,27.962130,-82.504959,futuro-house-tampa,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,modern,futurist,residential,single-family
4981285055_3e4a18f8c7.jpg,,tampa,united_states,1968.0,27.962130,-82.504959,futuro-house-tampa,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,modern,futurist,residential,single-family
1418888097_715cf79a6b.jpg,,nimes,france,10.0,43.838188,4.356106,maison-carree,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,ancient-roman,,religious,temple


In [54]:
architecture_csv.to_csv('architecture.csv')