In [None]:
%%javascript
// Replace OutputArea's _should_scroll with a version that ignores its `lines`
// argument and always returns false.
// NOTE(review): presumably this keeps long cell outputs fully expanded instead of
// being placed in a scroll box - confirm against the notebook front end in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
import numpy as np
import pandas as pd
import urllib
import urllib.parse
import os
import re
import datetime
from math import radians, cos, sin, asin, sqrt

# the following features will be in the picture specific dataset:
#    - number of likes
#    - date of posting
#    - date of scraping
#    - 'age' of the picture in days (how many days the picture has been online)
#    - month in which the picture has been posted
#    - quarter in which the picture has been posted
#    - time of the post (hour of the day ranging from 0 to 23)
#    - number of hash tags extracted from the post, not from the comments
#    - number of comments (possibly cannot be used as a feature due to data leakage; TODO: explain why)
#    - weather stats (precipitation, minimum temperature and maximum temperature of a day) around the date of posting

# stats_table contains the following features:
#    - name of resort
#    - state in order to merge the weather data
#    - latitude and longitude in order to calculate distance to weather station

# resorts_elevation contains the following features:
#    - name of the resort
#    - minimum and maximum elevation of the resort

# resorts_time_zone contains the following features:
#    - name of the resort
#    - time difference compared to UTC of the resort

# resort master table
account_01 = pd.read_csv('results/stats_table.csv', sep = ',', encoding = "ISO-8859-1")

# elevation table: keep the resort name plus both elevation columns and give them shorter names
elevation = (
    pd.read_csv('other_data/resorts_elevation.csv', sep = ',', encoding = "ISO-8859-1")
    .loc[:, ['resort', 'min elevation (m)', 'max elevation (m)']]
    .rename(columns = {'min elevation (m)': 'minimum_elevation',
                       'max elevation (m)': 'maximum_elevation'})
)

time_zone = pd.read_csv('other_data/resorts_time_zone.csv', sep = ',', encoding = "ISO-8859-1")

# inner joins: a resort missing from the elevation or the time zone table drops out
account_02 = account_01.merge(elevation, on = 'resort', how = 'inner')
account_03 = (
    account_02
    .merge(time_zone, on = 'resort', how = 'inner')
    .drop(['followers', 'number_useful_pictures', 'number_images_trash', 'number_images_total', 'zip_code'], axis = 1)
)

# the three counts show how many resorts survive each join
print('The number of resorts in account_01 is:', account_01['resort'].count())
print('The number of resorts in account_02 is:', account_02['resort'].count())
print('The number of resorts in account_03 is:', account_03['resort'].count())

# expected column order of account_03 (the rows are read by position further down):
# accountname, resort, state, longitude, latitude, minimum_elevation, maximum_elevation, time_difference

In [None]:
# the resort table as a plain list of rows; the fields are read by position below
account_04 = account_03.values.tolist()

# snow, snow water equivalent and weather type data were only available for some of the resorts,
# so only precipitation, maximum temperature and minimum temperature are used: these were
# available (with decent coverage) for all resorts

# accumulators that grow by one chunk per resort
df_01 = pd.DataFrame()
account_stats_total = pd.DataFrame()

for i in range(len(account_04)):

    print('Resort:', account_04[i][1])

    # set the URL to the Instagram account
    url = 'https://www.instagram.com/' + str(account_04[i][0])

    # folder of the data: the path part of the URL with every slash removed
    folder_name = urllib.parse.urlparse(url).path.replace('/', '')
    folder_path = 'images/{}/'.format(folder_name)

    stats_file = os.path.join(folder_path, str(account_04[i][0]) + '_stats.csv')
    account_data = pd.read_csv(stats_file, sep = ',', encoding = "ISO-8859-1")
    image_list_01 = account_data.values.tolist()

    # keep only the records whose .jpg file (named after the second field) exists in the folder,
    # and append the resort name to each kept record
    image_list_02 = [record + [account_04[i][1]]
                     for record in image_list_01
                     if os.path.isfile(folder_path + str(record[1]) + '.jpg')]

    df_temp = pd.DataFrame(image_list_02, columns = list(account_data.columns) + ['resort'])

    # likes and comments are stored as 64-bit integers
    df_temp['likes_image'] = df_temp['likes_image'].astype('int64')
    df_temp['comments_image'] = df_temp['comments_image'].astype('int64')

    # 'date_post_min0' holds the posting date; offset columns derived from it
    # (date_post_min1, date_post_plus1, ...) are added further down
    df_temp['date_post_min0'] = pd.to_datetime(df_temp['date_post'])
    df_temp['date_scrape'] = pd.to_datetime(df_temp['date_scrape'])

    # whole days between posting and scraping; the `.dt.days` accessor is used because casting a
    # timedelta column to 'timedelta64[D]' is rejected by pandas >= 2.0 (only s/ms/us/ns are supported)
    df_temp['days_online'] = (df_temp['date_scrape'] - df_temp['date_post_min0']).dt.days.astype('int64')

    # add information from the overall table (fields are read by position from the resort row)
    df_temp['latitude'] = account_04[i][4]
    df_temp['longitude'] = account_04[i][3]
    df_temp['time_difference'] = account_04[i][7]
    
    # split 'time_post' into the clock reading, the am/pm marker and the time zone label
    time_post_text = df_temp['time_post'].map(str)

    # shortest leading text that is followed by 'am' or 'pm'
    df_temp['time'] = time_post_text.map(lambda text: re.search('.+?(?=am|pm)', text).group(0))
    # the match starts at the first ':'; characters 3-4 of it are taken as the marker
    # (assumes the colon is followed by two minute digits and then am/pm)
    df_temp['am_pm'] = time_post_text.map(lambda text: re.search('(?=:)(.*)', text).group(0)[3:5])
    # trailing run of characters that are none of 'a', 'm', 'p', '|', with surrounding blanks stripped
    df_temp['time_zone'] = time_post_text.map(lambda text: re.search('[^am|pm]*$', text).group(0).strip())

    # hour on the 12-hour clock: everything before the first ':' of the clock reading
    df_temp['hour_am_pm_UTC'] = df_temp['time'].map(lambda clock: re.search('.+?(?=:)', str(clock)).group(0)).astype('int64')
    
    # convert am/pm to 24 hour clock time
    #
    # - 12 am -> 0
    # - 1 - 11 am -> 1 - 11
    # - 12 pm -> 12
    # - 1 - 11 pm -> 13 - 23
    #
    # The conversion is done on whole columns instead of row by row, which also means an
    # empty picture table simply yields an empty 'hour_24_UTC' column.
    hour_12 = df_temp['hour_am_pm_UTC']
    is_am = df_temp['am_pm'] == 'am'   # every other marker is handled as 'pm'
    is_twelve = hour_12 == 12

    df_temp['hour_24_UTC'] = np.where(is_am,
                                      np.where(is_twelve, 0, hour_12),
                                      np.where(is_twelve, 12, hour_12 + 12)).astype('int64')
    
    # apply the time difference to convert the time from UTC to local time while taking daylight saving time into account

    # Periods during which the time difference is one hour more: from the first Sunday in
    # November (inclusive) until the second Sunday in March (exclusive). Posting dates outside
    # these periods get the plain 'time_difference'.
    standard_time_periods = [
        (datetime.date(2009, 11, 1), datetime.date(2010, 3, 14)),
        (datetime.date(2010, 11, 7), datetime.date(2011, 3, 13)),
        (datetime.date(2011, 11, 6), datetime.date(2012, 3, 11)),
        (datetime.date(2012, 11, 4), datetime.date(2013, 3, 10)),   # start year was mistyped as 2102
        (datetime.date(2013, 11, 3), datetime.date(2014, 3,  9)),
        (datetime.date(2014, 11, 2), datetime.date(2015, 3,  8)),
        (datetime.date(2015, 11, 1), datetime.date(2016, 3, 13)),
        (datetime.date(2016, 11, 6), datetime.date(2017, 3, 12)),
        (datetime.date(2017, 11, 5), datetime.date(2018, 3, 11)),
    ]

    for index, row in df_temp.iterrows():
        if row['time_zone'] == 'UTC':
            hour_local = row['hour_24_UTC'] + row['time_difference']

            # correct for daylight saving time
            post_day = row['date_post_min0'].date()
            if any(start <= post_day < end for start, end in standard_time_periods):
                hour_local = hour_local - 1

            # a negative hour means the local time still falls on the previous calendar day
            if hour_local < 0:
                hour_local = hour_local + 24
                df_temp.loc[index, 'date_post_min0'] = row['date_post_min0'] - pd.to_timedelta(1, unit = 'd')

            df_temp.loc[index, 'hour_24_local'] = hour_local
        else:
            # posting times that are not given in UTC cannot be converted: flag them with -99
            df_temp.loc[index, 'hour_24_local'] = -99

    df_temp['hour_24_local'] = df_temp['hour_24_local'].astype('int64')

    # calendar features derived from the posting date: year-month (yyyymm), month and quarter
    post_dates = df_temp['date_post_min0']
    post_month = post_dates.map(lambda stamp: stamp.month)

    df_temp['year_month'] = post_dates.map(lambda stamp: stamp.year) * 100 + post_month
    df_temp['month'] = post_month
    df_temp['quarter'] = post_dates.dt.quarter

    # these raw columns are not part of the picture specific dataset
    df_temp = df_temp.drop(['date_post', 'followers_account', 'description', 'title_info', 'time_post', 'video_link', 'image_link'], axis = 1)

    # remember the row count so it can be compared with the count after the weather merges
    number_pictures_before = df_temp.shape[0]
    print('The number of pictures of', account_04[i][1], 'before merging to the weather data is', number_pictures_before)
    
    # PRCP = Precipitation (inches)
    # TMAX = Maximum temperature (degrees Fahrenheit)
    # TMIN = Minimum temperature (degrees Fahrenheit)

    # one weather file per state (third field of the resort row); the sixth column is parsed as a date
    weather_file = 'weather_data/weather_' + str(account_04[i][2]) + '.csv'
    weather_data = pd.read_csv(weather_file, parse_dates = [5], dayfirst = True, low_memory = False)

    # attach the resort's coordinates and elevations to every weather record
    weather_data = weather_data.assign(latitude_resort = account_04[i][4],
                                       longitude_resort = account_04[i][3],
                                       min_elevation = account_04[i][5],
                                       max_elevation = account_04[i][6])

    # there are some 'weird' values -> remove this low quality data (Alaska has some temperatures of -148 degrees F ...)
    # NOTE(review): the comparisons are False for missing values, so records without TMAX or
    # TMIN are removed here as well - confirm that this is intended
    plausible_temperatures = (weather_data['TMAX'] > -147) & (weather_data['TMIN'] > -147)

    # remove duplicates
    weather_data = weather_data[plausible_temperatures].drop_duplicates(keep = 'first')
    
    # calculate the distance from the resort to every weather station in the file
    
    def haversine(lon1, lat1, lon2, lat2):
        """Return the great-circle distance in kilometers between two points.

        All four arguments are decimal degrees: longitude and latitude of the
        first point, followed by longitude and latitude of the second point.
        """
        earth_radius_km = 6371 # use 3956 for miles

        # work in radians
        lam1, phi1, lam2, phi2 = (radians(angle) for angle in (lon1, lat1, lon2, lat2))

        # haversine formula
        delta_lam = lam2 - lam1
        delta_phi = phi2 - phi1
        hav = sin(delta_phi / 2) ** 2 + cos(phi1) * cos(phi2) * sin(delta_lam / 2) ** 2
        central_angle = 2 * asin(sqrt(hav))

        return central_angle * earth_radius_km
    
    # distance in kilometers between each weather station and the resort
    weather_data['DISTANCE'] = weather_data.apply(
        lambda station: haversine(station['LONGITUDE'], station['LATITUDE'],
                                  station['longitude_resort'], station['latitude_resort']),
        axis = 1)

    # calculate the difference in elevation between the weather station and mid-mountain of the resort
    # (divided by 1000); a positive number indicates a higher elevation of the weather station
    mid_mountain = (weather_data['max_elevation'] + weather_data['min_elevation']) / 2
    weather_data['ELEVATION_DIFFERENCE'] = (weather_data['ELEVATION'] - mid_mountain) / 1000

    weather_data['DATE'] = pd.to_datetime(weather_data['DATE'])

    # keep the columns that are needed below and switch to lower case names
    weather_data = (
        weather_data[['STATION', 'DATE', 'PRCP', 'TMAX', 'TMIN', 'DISTANCE', 'ELEVATION_DIFFERENCE']]
        .rename(columns = {'STATION': 'station',
                           'DATE': 'date',
                           'PRCP': 'precipitation_inches',
                           'TMAX': 'temp_max',
                           'TMIN': 'temp_min',
                           'DISTANCE': 'distance',
                           'ELEVATION_DIFFERENCE': 'elevation_difference'})
    )

    # convert precipitation to mm and temperatures to degrees Celsius
    weather_data['precipitation_mm'] = weather_data['precipitation_inches'] * 2.54 * 10
    for fahrenheit_column in ['temp_max', 'temp_min']:
        weather_data[fahrenheit_column + '_celsius'] = (weather_data[fahrenheit_column] - 32) * 5 / 9

    weather_stats = ['precipitation_mm', 'temp_max_celsius', 'temp_min_celsius']

    # For every weather statistic: per date, the observation of the closest station that reported
    # that statistic. This lookup depends only on the weather data and not on the day offset, so
    # it is built once per resort and reused for all offsets.
    closest_observations = {}

    for weather_stat in weather_stats:

        # select closest available observation
        available = weather_data[weather_data[weather_stat].notnull()]
        min_per_day = available.groupby('date')['distance'].min().to_frame().reset_index()

        weather_data_optimum = weather_data.merge(min_per_day,
                                                  left_on = ['date', 'distance'],
                                                  right_on = ['date', 'distance'],
                                                  how = 'inner')

        # remove missing values
        weather_data_optimum = weather_data_optimum[weather_data_optimum[weather_stat].notnull()]

        # in case duplicates still exist, let's keep the largest value
        weather_data_optimum = weather_data_optimum.sort_values(by = ['date', 'distance', 'elevation_difference', weather_stat],
                                                                ascending = False)
        weather_data_optimum = weather_data_optimum.drop_duplicates(subset = ['date', 'distance', 'elevation_difference'],
                                                                    keep = 'first')

        weather_data_optimum = weather_data_optimum[['date', 'distance', 'elevation_difference', weather_stat]].copy()

        # correct the temperature for the difference in elevation (6 degrees celsius per 1000 meters)
        if weather_stat in ('temp_max_celsius', 'temp_min_celsius'):
            weather_data_optimum[weather_stat] = weather_data_optimum[weather_stat] + weather_data_optimum['elevation_difference'] * 6

        closest_observations[weather_stat] = weather_data_optimum

    # weather statistics are added for the day of posting and the 3 days before it ('min0' ... 'min3'),
    # followed by the 3 days after posting ('plus1' ... 'plus3')
    day_offsets = [('min', k) for k in range(0, 4)] + [('plus', k) for k in range(1, 4)]

    account_stats_frames = []

    for direction, k in day_offsets:
        offset_label = direction + str(k)
        date_column = 'date_post_' + offset_label

        # 'date_post_min0' already exists; every other offset gets its own date column
        if direction == 'plus':
            df_temp[date_column] = df_temp['date_post_min0'] + pd.to_timedelta(k, unit = 'd')
        elif k > 0:
            df_temp[date_column] = df_temp['date_post_min0'] - pd.to_timedelta(k, unit = 'd')

        for weather_stat in weather_stats:
            stat_column = weather_stat + '_' + offset_label

            # left merge: every picture is kept, pictures without an observation get missing values
            df_temp = df_temp.merge(closest_observations[weather_stat],
                                    left_on = date_column,
                                    right_on = 'date',
                                    how = 'left')

            df_temp = df_temp.rename(columns = {weather_stat: stat_column})

            # save the minimum and maximum station distance per weather statistic
            min_distance = df_temp['distance'].min()
            max_distance = df_temp['distance'].max()

            account_stats_temp = [account_04[i][1],
                                  stat_column,
                                  min_distance,
                                  max_distance]

            account_stats_frames.append(pd.DataFrame(account_stats_temp).T)

            # the helper columns of the merge are not needed in the picture dataset
            df_temp = df_temp.drop(['date', 'distance', 'elevation_difference'], axis = 1)

    # one row per (offset, weather statistic) for this resort
    account_stats_resort = pd.concat(account_stats_frames, axis = 0)
            
    number_pictures_after = df_temp.shape[0]
    print('The number of pictures of', account_04[i][1], 'after merging to the weather data is', number_pictures_after)

    # print the number of missings in the data
    print('There are', df_temp.isnull().sum().sum(), 'missings.')

    # If the number of records is different than before the weather merges, stop with an
    # explanatory error. An exception is used instead of exit() so that the reason is visible
    # and the session with all intermediate results stays available; the check runs before the
    # resort is appended, so an inconsistent resort never ends up in df_01.
    if number_pictures_before != number_pictures_after:
        raise RuntimeError('The number of pictures of {} changed from {} to {} while merging the weather data.'.format(
            account_04[i][1], number_pictures_before, number_pictures_after))

    df_01 = pd.concat([df_01, df_temp], axis = 0)
    account_stats_total = pd.concat([account_stats_total, account_stats_resort], axis = 0)

    print('   OK!\n')

# name the four columns of the collected distance statistics (one row per resort and
# weather statistic): resort, statistic name, smallest and largest station distance
account_stats_total.columns = ['resort',
                               'weather statistic',
                               'minimum distance',
                               'maximum distance']

In [None]:
# final check: does the dataset contain duplicates?
# lists every group of rows that share an image_id; an empty list means there are no duplicates
[g for _, g in df_01.groupby("image_id") if len(g) > 1]

In [None]:
# preview the first rows of the distance statistics
# (head() only shows the first five rows, not a maximum per resort)
account_stats_total.head()

In [None]:
# save the distance statistics to a csv (comma separated, without the index column)
account_stats_total.to_csv('results/account_statistics_pictures.csv', sep = ',', index = False)

In [None]:
# what is the shape (rows, columns) of the combined dataset?
df_01.shape

In [None]:
# check the oldest and newest picture for each resort
posts_by_resort = df_01.groupby('resort')['date_post_min0']

df_min = posts_by_resort.min().to_frame().rename(columns = {'date_post_min0': 'first_post'})
df_max = posts_by_resort.max().to_frame().rename(columns = {'date_post_min0': 'last_post'})

# both frames are indexed by resort, so they are joined on the index
df_date = df_min.merge(df_max, left_index = True, right_index = True)

df_date

In [None]:
# how many missings does the final dataframe have? (total over all rows and columns)
df_01.isnull().sum().sum()

In [None]:
# keep only the pictures that have been online for at least 10 days
#    -> because several 'scraping batches' have been used, some pictures will have missing data otherwise
#    -> 84 pictures will be removed

df_02 = df_01[df_01['days_online'].ge(10)]

print('Number of pictures in df_01:', len(df_01))
print('Number of pictures in df_02:', len(df_02))
print('Number of missing values in df_02:', df_02.isna().sum().sum())

In [None]:
# preview of the dataset (first five rows of the filtered pictures)
df_02.head()

In [None]:
# save the final dataframe to a csv (comma separated, without the index column)
df_02.to_csv('results/picture_specific_dataset.csv', sep = ',', index = False)