In [None]:
import pandas as pd
import numpy as np
from operator import itemgetter
from itertools import groupby
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from mpl_toolkits.basemap import Basemap
from adjustText import adjust_text

import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

# table with:
#    - accountname of the ski resort
#    - name of the ski resort
#    - state
#    - longitude
#    - latitude
#    - number of followers
#    - number of useful pictures
#    - number of removed pictures
#    - total number of pictures
#    - zip code

# Load the per-resort statistics (column layout as listed in the comment above)
# and work with plain Python rows from here on.
stats_rows = pd.read_csv('results/stats_table.csv').values.tolist()

# rows ordered ascending by the total number of pictures (column 8)
skiresorts_table_01 = sorted(stats_rows, key = itemgetter(8))

# append a flag column: 2 for the five resorts with the fewest pictures, 0 for the rest
skiresorts_table_02 = [row + [2 if rank <= 4 else 0]
                       for rank, row in enumerate(skiresorts_table_01)]

# reorder descending by the total number of pictures (stable sort on the negated count)
skiresorts_table_02.sort(key = lambda row: -row[8])

# overwrite the flag with 1 for the five resorts with the most pictures
skiresorts_table_03 = [row[0:10] + [1] if rank <= 4 else row
                       for rank, row in enumerate(skiresorts_table_02)]

# one list per plotted attribute, all in the same (descending) row order
skiresorts = list(map(itemgetter(1), skiresorts_table_03))
states = list(map(itemgetter(2), skiresorts_table_03))
x_coor = list(map(itemgetter(3), skiresorts_table_03))
y_coor = list(map(itemgetter(4), skiresorts_table_03))
number_of_followers = list(map(itemgetter(5), skiresorts_table_03))
number_of_pictures = list(map(itemgetter(8), skiresorts_table_03))
orders = list(map(itemgetter(10), skiresorts_table_03))

# Figure 1: bubble map of the resorts, bubble area = total number of pictures.
fig = plt.gcf()
fig.set_size_inches(w = 12, h = 8)

plt.title('\nFigure 1: "Number of pictures on the Instagram accounts of ski resorts in Western US"\n',
          fontsize = 16, weight = 'bold', style = 'italic')

# Mercator basemap bounded by 30N-50N and 130W-90W
map_rest = Basemap(projection = 'merc',
                   llcrnrlat = 30, urcrnrlat = 50,
                   llcrnrlon = -130, urcrnrlon = -90,
                   lat_ts = 30,
                   resolution = 'i')

# built-in boundaries: coastlines, land / sea mask, countries and states
map_rest.drawcoastlines()
map_rest.drawlsmask(land_color = 'white', ocean_color = '0.9', lakes = True)
map_rest.drawcountries()
map_rest.drawstates()

# labelled graticule: a parallel every 5 degrees, a meridian every 10 degrees
parallels = np.arange(-90., 91., 5.)
map_rest.drawparallels(parallels, labels = [False, True, True, False])
meridians = np.arange(-180., 181., 10.)
map_rest.drawmeridians(meridians, labels = [True, False, False, True])

# longitudes / latitudes projected to map coordinates (needed to place the labels)
x_rest, y_rest = map_rest(x_coor, y_coor)

texts1 = []
texts2 = []

# order flag -> (label colour, label zorder, list collecting the text artists)
label_styles = {2: ('orange', 1, texts2),   # five resorts with the fewest pictures
                1: ('yellow', 2, texts1)}   # five resorts with the most pictures

resort_points = zip(x_coor, y_coor, states, number_of_pictures, skiresorts, x_rest, y_rest, orders)

for lon, lat, state, numpics, skiresort, x_map, y_map, order in resort_points:
    # Alaska and Vermont are left out of this map
    if state in ('Alaska', 'Vermont'):
        continue

    map_rest.scatter(lon, lat, s = numpics, color = 'red', alpha = 0.25, latlon = True)

    if order in label_styles:
        box_color, label_zorder, collected = label_styles[order]
        label = plt.text(x_map + 75000,
                         y_map + 25000,
                         str(skiresort) + ' (' + str(numpics) + ')',
                         size = 12,
                         bbox = dict(facecolor = box_color, alpha = 0.8),
                         zorder = label_zorder)
        collected.append(label)

plt.show()

fig_name = 'results/instagram_activity_skiresorts_Western_US.png'
fig.savefig(fig_name)

In [None]:
# picture-level data; the listed column positions are parsed as day-first dates
account_data_01 = pd.read_csv('results/picture_specific_dataset.csv',
                              parse_dates = [2, 7, 24, 28, 32, 36, 40, 44],
                              dayfirst = True,
                              low_memory = False)

# resort-level data
account_data_02 = pd.read_csv('results/resort_specific_dataset.csv', low_memory = False)

# stats table with every column dropped except the ones linking a resort to its account
stats_only_columns = ['state',
                      'longitude',
                      'latitude',
                      'followers',
                      'number_useful_pictures',
                      'number_images_trash',
                      'number_images_total',
                      'zip_code']
accountnames = pd.read_csv('results/stats_table.csv', sep = ',', encoding = "ISO-8859-1")
accountnames = accountnames.drop(stats_only_columns, axis = 1)

# pictures -> account name (via 'resort') -> resort-level data (via 'accountname')
pictures_with_account = account_data_01.merge(accountnames, on = 'resort', how = 'inner')
account_data_03 = pictures_with_account.merge(account_data_02, on = 'accountname', how = 'inner')

for frame_name, frame in (('account_data_01', account_data_01),
                          ('account_data_02', account_data_02),
                          ('account_data_03', account_data_03)):
    print('Number of records in ' + frame_name + ':', frame.shape[0])

In [None]:
# develop new features

# absolute deviation from the average precipitation, maximum - and minimum temperature per resort:
#   - calculate the average precipitation, maximum - and minimum temperature per resort
#   - subtract the average from the actual precipitation, maximum - and minimum temperature
#   - take the absolute value:
#        - in winter temperature is expected to be negatively and precipitation is expected to be positively correlated
#          with the number of likes:
#              -> more people go skiing / snowboarding in the winter when it's cold and snowing, leading to
#                 more activity on Instagram
#        - in summer temperature is expected to be positively and precipitation is expected to be negatively correlated
#          with the number of likes:
#              -> more people go biking / hiking in summer when it's warm and dry, leading to more activity on Instagram

# let's create a 'weather-activity' indicator:
#   - in winter weather is good when it's mostly freezing and snowing (let's say maximum temperature below 5)
#   - in summer it's good when it's warm and dry

# (short name used in the new columns, prefix of the source columns, name of the average column)
weather_measures = [('precipitation', 'precipitation_mm', 'avg_precipitation'),
                    ('min_temp', 'temp_min_celsius', 'avg_min_temp'),
                    ('max_temp', 'temp_max_celsius', 'avg_max_temp')]

# row-wise average over the twelve monthly columns <prefix>_1 .. <prefix>_12
for _, source_prefix, average_column in weather_measures:
    monthly_columns = [source_prefix + '_' + str(month) for month in range(1, 13)]
    account_data_03[average_column] = account_data_03[monthly_columns].mean(axis = 1)

# column suffixes min0 .. min3 followed by plus1 .. plus3
day_offsets = ['min' + str(day) for day in range(0, 4)] + ['plus' + str(day) for day in range(1, 4)]

# absolute deviation of each <prefix>_<suffix> column from the resort's average
for offset in day_offsets:
    for short_name, source_prefix, average_column in weather_measures:
        deviation = account_data_03[source_prefix + '_' + offset] - account_data_03[average_column]
        account_data_03['abs_dev_' + short_name + '_' + offset] = deviation.abs()

In [None]:
%%javascript
// Replace the notebook's OutputArea._should_scroll hook with a function that
// always returns false.
// NOTE(review): presumably this keeps long cell outputs (e.g. the many charts
// generated below) from being collapsed into a scroll box -- confirm against
// the notebook front end in use; the hook is internal to the classic notebook.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

# One weather chart per resort: monthly minimum / maximum temperature (lines),
# monthly precipitation (bars) and the resort's overall averages (dashed lines).
#
# Changes compared with the previous version of this cell:
#   - the overall averages are now those of the resort being plotted; before,
#     they were always looked up for the first account in `accountnames`
#     (index [0][0] instead of [i][0]) and read with the label-based [0]
#   - the loop runs over `accountnames` itself instead of indexing it with a
#     counter derived from the length of `account_data_02`
#   - every figure is saved and then closed, so the figures do not accumulate

month_numbers = list(range(1, 13))
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']


def _monthly_values(weather_row, prefix):
    """Return the twelve values stored in columns prefix + '1' .. prefix + '12' as floats."""
    monthly_columns = [prefix + str(month) for month in month_numbers]
    return weather_row[monthly_columns].astype(float).values


# rows of [accountname, resort name]; converted once instead of on every access
resort_accounts = accountnames.values.tolist()

for resort_account in resort_accounts:
    accountname, resort_name = resort_account[0], resort_account[1]

    account_temp = account_data_02[account_data_02['accountname'] == accountname]
    if account_temp.empty:
        # no resort-level weather statistics for this account: nothing to chart
        continue
    weather_row = account_temp.iloc[0]

    account_weather = pd.DataFrame({'maximum temperature': _monthly_values(weather_row, 'temp_max_celsius_'),
                                    'minimum temperature': _monthly_values(weather_row, 'temp_min_celsius_'),
                                    'precipitation_mm': _monthly_values(weather_row, 'precipitation_mm_')},
                                   index = pd.Index([str(month) for month in month_numbers], name = 'month'))

    # overall averages: mean of the twelve monthly values, i.e. the same definition
    # as the avg_min_temp / avg_max_temp / avg_precipitation feature columns
    account_weather['average_temp_min'] = account_weather['minimum temperature'].mean()
    account_weather['average_temp_max'] = account_weather['maximum temperature'].mean()
    account_weather['average_precipitation'] = account_weather['precipitation_mm'].mean()

    # creating the graph

    observations = np.arange(1, 13, 1)

    fig, ax1 = plt.subplots(figsize = (12, 8))

    minimum, = ax1.plot(observations,
                        account_weather['minimum temperature'],
                        color = '#FF9933',
                        linewidth = 0.75,
                        linestyle = '-',
                        zorder = 4)

    average_min, = ax1.plot(observations,
                            account_weather['average_temp_min'],
                            color = '#FF9933',
                            linewidth = 1.50,
                            linestyle = '--',
                            zorder = 5)

    maximum, = ax1.plot(observations,
                        account_weather['maximum temperature'],
                        color = '#0099FF',
                        linewidth = 0.75,
                        linestyle = '-',
                        zorder = 4)

    average_max, = ax1.plot(observations,
                            account_weather['average_temp_max'],
                            color = '#0099FF',
                            linewidth = 1.50,
                            linestyle = '--',
                            zorder = 5)

    # left axis: temperature, fixed range so that the charts of all resorts are comparable
    yticks1 = np.arange(-20, 31, 5)
    yticks1_labels = [str(tick) + ' \u00B0C' for tick in yticks1]

    ax1.set_ylim([-20, 30])
    ax1.set_ylabel('Average temperature in degrees Celsius', fontsize = 11)
    ax1.set_yticks(yticks1)
    ax1.set_yticklabels(yticks1_labels, fontsize = 10)

    # right axis: precipitation, fixed range as well
    yticks2 = np.arange(0, 22, 3)
    yticks2_labels = [str(tick) + ' mm' for tick in yticks2]

    ax2 = ax1.twinx()

    ax2.set_ylim([0, 21])
    ax2.set_yticks(yticks2)
    ax2.set_yticklabels(yticks2_labels, fontsize = 10)

    precipitation = ax2.bar(observations,
                            account_weather['precipitation_mm'],
                            edgecolor = 'green',
                            color = '#CEFFCE',
                            linewidth = 1,
                            zorder = 3)

    average_precipitation, = ax2.plot(observations,
                                      account_weather['average_precipitation'],
                                      color = 'green',
                                      linewidth = 1.50,
                                      linestyle = '--',
                                      zorder = 4)

    ax2.set_ylabel('Precipitation in mm', fontsize = 11)

    legend = ax2.legend([minimum, average_min, maximum, average_max, precipitation, average_precipitation],
                        ['average minimum temperature in degrees Celsius per month',
                         'average minimum temperature in degrees Celsius, overall',
                         'average maximum temperature in degrees Celsius per month',
                         'average maximum temperature in degrees Celsius, overall',
                         'average daily precipitation in mm per month',
                         'average daily precipitation in mm, overall'],
                        loc = 2,
                        facecolor = 'white',
                        edgecolor = 'black',
                        borderaxespad = 1)

    ax1.set_xlim(0, 13)
    ax1.set_xticks(month_numbers)
    ax1.set_xticklabels(month_labels, fontsize = 10)
    ax1.grid(color = 'g', linestyle = '--', linewidth = 0.25, zorder = 0)
    ax1.set_xlabel('Month', fontsize = 11, labelpad = 10)

    # put ax1 in front of ax2 (temperature lines above the precipitation bars);
    # the background patch of ax1 is hidden so that ax2 stays visible
    ax1.set_zorder(ax2.get_zorder() + 1)
    ax1.patch.set_visible(False)

    ax2.set_title('\n' + str(resort_name) + ' - weather statistics based on 2011 - 2016\n', fontsize = 14)

    filename = 'weather_data/charts/weather_chart_' + str(resort_name) + '.png'
    filename = filename.replace(' ', '_')

    # save before showing, then release the figure
    fig.savefig(filename)
    plt.show()
    plt.close(fig)

In [None]:
# keep the pictures posted from 2013-09-01 (inclusive) up to 2017-09-01 (exclusive)
post_dates = account_data_03['date_post_min0']
in_study_period = (post_dates >= '2013-09-01') & (post_dates < '2017-09-01')
account_data_04 = account_data_03[in_study_period]

# number of pictures left
account_data_04.shape[0]

In [None]:
# detrend the number of likes based on the number of active Instagram users found on:
# https://www.statista.com/statistics/253577/number-of-monthly-active-instagram-users/

# [date, number of active users] pairs; the dates are ISO strings and are the
# knots of the interpolation built in the next cell.
# NOTE(review): the user counts are presumably in millions, as published on the
# Statista page referenced above -- confirm against the source.
user_stats = [['2013-01-01',  90],
              ['2013-02-01', 100],
              ['2013-06-01', 130],
              ['2013-09-01', 150],
              ['2014-03-01', 200],
              ['2014-12-01', 300],
              ['2015-09-01', 400],
              ['2016-06-01', 500],
              ['2016-12-01', 600],
              ['2017-04-01', 700],
              ['2017-09-01', 800]]

In [None]:
from scipy.interpolate import interp1d
import datetime

# Daily estimate of the number of active Instagram users: cubic interpolation
# through the (date, users) knots in `user_stats`, evaluated for every day from
# 2013-01-01 up to and including 2017-08-31.

# knots, with the dates expressed as proleptic Gregorian ordinals (plain numbers for interp1d)
knots = pd.DataFrame(user_stats, columns = ['date', 'users'])
knot_ordinals = pd.to_datetime(knots['date']).apply(lambda x: x.toordinal())

f = interp1d(knot_ordinals, knots['users'], kind = 'cubic')

base = datetime.datetime(2013, 1, 1)
last_day = datetime.datetime(2017, 8, 31)
number_of_days = (last_day - base).days + 1    # 1704 days, both end points included
date_list = [(base + datetime.timedelta(days = x)).toordinal() for x in range(0, number_of_days)]

new_users = f(date_list)

df_users = pd.DataFrame({'date': date_list, 'users': new_users})


def integer_to_datetime(int_date):
    """Convert a proleptic Gregorian ordinal (day 1 = 0001-01-01) to a datetime at midnight."""
    return datetime.datetime.fromordinal(int(int_date))


df_users['date'] = df_users['date'].apply(integer_to_datetime)

# .copy(): the column added below must go into an independent frame, not into a
# view of the unfiltered one (avoids pandas' SettingWithCopyWarning)
df_users = df_users[df_users['date'] >= '2013-09-01'].copy()

# let's create an index in order to correct features for the number of active Instagram users:
# the number of users on a day relative to the largest number of users in the period
df_users['index_users'] = df_users['users'] / df_users['users'].max()

df_users.set_index('date', inplace = True)

In [None]:
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

# line chart of the interpolated daily number of active users
fig, ax = plt.subplots(figsize = (10, 7))

ax.set_title('\nInterpolation of the number of active Instagram users\n', fontsize = 14)
ax.plot(df_users['users'], color = '#FF9933', linewidth = 0.75, linestyle = '-')
ax.grid(color = 'grey', linestyle = '--', linewidth = 0.25)

plt.show()

filename = 'results/interpolation_number_likes.png'
fig.savefig(filename)

In [None]:
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

# line chart of the user index (daily users relative to the maximum of the period)
fig, ax = plt.subplots(figsize = (10, 7))

ax.set_title('\nPercentage of users compared to September 2017\n', fontsize = 14)
ax.plot(df_users['index_users'], color = '#FF9933', linewidth = 0.75, linestyle = '-')
ax.grid(color = 'grey', linestyle = '--', linewidth = 0.25)

# y axis from 0 % to 100 % in steps of 10 %, with a little headroom above 100 %
ax.set_ylim([0, 1.05])
yticks_major = np.round(np.linspace(0, 1, 11), 1)
yticks_labels = [str(int(percentage)) + ' %' for percentage in yticks_major * 100]
ax.set_yticks(yticks_major)
ax.set_yticklabels(yticks_labels, fontsize = 11)

plt.show()

filename = 'results/index_number_users.png'
fig.savefig(filename)

In [None]:
# quick look at the first rows of the daily user table (indexed by date at this point)
df_users.head()

In [None]:
# Build the modelling table `account_data_04`:
#   - attach the daily user index to every picture (by posting date)
#   - drop outliers and correct likes / comments / hash tags for the growth of Instagram
#   - scale the 2010 census populations, add calendar and hash tag dummies

# 'date' back as an ordinary column, so that it can be used as merge key
df_users = df_users.reset_index()[['date', 'users', 'index_users']]

account_data_04 = account_data_03.merge(df_users, left_on = 'date_post_min0', right_on = 'date', how = 'inner')

# outliers: pictures without likes or with 15000 likes or more
number_with_outliers = account_data_04.shape[0]
has_plausible_likes = (account_data_04['likes_image'] > 0) & (account_data_04['likes_image'] < 15000)
# .copy(): the new columns below are added to an independent frame instead of to
# a slice of the merged frame (avoids pandas' SettingWithCopyWarning)
account_data_04 = account_data_04[has_plausible_likes].copy()
number_without_outliers = account_data_04.shape[0]

print('Number of outliers removed:', number_with_outliers - number_without_outliers)

# counts divided by the user index of the posting date
for count_column in ['likes_image', 'comments_image', 'hashtags_image']:
    account_data_04[count_column + '_corrected'] = account_data_04[count_column] / account_data_04['index_users']

account_data_04['preserved_avg_snowfall'] = account_data_04['average_yearly_snowfall'] * account_data_04['tss_preservation']

# likes_image > 0 is guaranteed by the outlier filter, so the log is defined
account_data_04['log_likes_image_corrected'] = np.log(account_data_04['likes_image_corrected'])

# update populations to 2017-08-31
#   - US Census 2010 data contains population data by the end of 2010
#   - according to https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?src=bkmk
#     the US population increased by a factor 325.779.795 / 310.605.432 = 1.0488541456029654 in this time period
population_growth_factor = 1.0488541456029654

# <sex>_<age group>_<area> for every combination, grouped by area, then age group, then sex
population_list = [sex + '_' + age_group + '_' + area
                   for area in ['5_digits', '4_digits', '3_digits', 'state']
                   for age_group in ['18_29', '30_49', 'total']
                   for sex in ['female', 'male', 'total']]

for population in population_list:
    account_data_04[population] = account_data_04[population] * population_growth_factor

# day of the week (Monday = 0 - Sunday = 6)
account_data_04['day_of_the_week'] = account_data_04['date_post_min0'].dt.weekday

# three mutually exclusive dummies: working hours (Monday - Friday, 8 up to and including 18 h),
# off hours (Monday - Friday outside those hours) and weekend (Saturday / Sunday)
is_workday = account_data_04['day_of_the_week'].isin([0, 1, 2, 3, 4])
in_office_hours = (account_data_04['hour_24_local'] >= 8) & (account_data_04['hour_24_local'] <= 18)
outside_office_hours = (account_data_04['hour_24_local'] < 8) | (account_data_04['hour_24_local'] > 18)

account_data_04['working_hours'] = np.where(is_workday & in_office_hours, 1, 0)
account_data_04['off_hours'] = np.where(is_workday & outside_office_hours, 1, 0)
account_data_04['weekend'] = np.where(account_data_04['day_of_the_week'].isin([5, 6]), 1, 0)

# convert day of the week into dummies
account_data_04 = pd.get_dummies(account_data_04, prefix = 'day', columns = ['day_of_the_week'])
account_data_04.rename(columns = {'day_0': 'Monday',
                                  'day_1': 'Tuesday',
                                  'day_2': 'Wednesday',
                                  'day_3': 'Thursday',
                                  'day_4': 'Friday',
                                  'day_5': 'Saturday',
                                  'day_6': 'Sunday'}, inplace = True)

# summer (June - September) and winter (November - April) dummies
account_data_04['summer'] = np.where(account_data_04['month'].isin([6, 7, 8, 9]), 1, 0)
account_data_04['winter'] = np.where(account_data_04['month'].isin([11, 12, 1, 2, 3, 4]), 1, 0)

# hash tag dummy: 1 when the picture has at least one hash tag
account_data_04['hashtag_dummy'] = np.where(account_data_04['hashtags_image_corrected'] > 0, 1, 0)

print('Number of records in account_data_03:', account_data_03.shape[0])
print('Number of records in account_data_04:', account_data_04.shape[0])

In [None]:
# number of pictures posted after June 2017 (year_month is compared as the number YYYYMM)
posted_after_june_2017 = account_data_04['year_month'] > 201706
posted_after_june_2017.sum()

In [None]:
# keep the pictures posted up to and including June 2017
print('Number of records in account_data_04 before removing 201707 & 201708:', account_data_04.shape[0])
until_june_2017 = account_data_04['year_month'] <= 201706
account_data_04 = account_data_04.loc[until_june_2017]
print('Number of records in account_data_04 after removing 201707 & 201708:', account_data_04.shape[0])

In [None]:
# sanity check: every picture should fall in exactly one of the three categories,
# so the row sums of the dummies should all be equal to one
time_dummies = account_data_04[['working_hours', 'off_hours', 'weekend']]
time_dummies.sum(axis = 1).value_counts()

In [None]:
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)
from matplotlib.patches import Rectangle

# Histogram of the corrected number of likes (bottom x axis) and of its log
# (top x axis), sharing the frequency axis.
#
# Changes compared with the previous version of this cell:
#   - `normed = False` removed: the keyword no longer exists in Matplotlib >= 3.1
#     and un-normalised counts are the default anyway
#   - the legend patches use facecolor / edgecolor; combined with `color`, the
#     edge colour was overridden
#   - the histograms are drawn on explicitly named axes

fig = plt.figure(figsize = (12, 6))
ax1 = fig.add_subplot(111)

ax1.set_title('Histogram of the (log) number of likes', fontsize = 14, y = 1.15)

ax1.yaxis.grid(color = '#333333', linestyle = '--', linewidth = 0.25)

likes = ax1.hist(account_data_04['likes_image_corrected'],
                 bins = 150,
                 color = "#0099FF",
                 ec = "black",
                 alpha = 0.75,
                 linewidth = 0.5,
                 label = 'number of likes')

ax1.set_xlim([0, 20000])
ax1.set_ylim([0, 6000])
ax1.set_xlabel('Number of likes', fontsize = 11, labelpad = 10)
ax1.set_ylabel('Frequency', fontsize = 11)
ax1.set_axisbelow(True)

# second x axis on top for the log scale values
ax2 = ax1.twiny()

loglikes = ax2.hist(account_data_04['log_likes_image_corrected'],
                    bins = 50,
                    color = '#FF9933',
                    ec = 'black',
                    alpha = 0.75,
                    linewidth = 0.5,
                    label = 'log of the number of likes')

ax2.set_xlim([0, 16])
ax2.set_xlabel('Log of the number of likes', fontsize = 11,  labelpad = 10)

# one legend for both histograms, built from proxy patches in the histogram colours
handles = [Rectangle((0, 0), 1, 1, facecolor = c, edgecolor = 'k') for c in ['#0099FF', '#FF9933']]
labels = ['number of likes', 'log of the number of likes']
ax2.legend(handles, labels, borderaxespad = 1)

filename = 'results/histogram_number_likes_corrected.png'
fig.savefig(filename)

plt.show()

In [None]:
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

# Total number of likes per posting date, uncorrected and corrected for the
# number of active Instagram users.
#
# Changes compared with the previous version of this cell:
#   - no `plt.figure()` before `plt.subplots()`, which left an extra, empty figure behind
#   - both lines are plotted from a Series (one of them was passed as a DataFrame)
#   - the no-op `filename.replace(" ", "_")` is gone

graph_likes = account_data_04[['likes_image', 'likes_image_corrected', 'date']].groupby('date').sum()

fig, ax = plt.subplots(figsize = (12, 6))

ax.set_title('\nDevelopment of the total number of likes per day for all 80 ski resorts\n', fontsize = 14)

ax.yaxis.grid(color = '#333333', linestyle = '--', linewidth = 0.25)

likes_image, = ax.plot(graph_likes.index,
                       graph_likes['likes_image'],
                       color = '#FF9933',
                       linewidth = 0.75,
                       linestyle = '-')

likes_image_corrected, = ax.plot(graph_likes.index,
                                 graph_likes['likes_image_corrected'],
                                 color = '#0099FF',
                                 linewidth = 0.75,
                                 linestyle = '-')

ax.legend([likes_image, likes_image_corrected],
          ['number of likes (uncorrected for the number of active Instagram users)',
           'number of likes (corrected for the number of active Instagram users)'], loc = 2, borderaxespad = 1)

# axis limits as timestamps, so that they do not depend on string-to-date conversion
ax.set_xlim([pd.Timestamp('2013-09-01'), pd.Timestamp('2017-06-30')])
ax.set_ylim([0, 350000])

ax.set_axisbelow(True)

filename = 'results/time_series_total_number_likes.png'
fig.savefig(filename)

plt.show()

In [None]:
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

# Number of pictures per posting date (all resorts together).
#
# Changes compared with the previous version of this cell:
#   - no `plt.figure()` before `plt.subplots()`, which left an extra, empty figure behind
#   - the no-op `filename.replace(" ", "_")` is gone

graph_dates = account_data_04[['date', 'image_id']].groupby('date').count()
graph_dates.rename(columns = {'image_id': 'number_pictures'}, inplace = True)

fig, ax = plt.subplots(figsize = (12, 6))

ax.set_title('\nNumber of posted pictures (aggregated over the 80 ski resorts) per day\n', fontsize = 14)

ax.yaxis.grid(color = '#333333', linestyle = '--', linewidth = 0.25)

number_pictures, = ax.plot(graph_dates.index, graph_dates['number_pictures'],
                           color = '#FF9933',
                           linewidth = 0.75,
                           linestyle = '-')

# axis limits as timestamps, so that they do not depend on string-to-date conversion
ax.set_xlim([pd.Timestamp('2013-09-01'), pd.Timestamp('2017-06-30')])
ax.set_ylim([0, 120])
ax.set_axisbelow(True)

filename = 'results/time_series_number_pictures_per_day.png'
fig.savefig(filename)

plt.show()

In [None]:
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

# Average number of likes per posting date, uncorrected and corrected for the
# number of active Instagram users.
#
# Changes compared with the previous version of this cell:
#   - no `plt.figure()` before `plt.subplots()`, which left an extra, empty figure behind
#   - the no-op `filename.replace(" ", "_")` is gone

graph_likes = account_data_04[['likes_image', 'likes_image_corrected', 'date']].groupby('date').mean()

fig, ax = plt.subplots(figsize = (12, 6))

ax.set_title('\nDevelopment of the average number of likes per day for all 80 ski resorts\n', fontsize = 14)

ax.yaxis.grid(color = '#333333', linestyle = '--', linewidth = 0.25)

# zorder = 3 keeps the uncorrected line on top of the corrected one
likes_image, = ax.plot(graph_likes.index,
                       graph_likes['likes_image'],
                       color = '#FF9933',
                       linewidth = 0.75,
                       linestyle = '-',
                       zorder = 3)

likes_image_corrected, = ax.plot(graph_likes.index,
                                 graph_likes['likes_image_corrected'],
                                 color = '#0099FF',
                                 linewidth = 0.75,
                                 linestyle = '-')

ax.legend([likes_image, likes_image_corrected],
          ['number of likes (uncorrected for the number of active Instagram users)',
           'number of likes (corrected for the number of active Instagram users)'], loc = 2)

# axis limits as timestamps, so that they do not depend on string-to-date conversion
ax.set_xlim([pd.Timestamp('2013-09-01'), pd.Timestamp('2017-06-30')])
ax.set_ylim([0, 7500])
ax.set_axisbelow(True)

filename = 'results/time_series_average_number_likes.png'
fig.savefig(filename)

plt.show()

In [None]:
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

# Scatter plot of the `days_online` column against `likes_image_corrected`,
# drawn on a square 15 x 15 inch figure.
ax = plt.figure(figsize = (15, 15)).gca()
ax.scatter(account_data_04['days_online'], account_data_04['likes_image_corrected'])

ax.set_ylabel('Number of likes of the picture', fontsize = 11)
ax.set_xlabel('Number of days since the picture has been posted', fontsize = 11, labelpad = 10)

plt.show()

In [None]:
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

# Histogram of the corrected number of hash tags per picture.
# One plt.subplots() call is enough: the former extra plt.figure() only left
# an empty, unused figure behind.
fig, ax = plt.subplots()
fig.set_size_inches(12, 6)

# thin dashed horizontal grid lines, kept behind the bars
ax.yaxis.grid(color = '#333333', linestyle = '--', linewidth = 0.25)
ax.set_axisbelow(True)

# Raw counts are the default. The old `normed = False` keyword is omitted
# because current Matplotlib releases no longer accept `normed` at all.
ax.hist(account_data_04['hashtags_image_corrected'],
        bins = 100,
        color = "#0099FF",
        ec = "black",
        linewidth = 0.5)

ax.set_xlim([0, 40])
ax.set_ylim([0, 10000])
ax.set_title('\nHistogram of the number of hash tags (corrected)\n', fontsize = 14)
ax.set_xlabel('Number of hash tags', fontsize = 11, labelpad = 10)
ax.set_ylabel('Frequency', fontsize = 11)

# save before plt.show(): some backends release the figure once it is shown
filename = 'results/histogram_number_hashtags_corrected.png'
fig.savefig(filename)

plt.show()

In [None]:
# scipy.stats supplies the standard error and Student t quantiles used in this cell
import scipy as sp
import scipy.stats

# restore matplotlib's default rcParams before plotting
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

def mean_confidence_interval(data, confidence = 0.95):
    """Return the half-width of the confidence interval around the mean.

    The value is ``sem(data) * t_quantile`` where the quantile is taken from
    a Student t distribution with ``len(data) - 1`` degrees of freedom, so
    the interval itself is ``mean - h`` .. ``mean + h``. Only ``h`` is
    returned; the mean is not.

    Parameters
    ----------
    data : array-like
        Observations. For 2-D input the standard error is computed per
        column (``scipy.stats.sem`` default axis), giving one half-width
        per column.
    confidence : float, optional
        Two-sided confidence level, 0.95 by default.

    Returns
    -------
    float or numpy.ndarray
        Half-width of the confidence interval (scalar for 1-D input).
    """
    a = 1.0 * np.array(data)
    n = len(a)
    se = scipy.stats.sem(a)
    # public `ppf` instead of the private `_ppf`, which skips argument checks
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)
    return h

# Features compared on their average number of likes. The plot labels the two
# x positions 0 and 1 as 'No' / 'Yes', so each feature is assumed to be a 0/1 dummy.
features = ['hashtag_dummy', 'working_hours', 'off_hours', 'weekend', 'summer', 'winter',
            'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

for feature in features:
    likes_by_group = account_data_04.groupby(feature)['likes_image_corrected']

    # mean number of likes per group (columns: feature, likes_image_corrected)
    graph = likes_by_group.mean().reset_index()

    # Half-width of the confidence interval per group, used as error bar.
    # Only the likes column is passed to the helper, so the result does not
    # depend on whether groupby.apply hands the grouping column to the function.
    std_error = likes_by_group.apply(mean_confidence_interval).reset_index(drop = True)

    fig, ax = plt.subplots(figsize = (8, 5))

    ax.bar(graph[feature],
           graph['likes_image_corrected'],
           edgecolor = 'black',
           color = '#0099FF',
           linewidth = 0.5,
           alpha = 1,
           width = 0.5,
           yerr = std_error,
           capsize = 7)

    xlabel = feature.replace('_', ' ')

    ax.set_ylim([0, 4500])
    ax.set_title('\nAverage number of likes per group\n', fontsize = 14)
    ax.set_xlabel(xlabel, fontsize = 11, labelpad = 10)
    ax.set_ylabel('Number of likes of the picture', fontsize = 11)

    xlabels = ['No', 'Yes']

    ax.yaxis.grid(color = '#333333', linestyle = '--', linewidth = 0.25)
    ax.set_axisbelow(True)
    ax.set_xticks(range(2))
    ax.set_xticklabels(xlabels, fontsize = 10)
    ax.margins(0.2, 0.2)

    # save before plt.show(): some backends release the figure once it is shown
    filename = 'results/' + feature + '.png'
    fig.savefig(filename)

    plt.show()

    # one figure per feature: close it so figures do not pile up in memory
    plt.close(fig)

In [None]:
# explore correlations between the number of likes and the non-weather features
import seaborn as sns

plt.figure(figsize = (7, 5))

corr = account_data_04[['likes_image_corrected',
                        'comments_image_corrected',
                        'hashtags_image_corrected',
                        'days_online',
                        'month',
                        'quarter']].corr()

# correlation of every selected column with the corrected number of likes
print(corr['likes_image_corrected'])

# use matplotlib's pyplot directly: current seaborn releases no longer expose `sns.plt`
plt.title('\nCorrelation matrix\n-\nnumber of likes vs. non-weather features\n', fontsize = 14)
sns_plot = sns.heatmap(corr, xticklabels = corr.columns.values, yticklabels = corr.columns.values)
plt.show()

In [None]:
# explore correlations between the number of likes and the resort-specific features
plt.figure(figsize = (7, 5))
corr = account_data_04[['likes_image_corrected',
                        'followers',
                        'number_images_total',
                        'PAF',
                        'ranking_overall',
                        'ranking_region',
                        'ranking_state',
                        'average_yearly_snowfall',
                        'tss_preservation',
                        'preserved_avg_snowfall']].corr()

# correlation of every selected column with the corrected number of likes
print(corr['likes_image_corrected'])

# use matplotlib's pyplot directly: current seaborn releases no longer expose `sns.plt`
plt.title('\nCorrelation matrix\n-\nnumber of likes vs. resort-specific features\n', fontsize = 14)
sns.heatmap(corr, xticklabels = corr.columns.values, yticklabels = corr.columns.values)
plt.show()

In [None]:
# explore correlations between the number of likes and 5-digit zip-code populations
plt.figure(figsize = (7, 5))
corr = account_data_04[['likes_image_corrected',
                        'female_18_29_5_digits',
                        'male_18_29_5_digits',
                        'total_18_29_5_digits',
                        'female_30_49_5_digits',
                        'male_30_49_5_digits',
                        'total_30_49_5_digits',
                        'female_total_5_digits',
                        'male_total_5_digits',
                        'total_total_5_digits',
                        'followers']].corr()

# correlations with the corrected number of likes and with the follower count
print(corr[['likes_image_corrected', 'followers']])

# use matplotlib's pyplot directly: current seaborn releases no longer expose `sns.plt`
plt.title('\nCorrelation matrix\n-\nnumber of likes vs. 5-digit zip-code populations\n', fontsize = 14)
sns.heatmap(corr, xticklabels = corr.columns.values, yticklabels = corr.columns.values)
plt.show()

In [None]:
# explore correlations between the number of likes and 4-digit zip-code populations
plt.figure(figsize = (7, 5))
corr = account_data_04[['likes_image_corrected',
                        'female_18_29_4_digits',
                        'male_18_29_4_digits',
                        'total_18_29_4_digits',
                        'female_30_49_4_digits',
                        'male_30_49_4_digits',
                        'total_30_49_4_digits',
                        'female_total_4_digits',
                        'male_total_4_digits',
                        'total_total_4_digits',
                        'followers']].corr()

# correlations with the corrected number of likes and with the follower count
print(corr[['likes_image_corrected', 'followers']])

# use matplotlib's pyplot directly: current seaborn releases no longer expose `sns.plt`
plt.title('\nCorrelation matrix\n-\nnumber of likes vs. 4-digit zip-code populations\n', fontsize = 14)
sns.heatmap(corr, xticklabels = corr.columns.values, yticklabels = corr.columns.values)
plt.show()

In [None]:
# explore correlations between the number of likes and 3-digit zip-code populations
plt.figure(figsize = (7, 5))
corr = account_data_04[['likes_image_corrected',
                        'female_18_29_3_digits',
                        'male_18_29_3_digits',
                        'total_18_29_3_digits',
                        'female_30_49_3_digits',
                        'male_30_49_3_digits',
                        'total_30_49_3_digits',
                        'female_total_3_digits',
                        'male_total_3_digits',
                        'total_total_3_digits',
                        'followers']].corr()

# correlations with the corrected number of likes and with the follower count
print(corr[['likes_image_corrected', 'followers']])

# use matplotlib's pyplot directly: current seaborn releases no longer expose `sns.plt`
plt.title('\nCorrelation matrix\n-\nnumber of likes vs. 3-digit zip-code populations\n', fontsize = 14)
sns.heatmap(corr, xticklabels = corr.columns.values, yticklabels = corr.columns.values)
plt.show()

In [None]:
# explore correlations between the number of likes and state populations
plt.figure(figsize = (7, 5))
corr = account_data_04[['likes_image_corrected',
                        'female_18_29_state',
                        'male_18_29_state',
                        'total_18_29_state',
                        'female_30_49_state',
                        'male_30_49_state',
                        'total_30_49_state',
                        'female_total_state',
                        'male_total_state',
                        'total_total_state',
                        'followers']].corr()

# correlations with the corrected number of likes and with the follower count
print(corr[['likes_image_corrected', 'followers']])

# use matplotlib's pyplot directly: current seaborn releases no longer expose `sns.plt`
plt.title('\nCorrelation matrix\n-\nnumber of likes vs. state populations\n', fontsize = 14)
sns.heatmap(corr, xticklabels = corr.columns.values, yticklabels = corr.columns.values)
plt.show()

In [None]:
# explore correlations between the number of likes and the weather features before posting the picture
plt.figure(figsize = (7, 5))
corr = account_data_04[['likes_image_corrected',
                        'precipitation_mm_min0',
                        'temp_max_celsius_min0',
                        'temp_min_celsius_min0',
                        'precipitation_mm_min1',
                        'temp_max_celsius_min1',
                        'temp_min_celsius_min1',
                        'precipitation_mm_min2',
                        'temp_max_celsius_min2',
                        'temp_min_celsius_min2',
                        'precipitation_mm_min3',
                        'temp_max_celsius_min3',
                        'temp_min_celsius_min3']].corr()

# correlation of every selected column with the corrected number of likes
print(corr['likes_image_corrected'])

# use matplotlib's pyplot directly: current seaborn releases no longer expose `sns.plt`
plt.title('\nCorrelation matrix\n-\nnumber of likes vs. weather features before the post\n', fontsize = 14)
sns.heatmap(corr, xticklabels = corr.columns.values, yticklabels = corr.columns.values)
plt.show()

In [None]:
# explore correlations between the number of likes and the weather-features after posting the picture
plt.figure(figsize = (7, 5))
# NOTE: 'date_post_plus3' used to be in this list. It is a date column, not a
# weather measurement (the plus1 / plus2 groups have no such entry), and a
# non-numeric column cannot take part in a correlation, so it is left out.
corr = account_data_04[['likes_image_corrected',
                        'precipitation_mm_plus1',
                        'temp_max_celsius_plus1',
                        'temp_min_celsius_plus1',
                        'precipitation_mm_plus2',
                        'temp_max_celsius_plus2',
                        'temp_min_celsius_plus2',
                        'precipitation_mm_plus3',
                        'temp_max_celsius_plus3',
                        'temp_min_celsius_plus3']].corr()

# correlation of every selected column with the corrected number of likes
print(corr['likes_image_corrected'])

# use matplotlib's pyplot directly: current seaborn releases no longer expose `sns.plt`
plt.title('\nCorrelation matrix\n-\nnumber of likes vs. weather features after the post\n', fontsize = 14)
sns.heatmap(corr, xticklabels = corr.columns.values, yticklabels = corr.columns.values)
plt.show()

In [None]:
# explore correlations between the number of likes and the resort minimum temperature
plt.figure(figsize = (7, 5))
corr = account_data_04[['likes_image_corrected',
                        'temp_min_celsius_1',
                        'temp_min_celsius_2',
                        'temp_min_celsius_3',
                        'temp_min_celsius_4',
                        'temp_min_celsius_5',
                        'temp_min_celsius_6',
                        'temp_min_celsius_7',
                        'temp_min_celsius_8',
                        'temp_min_celsius_9',
                        'temp_min_celsius_10',
                        'temp_min_celsius_11',
                        'temp_min_celsius_12']].corr()

# correlation of every selected column with the corrected number of likes
print(corr['likes_image_corrected'])

# use matplotlib's pyplot directly: current seaborn releases no longer expose `sns.plt`
# (title size 14, the same as the other correlation matrices)
plt.title('\nCorrelation matrix\n-\nnumber of likes vs. resort minimum temperature\n', fontsize = 14)
sns.heatmap(corr, xticklabels = corr.columns.values, yticklabels = corr.columns.values)
plt.show()

In [None]:
# explore correlations between the number of likes and the resort maximum temperature
plt.figure(figsize = (7, 5))
corr = account_data_04[['likes_image_corrected',
                        'temp_max_celsius_1',
                        'temp_max_celsius_2',
                        'temp_max_celsius_3',
                        'temp_max_celsius_4',
                        'temp_max_celsius_5',
                        'temp_max_celsius_6',
                        'temp_max_celsius_7',
                        'temp_max_celsius_8',
                        'temp_max_celsius_9',
                        'temp_max_celsius_10',
                        'temp_max_celsius_11',
                        'temp_max_celsius_12']].corr()

# correlation of every selected column with the corrected number of likes
print(corr['likes_image_corrected'])

# use matplotlib's pyplot directly: current seaborn releases no longer expose `sns.plt`
plt.title('\nCorrelation matrix\n-\nnumber of likes vs. resort maximum temperature\n', fontsize = 14)
sns.heatmap(corr, xticklabels = corr.columns.values, yticklabels = corr.columns.values)
plt.show()

In [None]:
# explore correlations between the number of likes and the resort precipitation
plt.figure(figsize = (7, 5))
corr = account_data_04[['likes_image_corrected',
                        'precipitation_mm_1',
                        'precipitation_mm_2',
                        'precipitation_mm_3',
                        'precipitation_mm_4',
                        'precipitation_mm_5',
                        'precipitation_mm_6',
                        'precipitation_mm_7',
                        'precipitation_mm_8',
                        'precipitation_mm_9',
                        'precipitation_mm_10',
                        'precipitation_mm_11',
                        'precipitation_mm_12']].corr()

# correlation of every selected column with the corrected number of likes
print(corr['likes_image_corrected'])

# use matplotlib's pyplot directly: current seaborn releases no longer expose `sns.plt`
plt.title('\nCorrelation matrix\n-\nnumber of likes vs. resort precipitation\n', fontsize = 14)
sns.heatmap(corr, xticklabels = corr.columns.values, yticklabels = corr.columns.values)
plt.show()

In [None]:
# explore correlations between the number of likes and the weather-features before posting the picture
# (the `abs_dev_*` columns; presumably deviations from the resort's usual values - TODO confirm)
plt.figure(figsize = (7, 5))
corr = account_data_04[['likes_image_corrected',
                        'abs_dev_precipitation_min0',
                        'abs_dev_min_temp_min0',
                        'abs_dev_max_temp_min0',
                        'abs_dev_precipitation_min1',
                        'abs_dev_min_temp_min1',
                        'abs_dev_max_temp_min1',
                        'abs_dev_precipitation_min2',
                        'abs_dev_min_temp_min2',
                        'abs_dev_max_temp_min2',
                        'abs_dev_precipitation_min3',
                        'abs_dev_min_temp_min3',
                        'abs_dev_max_temp_min3']].corr()

# correlation of every selected column with the corrected number of likes
print(corr['likes_image_corrected'])

# The columns are the `min0`..`min3` ones, so the title says "before the post"
# (it used to say "after", copied from the `plus` variant of this plot).
# pyplot is used directly: current seaborn releases no longer expose `sns.plt`.
plt.title('\nCorrelation matrix\n-\nnumber of likes vs. weather features before the post\n', fontsize = 14)
sns.heatmap(corr, xticklabels = corr.columns.values, yticklabels = corr.columns.values)
plt.show()

In [None]:
# explore correlations between the number of likes and the weather-features after posting the picture
# (the `abs_dev_*` columns; presumably deviations from the resort's usual values - TODO confirm)
plt.figure(figsize = (7, 5))
corr = account_data_04[['likes_image_corrected',
                        'abs_dev_precipitation_plus1',
                        'abs_dev_min_temp_plus1',
                        'abs_dev_max_temp_plus1',
                        'abs_dev_precipitation_plus2',
                        'abs_dev_min_temp_plus2',
                        'abs_dev_max_temp_plus2',
                        'abs_dev_precipitation_plus3',
                        'abs_dev_min_temp_plus3',
                        'abs_dev_max_temp_plus3']].corr()

# correlation of every selected column with the corrected number of likes
print(corr['likes_image_corrected'])

# use matplotlib's pyplot directly: current seaborn releases no longer expose `sns.plt`
plt.title('\nCorrelation matrix\n-\nnumber of likes vs. weather features after the post\n', fontsize = 14)
sns.heatmap(corr, xticklabels = corr.columns.values, yticklabels = corr.columns.values)
plt.show()

In [None]:
# Bin the corrected number of likes into five equally sized quantile classes.
# Labels follow the bin order, so the lowest fifth becomes '5. LL' and the
# highest fifth '1. HH'.
likes_quantile_edges = [0, .2, .4, .6, .8, 1.]
likes_quantile_labels = ['5. LL', '4. L', '3. M', '2. H', '1. HH']

account_data_04['likes_groups'] = pd.qcut(account_data_04['likes_image_corrected'],
                                          likes_quantile_edges,
                                          labels = likes_quantile_labels)

In [None]:
# number of pictures in each likes class
account_data_04['likes_groups'].value_counts()

In [None]:
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

# sanity check, are the average values increasing per group?
graph = account_data_04[['likes_groups', 'likes_image_corrected']].groupby('likes_groups').mean().reset_index()

# `likes_groups` is a categorical whose categories run from '5. LL' to '1. HH',
# and groupby returns the rows in that category order. Sort by the label text
# instead ('1. HH' first) and take the tick labels from the very same rows, so
# a bar can never carry the label of another class.
graph['likes_groups'] = graph['likes_groups'].astype(str)
graph = graph.sort_values('likes_groups').reset_index(drop = True)

fig, ax = plt.subplots(figsize = (8, 5))

groups = range(len(graph))
# '1. HH' -> 'HH', '2. H' -> 'H', ...
xlabels = [label.split('. ', 1)[-1] for label in graph['likes_groups']]

ax.bar(groups,
       graph['likes_image_corrected'],
       edgecolor = 'black',
       color = '#0099FF',
       linewidth = 0.5)

ax.set_ylim([0, 10000])
ax.set_title('\nAverage number of likes per class\n', fontsize = 14)
ax.set_xlabel('Class', fontsize = 11, labelpad = 10)
ax.set_ylabel('Average number of likes', fontsize = 11)

ax.yaxis.grid(color = '#333333', linestyle = '--', linewidth = 0.25)
ax.set_axisbelow(True)
ax.set_xticks(groups)
ax.set_xticklabels(xlabels, fontsize = 10)

# save before plt.show(): some backends release the figure once it is shown
filename = 'results/histogram_likes_groups.png'
fig.savefig(filename)

plt.show()

In [None]:
# Build the analysis dataset: a fixed, ordered selection of columns from
# account_data_04. The columns are grouped by theme below and concatenated
# in the order in which they appear in the final frame.

# target variables and identifiers
final_cols_targets_ids = ['likes_groups', 'likes_image_corrected', 'log_likes_image_corrected',
                          'resort', 'accountname', 'image_id']

# dummies
final_cols_dummies = ['hashtag_dummy', 'working_hours', 'off_hours', 'summer', 'winter',
                      'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# resort dummies
final_cols_resorts = [
    'resort_49 Degrees North', 'resort_Alta', 'resort_Alyeska', 'resort_Anthony Lakes Mountain',
    'resort_Arapahoe Basin', 'resort_Aspen', 'resort_Aspen Snowmass', 'resort_Bear Valley Mountain',
    'resort_Beaver Creek', 'resort_Beaver Mountain', 'resort_Big Sky', 'resort_Bogus Basin',
    'resort_Breckenridge', 'resort_Brian Head', 'resort_Bridger Bowl', 'resort_Brighton',
    'resort_Brundage Mountain', 'resort_Copper Mountain', 'resort_Crested Butte', 'resort_Crystal Mountain',
    'resort_Deer Valley', 'resort_Diamond Peak', 'resort_Discovery Ski Area', 'resort_Dodge Ridge',
    'resort_Eagle Point', 'resort_Eaglecrest', 'resort_Eldora Mountain', 'resort_Grand Targhee',
    'resort_Heavenly', 'resort_Homewood', 'resort_Jackson Hole', 'resort_Jay Peak',
    'resort_Keystone', 'resort_Killington', 'resort_Kirkwood', 'resort_Lookout Pass',
    'resort_Lost Trail Powder Mountain', 'resort_Loveland', 'resort_Mad River Glen', 'resort_Mammoth Mountain',
    'resort_Monarch Mountain', 'resort_Mount Bachelor', 'resort_Mount Baker', 'resort_Mount Hood Meadows',
    'resort_Mount Hood Skibowl', 'resort_Mount Rose', 'resort_Northstar California', 'resort_Park City',
    'resort_Pebble Creek', 'resort_Powder Mountain', 'resort_Powderhorn', 'resort_Purgatory',
    'resort_Red Lodge Mountain', 'resort_Schweitzer', 'resort_Sierra at Tahoe', 'resort_Silver Mountain',
    'resort_Ski Santa Fe', 'resort_Smugglers Notch', 'resort_Snowbasin', 'resort_Snowbird',
    'resort_Solitude', 'resort_Squaw Alpine', 'resort_Steamboat', 'resort_Stevens Pass',
    'resort_Stowe', 'resort_Sugar Bowl', 'resort_Sugarbush', 'resort_Sun Valley',
    'resort_Sundance Resort', 'resort_Sunlight Mountain', 'resort_Tamarack', 'resort_Taos',
    'resort_Telluride', 'resort_The Summit at Snoqualmie', 'resort_Timberline Lodge', 'resort_Vail',
    'resort_White Pass', 'resort_Whitefish', 'resort_Winter Park', 'resort_Wolf Creek',
]

# state dummies
final_cols_states = ['state_Alaska', 'state_California', 'state_Colorado', 'state_Idaho',
                     'state_Montana', 'state_Nevada', 'state_New_Mexico', 'state_Oregon',
                     'state_Utah', 'state_Vermont', 'state_Washington', 'state_Wyoming']

# picture specific
final_cols_picture = ['days_online', 'hour_24_local']

# resort specific
final_cols_resort_info = ['followers', 'number_images_total', 'PAF',
                          'ranking_overall', 'ranking_region', 'ranking_state',
                          'average_yearly_snowfall', 'tss_preservation', 'preserved_avg_snowfall']

# population 18-29 per gender and total
final_cols_population = ['female_18_29_state', 'male_18_29_state', 'total_18_29_state']

# picture specific weather features: measured values, then the abs_dev_* columns
final_cols_weather_picture = [
    'precipitation_mm_min0', 'temp_max_celsius_min0', 'temp_min_celsius_min0',
    'precipitation_mm_min1', 'temp_max_celsius_min1', 'temp_min_celsius_min1',
    'precipitation_mm_plus1', 'temp_max_celsius_plus1', 'temp_min_celsius_plus1',
    'abs_dev_precipitation_min0', 'abs_dev_min_temp_min0', 'abs_dev_max_temp_min0',
    'abs_dev_precipitation_min1', 'abs_dev_min_temp_min1', 'abs_dev_max_temp_min1',
    'abs_dev_precipitation_plus1', 'abs_dev_min_temp_plus1', 'abs_dev_max_temp_plus1',
]

# resort specific weather features (columns numbered 1 to 12)
final_cols_weather_resort = [
    'temp_min_celsius_1', 'temp_min_celsius_2', 'temp_min_celsius_3', 'temp_min_celsius_4',
    'temp_min_celsius_5', 'temp_min_celsius_6', 'temp_min_celsius_7', 'temp_min_celsius_8',
    'temp_min_celsius_9', 'temp_min_celsius_10', 'temp_min_celsius_11', 'temp_min_celsius_12',
    'temp_max_celsius_1', 'temp_max_celsius_2', 'temp_max_celsius_3', 'temp_max_celsius_4',
    'temp_max_celsius_5', 'temp_max_celsius_6', 'temp_max_celsius_7', 'temp_max_celsius_8',
    'temp_max_celsius_9', 'temp_max_celsius_10', 'temp_max_celsius_11', 'temp_max_celsius_12',
    'precipitation_mm_1', 'precipitation_mm_2', 'precipitation_mm_3', 'precipitation_mm_4',
    'precipitation_mm_5', 'precipitation_mm_6', 'precipitation_mm_7', 'precipitation_mm_8',
    'precipitation_mm_9', 'precipitation_mm_10', 'precipitation_mm_11', 'precipitation_mm_12',
]

final_dataset = account_data_04[final_cols_targets_ids
                                + final_cols_dummies
                                + final_cols_resorts
                                + final_cols_states
                                + final_cols_picture
                                + final_cols_resort_info
                                + final_cols_population
                                + final_cols_weather_picture
                                + final_cols_weather_resort]

In [None]:
# Write the final dataframe to a comma-separated file, without the index column.
dataset_csv_path = 'results/dataset_analysis.csv'
final_dataset.to_csv(dataset_csv_path, sep = ',', index = False)