In [1]:
# This notebook merges the curated dataset (only technical rock climbs) with the set of MP user ratings

import pandas as pd
import numpy as np
import glob

In [2]:
# Curated route table, still without any rating data.
# route_ID is cast to int; it is later compared against the route ids of the ratings table.
nr_df = pd.read_pickle('Curated_OpenBetaAug2020_RytherAnderson.pkl.zip')
nr_df = nr_df.astype({'route_ID': int})

In [3]:
# Stack every zipped ratings CSV into one frame (AR), save it, and collect the rated route ids.

# Sorting makes the row order reproducible; the order glob returns is filesystem-dependent.
rating_files = sorted(glob.glob('../ratings/*.zip'))
if not rating_files:
    # Fail with a message that names the pattern instead of a bare IndexError/ValueError.
    raise FileNotFoundError("no rating archives found matching '../ratings/*.zip'")

# Read everything first and concatenate once: concatenating inside the loop
# re-copies the accumulated frame on every iteration.
AR = pd.concat([pd.read_csv(rf, compression='zip') for rf in rating_files])

# The combined table is still written out; reading it straight back is skipped
# because AR already holds exactly that data.
AR.to_pickle('All_Ratings.pkl.zip', compression='zip')

unique_ids = set(AR.route_id)
print(len(unique_ids), 'routes with star ratings')

99682 routes with star ratings


In [4]:
# Attach to every rated route the list of its star ratings and the list of users who gave them.
# A single groupby pass collects the per-route lists (row order inside each route is kept as in AR);
# this replaces filtering the whole of AR once per route and per column, which made the cell very slow.

ratings_df = nr_df[nr_df['route_ID'].isin(unique_ids)].copy()

per_route = AR.groupby('route_id')
ratings_df['ratings'] = ratings_df['route_ID'].map(per_route['ratings'].agg(list))
ratings_df['users'] = ratings_df['route_ID'].map(per_route['users'].agg(list))

ratings_df.head()

In [28]:
# Several routes (every rating in the Flatirons, for instance) carry the same set of ratings more than once.
# Those lists repeat their users as well, so repeated users are what give the replication away.
# The clean-up below works on a separate copy, leaving ratings_df itself untouched.

fixed_ratings_df = ratings_df.copy(deep=True)

def check_replication(users, ratings):
    """Pair each distinct user with a single rating, dropping replicated entries.

    Parameters
    ----------
    users : iterable
        User identifiers (must be hashable), possibly containing repeats.
    ratings : iterable
        Ratings aligned position-by-position with ``users``.

    Returns
    -------
    list of tuple
        One ``(user, rating)`` pair per distinct user, in the order users
        first appear. When a user occurs more than once, the rating from
        their last occurrence is kept.
    """
    # A dict keeps first-seen key order and the last value assigned to a key,
    # so it de-duplicates deterministically; iterating a set of users instead
    # would yield an arbitrary order that can differ between runs.
    return list(dict(zip(users, ratings)).items())

# Collapse each route's users/ratings lists into one list of (user, rating) pairs.
# Iterating the two columns directly avoids a row-wise DataFrame.apply, which builds a
# Series per row and does not return a per-row Series when the frame is empty.
fixed_ratings_df['corrected_users_ratings'] = [
    check_replication(users, ratings)
    for users, ratings in zip(fixed_ratings_df['users'], fixed_ratings_df['ratings'])
]

# Keep the route columns plus the corrected pairs; the raw 'ratings' and 'users'
# columns are left out of the saved frame.
output_columns = ['route_name', 'parent_sector', 'route_ID', 'sector_ID',
                  'type_string', 'fa', 'YDS', 'Vermin', 'nopm_YDS', 'nopm_Vermin', 'YDS_rank', 'Vermin_rank',
                  'safety', 'parent_loc', 'description', 'location', 'protection', 'corrected_users_ratings']
fixed_ratings_df = fixed_ratings_df[output_columns].copy()

fixed_ratings_df.to_pickle('CuratedWithRatings_OpenBetaAug2020_RytherAnderson.pkl.zip', compression='zip')

In [29]:
# Show the column labels of fixed_ratings_df.
# NOTE(review): the recorded output below still lists 'ratings' and 'users', which the column
# selection in the previous cell leaves out — the output looks stale; re-run to confirm.
fixed_ratings_df.columns

Index(['route_name', 'parent_sector', 'route_ID', 'sector_ID', 'type_string',
       'fa', 'YDS', 'Vermin', 'nopm_YDS', 'nopm_Vermin', 'YDS_rank',
       'Vermin_rank', 'safety', 'parent_loc', 'description', 'location',
       'protection', 'ratings', 'users', 'corrected_users_ratings'],
      dtype='object')