In [None]:
# Utah data, to be added if time allows.
# The cells below would go into the 01_data_compiling_cleaning page.

<a id='cleanut'></a>
#### Utah User Data

In [601]:
# load the Utah trail/user table and report its (rows, columns) shape
users_1 = pd.read_csv('./data/user_data/utah_trails_users.csv')
users_1.shape

(929, 2)

In [602]:
# preview the raw 'trail_name' / 'user_name' columns before cleaning
users_1.head()

Unnamed: 0,trail_name,user_name
0,Thunder Mountain Trail #33098,Stars Average: 4.7 ...
1,Wasatch Crest,Stars Average: 4.8 ...
2,Captain Ahab,Stars Average: 4.7 ...
3,Wire Mesa Loop,Stars Average: 4.6 ...
4,Ramblin',Stars Average: 4.7 ...


#### Cleaning Utah 'user_name' column

In [603]:
# Expand the single 'user_name' string stored per trail into one row per
# (user_name, trail_name) check-in, then combine all trails into one table.
#
# The per-trail frames are concatenated straight from memory instead of being
# globbed back from ./data/utah_users/: the old glob pattern had no '{}'
# placeholder, so the extension filter was never applied and any leftover file
# in that folder (e.g. from an earlier run) was silently merged in, in an
# OS-dependent order.
checkin_frames = []
for each, (trail, raw_users) in enumerate(zip(users_1['trail_name'], users_1['user_name'])):
    # four spaces followed by any further whitespace separate the entries;
    # collapse each such run into a single comma
    s = re.sub(r'    \s+', ',', raw_users)

    # remove the top two entries ('Stars' and 'Average')
    user_list = s.split(",")[2:]

    # one (user, trail) tuple per user checked into this specific trail
    info = [(item, trail) for item in user_list]

    # building into a DataFrame of each user and trail check in
    # trail_name is repeated when more than one check in is present
    trail_checkins = pd.DataFrame(info, columns=['user_name', 'trail_name'])

    # still saving each trail's check ins to its own csv, as before
    trail_checkins.to_csv(f'./data/utah_users/trail_{each}.csv', index=False)
    checkin_frames.append(trail_checkins)

# combine all trails in trail order; ignore_index gives the combined frame a
# fresh 0..n-1 index (replacing the post-export reset_index, which only added
# a stray 'index' column to the in-memory frame)
all_user = pd.concat(checkin_frames, ignore_index=True)

# export to new, concatenated csv
# relative file path to where the new csv is saved
all_user.to_csv("./data/all_utah_users.csv", index=False)

In [604]:
# read the combined check-in csv back in and preview it (one row per user/trail pair)
ut_users = pd.read_csv( "./data/all_utah_users.csv")
ut_users.head()

Unnamed: 0,user_name,trail_name
0,MadHamish H,Thunder Mountain Trail #33098
1,Matt Lane,Thunder Mountain Trail #33098
2,Phil Broadbent,Thunder Mountain Trail #33098
3,Jacob Crockett,Thunder Mountain Trail #33098
4,Heather Bond,Thunder Mountain Trail #33098


In [605]:
# verifying no nulls: count of missing values per column
ut_users.isnull().sum()

user_name     0
trail_name    0
dtype: int64

#### Reading in Utah Trail Data

In [608]:
# load the raw Utah trail table and report its (rows, columns) shape
ut = pd.read_csv('./data/raw_trail_data/utah_trails.csv')
ut.shape

(933, 19)

In [609]:
# preview the raw trail columns; the numeric fields are still strings with unit characters here
ut.head()

Unnamed: 0,trail_name,length,difficulty,trail_link,city,popularity,rating,local_club,local_club_site,land_manager,land_manager_site,tot_climb,tot_descent,ave_grade,max_grade,max_elevation,min_elevation,dog_policy,e_bike_policy
0,Thunder Mountain Trail #33098,8.1\n mi,INTERMEDIATE/DIFFICULT,https://www.mtbproject.com/trail/1868735/thund...,"Panguitch, UT",#1,4.7 (87),Trail Alliance of Southern Utah (TASU),https://www.mtbproject.com/club/31800/trail-al...,USFS - Dixie National Forest Office,http://www.fs.usda.gov/dixie,689',"1,681'",6%,27%,"8,263'","7,084'",Off-leash,Unknown
1,Wasatch Crest,12.5\n mi,INTERMEDIATE/DIFFICULT,https://www.mtbproject.com/trail/3654027/wasat...,"Alta, UT",#2,4.8 (110),,,,,"1,084'","2,656'",6%,26%,"9,882'","7,616'",No Dogs,Not Allowed
2,Captain Ahab,4.2\n mi,DIFFICULT,https://www.mtbproject.com/trail/648766/captai...,"Moab, UT",#3,4.7 (128),,,,,326',981',6%,23%,"4,898'","4,034'",Leashed,Not Allowed
3,Wire Mesa Loop,7.4\n mi,INTERMEDIATE/DIFFICULT,https://www.mtbproject.com/trail/7021693/wire-...,"Springdale, UT",#4,4.6 (122),,,,,428',415',2%,12%,"4,502'","4,326'",Unknown,Allowed
4,Ramblin',3.3\n mi,INTERMEDIATE/DIFFICULT,https://www.mtbproject.com/trail/7000569/ramblin,"Moab, UT",#5,4.7 (92),,,,,195',398',3%,12%,"5,279'","4,971'",Leashed,Not Allowed


<a id='trailut'></a>
#### Utah Data Cleaning

**Extra Characters, DTypes, Scaling**

In [619]:
# Strip the extra characters from the raw Utah trail columns and convert them
# to numeric dtypes.
# NOTE: this cell expects the raw string columns; re-running it after 'length'
# has already been converted to float fails, because the `.str` accessor is
# only available on string columns.

# removing extra characters from 'length' and converting to float
# (regex=False: the pattern is a literal, independent of the pandas default)
ut['length'] = ut['length'].str.replace('\n    mi', '', regex=False).astype(float)

# removing extra characters from 'city'
ut['city'] = ut['city'].str.replace(', UT', '', regex=False)

# keeping the first three characters of 'rating', replacing 'nan' with 0,
# and converting to float
ut['rating'] = ut['rating'].astype(str).str[:3].str.replace('nan', '0', regex=False).astype(float)

# removing '#' from 'popularity' and scaling the ranking to 0-1, inverted so
# that rank #1 maps to 1.0
popularity_rank = ut['popularity'].str.replace('#', '', regex=False).astype(float)
# the scaler returns an (n, 1) array; flatten it so a 1-D column is assigned
ut['popularity'] = 1 - MinMaxScaler().fit_transform(popularity_rank.values.reshape(-1, 1)).ravel()

# removing extra characters and converting to float for climb, descent, and elevation columns
cols = ['tot_climb', 'tot_descent', 'max_elevation', 'min_elevation']
# dropping the thousands separator and the trailing foot mark from each column
ut[cols] = ut[cols].replace({",": "", "'": ""}, regex=True).astype(float)

# dropping '%' from ave and max_grade columns
cols1 = ['ave_grade', 'max_grade']
# passing replacement and float type to each column
ut[cols1] = ut[cols1].replace({"%": ""}, regex=True).astype(float)

In [620]:
# snapshot the cleaned (not yet scaled, imputed, or one-hot encoded) Utah trail data
# and save it for the filtering dashboard
# NOTE: `ut_trails` is a copy of `ut`; it is the frame the following cells go on to modify
ut_trails =ut.copy()
ut_trails.to_csv( "./data/trail_dashboard/ut_trails.csv", index=False)

<a id='imputeut'></a>
#### Further Processing for Recommender System
**Imputing Nulls**

In [621]:
# count of missing values per column, to decide what needs imputing
ut_trails.isnull().sum()

trail_name             0
length                 0
difficulty             0
trail_link             0
city                   0
popularity             0
rating                 0
local_club           421
local_club_site      421
land_manager         734
land_manager_site    734
tot_climb              0
tot_descent            0
ave_grade              0
max_grade              0
max_elevation          0
min_elevation          0
dog_policy            11
e_bike_policy          0
dtype: int64

**Imputing club, land manager, dog and e-bike missing information with 'Unknown'**

In [622]:
# categorical columns whose missing entries are replaced with the label 'Unknown'
features = ['local_club', 'local_club_site', 'land_manager', 'land_manager_site', 'dog_policy', 'e_bike_policy']

# assign the filled columns back to the frame: calling fillna(inplace=True) on
# `ut_trails[col]` operates on an intermediate Series, which pandas warns about
# and which does not update `ut_trails` under Copy-on-Write
ut_trails[features] = ut_trails[features].fillna('Unknown')

**Utilizing KNN Imputation for various missing trail statistics**

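Missing trail statistics are assumed to be 'Missing at Random': the missing points are not systematically different from the others, and it is more likely that the user-generated data simply hasn't been collected yet, so KNN imputation is a reasonable way to fill them in. Note that for the Utah data the null summary above shows no missing values in these columns, so the imputer leaves them unchanged here.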

In [623]:
# scaling data to bring imputations onto a level playing field
# (the scaler is fit on the cleaned `ut` frame, so re-running this cell
# reproduces the same scaled values in `ut_trails`)
features = ['tot_climb', 'tot_descent', 'ave_grade', 'max_grade', 'max_elevation', 'min_elevation', 'length', 'rating']
ut_trails[features] = MinMaxScaler().fit_transform(ut[features])

imp_features = ['tot_climb', 'tot_descent', 'ave_grade', 'max_grade', 'max_elevation', 'min_elevation']
# utilizing KNNImputer with 5 neighbors to fill in missing data for 'imp_features'
imputer = KNNImputer(n_neighbors=5)
# assign the imputed array positionally; wrapping it in a new DataFrame would
# give it a fresh 0..n-1 index and make the assignment align on index labels,
# producing NaN wherever `ut_trails` has a different index
ut_trails[imp_features] = imputer.fit_transform(ut_trails[imp_features])

In [624]:
# columns the recommender does not use: club / land-manager details, link, and city
recommender_excluded = [
    'local_club',
    'local_club_site',
    'land_manager',
    'land_manager_site',
    'trail_link',
    'city',
]
# removed in place from the working frame
ut_trails.drop(columns=recommender_excluded, inplace=True)

In [625]:
# spot-check the first two rows after scaling and dropping the unused columns
ut_trails.head(2)

Unnamed: 0,trail_name,length,difficulty,popularity,rating,tot_climb,tot_descent,ave_grade,max_grade,max_elevation,min_elevation,dog_policy,e_bike_policy
0,Thunder Mountain Trail #33098,0.065165,INTERMEDIATE/DIFFICULT,1.0,0.94,0.052217,0.14821,0.3,0.409091,0.632152,0.539085,Off-leash,Unknown
1,Wasatch Crest,0.100563,INTERMEDIATE/DIFFICULT,0.998922,0.96,0.082152,0.234174,0.3,0.393939,0.817796,0.602479,No Dogs,Not Allowed


**One Hot Encoding (pd.get_dummies) 'difficulty', 'dog_policy', 'e_bike_policy'**

In [626]:
# one-hot encode the categorical columns, replacing each with its dummy columns
categorical_cols = ['difficulty', 'dog_policy', 'e_bike_policy']
dummies = pd.get_dummies(ut_trails, columns=categorical_cols)

# continue with the encoded frame and lower-case every column label
ut_trails = dummies
ut_trails.columns = [label.lower() for label in ut_trails.columns]

In [629]:
# spot-check the first two rows after one-hot encoding (column labels are now lower-case)
ut_trails.head(2)

Unnamed: 0,trail_name,length,popularity,rating,tot_climb,tot_descent,ave_grade,max_grade,max_elevation,min_elevation,...,difficulty_intermediate,difficulty_intermediate/difficult,difficulty_very difficult,dog_policy_leashed,dog_policy_no dogs,dog_policy_off-leash,dog_policy_unknown,e_bike_policy_allowed,e_bike_policy_not allowed,e_bike_policy_unknown
0,Thunder Mountain Trail #33098,0.065165,1.0,0.94,0.052217,0.14821,0.3,0.409091,0.632152,0.539085,...,0,1,0,0,0,1,0,0,0,1
1,Wasatch Crest,0.100563,0.998922,0.96,0.082152,0.234174,0.3,0.393939,0.817796,0.602479,...,0,1,0,0,1,0,0,0,1,0


<a id='saveut'></a>
### Saving formatted Utah Trails Dataframe for Modeling

In [627]:
# write the scaled, one-hot-encoded trail table to csv, without the index column
ut_trails.to_csv( "./data/recommender_data/ut_trail_data.csv", index=False)