In [None]:
# import libraries
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns 
import html
from bs4 import BeautifulSoup
import requests
import os
from selenium import webdriver
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import warnings; warnings.simplefilter('ignore')
from VE_scraper_functions import *
from chromedriver_py import binary_path # this will get you the path variable
from selenium.webdriver.common.by import By
import time
import nltk
nltk.download('punkt')

In [None]:
# Load the scraped park page content from the Excel workbook.
path = "park_scrape_content_dataset.xlsx"
park_scrape_dataset2 = pd.read_excel(path)
# head() must be *called*: a bare `.head` makes the notebook echo the
# bound-method repr (which embeds the whole frame) instead of a short preview.
park_scrape_dataset2.head()

In [None]:
# Lower-case the scraped text so keyword matching is case-insensitive.
park_scrape_dataset2['content'] = park_scrape_dataset2['content'].str.lower()
# Remove the literal phrase 'harpers ferry' from the text.
# NOTE(review): presumably this keeps the place name from being counted as a
# 'ferry' mention -- confirm that is the intent.
# regex=False makes the literal match explicit and independent of the pandas
# version's default for the `regex` argument.
park_scrape_dataset2['content'] = park_scrape_dataset2['content'].str.replace(
    'harpers ferry', '', regex=False
)
# head() must be called to get a preview; a bare `.head` only echoes the method repr.
park_scrape_dataset2.head()

In [None]:
# Public-transportation keywords searched for in the page text.
# These strings are also used as DataFrame column names further down, so the
# exact spelling (including the spaces around " bus ") matters.
# NOTE(review): " bus " is padded with spaces, presumably so it does not match
# inside longer words such as "business" -- confirm.
# NOTE(review): "ferry" is a substring of "ferry service", so a substring count
# of "ferry" also includes every "ferry service" occurrence.
pt_words = [
    " bus ",
    "shuttle",
    "transit",
    "public transportation",
    "ferry",
    "ferry service",
    "subway",
    "train",
    "metro",
]

In [None]:
def count_pt_words(text, words=None):
    """Count substring occurrences of each transportation keyword in ``text``.

    Parameters
    ----------
    text : str
        Text to search. It is lower-cased once before counting; the keywords
        themselves are used exactly as given. Non-string values (for example
        ``None`` or the NaN pandas uses for a missing cell) are treated as
        empty text instead of raising ``AttributeError``.
    words : iterable of str, optional
        Keywords to count. Defaults to the notebook-level ``pt_words`` list.

    Returns
    -------
    dict
        Maps each keyword to its number of non-overlapping occurrences
        (``str.count`` semantics). Because this is substring counting, a
        keyword contained in another keyword (e.g. "ferry" in "ferry service")
        is counted for both.
    """
    if words is None:
        words = pt_words
    if not isinstance(text, str):
        # Missing content: report zero for every keyword.
        return {word: 0 for word in words}
    lowered = text.lower()  # hoisted: lower-case once, not once per keyword
    return {word: lowered.count(word) for word in words}
# Count the keywords in every row's text; each element is a {keyword: count} dict.
per_row_counts = park_scrape_dataset2['content'].apply(count_pt_words)
# Build the count table in one shot from the list of dicts. This replaces
# `.apply(pd.Series)`, which constructs a separate Series per row and is very
# slow on large inputs. The original row index is kept so the table can be
# aligned with park_scrape_dataset2.
word_counts_df = pd.DataFrame(per_row_counts.tolist(), index=per_row_counts.index)
# Row-wise sum across all keyword columns.
# NOTE(review): the keyword list contains both "ferry" and "ferry service", so
# each "ferry service" occurrence contributes twice to this total.
word_counts_df['total'] = word_counts_df.sum(axis=1)
word_counts_df.head()

In [None]:
# Attach the keyword-count columns (aligned on the row index) and drop the raw
# text column, which is no longer needed once the counts exist.
# Note: this rebinds park_scrape_dataset2, so re-running the cell without
# reloading the data raises KeyError because 'content' has already been dropped.
park_scrape_dataset2 = (
    pd.concat([park_scrape_dataset2, word_counts_df], axis=1)
    .drop(columns="content")
)
# head() must be called to get a preview; a bare `.head` only echoes the method repr.
park_scrape_dataset2.head()

In [None]:
# Write the per-row keyword counts to disk (index included, per the pandas default).
park_scrape_dataset2.to_csv(path_or_buf="pttest.csv")

In [None]:
# Load the park -> cluster lookup table.
path = "clusterlookup.csv"
clusters = pd.read_csv(path)
# head() must be called to get a preview; a bare `.head` only echoes the method repr.
clusters.head()

In [None]:
# Sum the keyword counts over all rows belonging to the same park
# (the result is indexed by 'park', one column per keyword).
park_word_counts = park_scrape_dataset2.groupby('park')[pt_words].sum()
# Per-park total across all keyword columns.
# NOTE(review): "ferry" and "ferry service" are both in pt_words, so each
# "ferry service" occurrence is counted twice in this total.
park_word_counts['total'] = park_word_counts.sum(axis=1)
# head() must be called to get a preview; a bare `.head` only echoes the method repr.
park_word_counts.head()

In [None]:
# Keep only the lookup columns needed for the join and rename the park key so
# it matches the 'park' key of park_word_counts. rename() returns a new frame;
# the original used inplace=True on a slice of `clusters`, which triggers
# SettingWithCopyWarning (hidden here by the global warnings filter).
clustersubset = clusters[['Park Alpha', "FinalCluster", "Label"]].rename(
    columns={'Park Alpha': "park"}
)
# Left join: every park with keyword counts is kept; parks without a lookup
# entry get NaN for FinalCluster / Label.
clusterparks = pd.merge(park_word_counts, clustersubset, on="park", how="left")
# "ferry" matches inside "ferry service" too, so subtract the latter to get the
# count of "ferry" mentions that are not part of "ferry service".
clusterparks['ferrynet'] = clusterparks['ferry'] - clusterparks['ferry service']
# head() must be called to get a preview; a bare `.head` only echoes the method repr.
clusterparks.head()

In [None]:
# Write the per-park keyword counts with their cluster labels to disk.
clusterparks.to_csv(path_or_buf="ptbypark.csv")

In [None]:
# Number of parks in each cluster label (groupby skips rows whose Label is NaN).
total_parks_counts = clusterparks.groupby('Label').size().reset_index(name='Total_Parks')
total_parks_counts['Total_Parks'] = total_parks_counts['Total_Parks'].astype(int)

# Parks counted as having public-transportation information.
# NOTE(review): the threshold is `total > 1`, i.e. at least two keyword hits;
# because "ferry service" also matches "ferry", a single "ferry service"
# mention already yields total == 2 -- confirm this threshold is intended.
filtered_df = clusterparks[clusterparks['total'] > 1]
# NOTE(review): the name `ev_parks_counts` is kept unchanged; it holds the
# public-transportation park counts per label.
ev_parks_counts = filtered_df.groupby('Label').size().reset_index(name='pt_Parks')

# Left join so labels with no qualifying park are kept (pt_Parks is NaN there).
label_counts = pd.merge(total_parks_counts, ev_parks_counts, on='Label', how='left')
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form does not update the frame under
# pandas Copy-on-Write, and the integer cast below would then fail on NaN.
label_counts['pt_Parks'] = label_counts['pt_Parks'].fillna(0).astype(int)
label_counts['Percent_pt'] = (
    label_counts['pt_Parks'] / label_counts['Total_Parks'] * 100
).round(1)
label_counts.head()

In [None]:
# Column sums of the numeric columns, turned into a summary row labelled 'Total'.
total_row = label_counts.sum(numeric_only=True)
total_row['Label'] = 'Total'
# DataFrame.append was removed in pandas 2.0; append the summary row with
# pd.concat instead. The Series is turned into a one-row frame, and
# infer_objects() restores numeric dtypes that the transpose left as object.
total_counts = pd.concat(
    [label_counts, total_row.to_frame().T.infer_objects()],
    ignore_index=True,
)
total_counts = total_counts.astype({'pt_Parks': int, 'Total_Parks': int})
# The summed per-label percentages are meaningless for the summary row, so
# recompute it from the summed park counts.
is_total = total_counts['Label'] == 'Total'
total_counts.loc[is_total, 'Percent_pt'] = (
    total_counts.loc[is_total, 'pt_Parks']
    / total_counts.loc[is_total, 'Total_Parks']
    * 100
).round(1)
total_counts.tail()

In [None]:
# Write the per-cluster summary (including the 'Total' row) to disk.
total_counts.to_csv(path_or_buf="ptbycluster.csv")

In [None]:
# Bar chart: share of parks per cluster that meet the keyword threshold.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(label_counts['Label'], label_counts['Percent_pt'], color='#C56C39')
ax.set_xlabel('Cluster')
ax.set_ylabel('Percentage of Public Transportation Parks')
ax.set_title('Percentage of Parks with Public Transportation Information in Each Cluster')
# Tilt the cluster labels so long names do not overlap.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Same bar chart as a saved figure, with each bar annotated "<pt parks>/<total parks>".
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(label_counts['Label'], label_counts['Percent_pt'], color='#C56C39')

# Bars are drawn in row order, so they can be paired with the count columns directly.
for bar, pt_parks, total_parks in zip(
    bars, label_counts['pt_Parks'], label_counts['Total_Parks']
):
    label_x = bar.get_x() + bar.get_width() / 2  # horizontal centre of the bar
    label_y = bar.get_height() + 0.25            # slightly above the bar top
    ax.text(label_x, label_y, f"{pt_parks}/{total_parks}", ha='center', va='bottom')

ax.set_xlabel('Cluster')
ax.set_ylabel('Percentage of Public Transportation Parks')
ax.set_title('Percentage of Parks with Public Transportation Information in Each Cluster')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
# Save before show() so the written file contains the finished figure.
fig.savefig("ptGraph.png")
plt.show()

In [None]:
# Mean per-park count of each keyword; also used by the bar chart cell below.
averages = dict((word, park_word_counts[word].mean()) for word in pt_words)

# Distribution of per-park counts, one box per keyword.
fig, ax = plt.subplots(figsize=(10, 6))
park_word_counts.boxplot(column=pt_words, ax=ax)
ax.set_title('Box and Whisker Plot of Transportation Words')
ax.set_ylabel('Count')
ax.set_xlabel('Words')
plt.show()

In [None]:
# Bar chart of the mean per-park count for each keyword.
fig, ax = plt.subplots(figsize=(10, 6))
keyword_labels = list(averages.keys())
keyword_means = list(averages.values())
ax.bar(keyword_labels, keyword_means, color='skyblue')
ax.set_title('Average Counts of Transportation Words')
ax.set_xlabel('Words')
ax.set_ylabel('Average Count')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()