In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# Load the Bournemouth venues CSV into the dataframe used by the rest of the notebook.
df = pd.read_csv('/kaggle/input/bournemouth-venues/bournemouth_venues.csv')
# Any results you write to the current directory are saved as output.

In [None]:
# Preview the first rows. A bare last expression is rendered with the notebook's
# rich table display, which stays readable for wide frames (print() gives plain text).
df.head()

**Creating new columns**
---
I wanted to create more generic categories for each of the venues - the current categories were very specific. The next few blocks of code split up the different values in `df["Venue Category"]` into their own dataframes. These lists of "extras" are mostly values in `df["Venue Category"]` that do not contain a word or phrase that is easily picked out using the `.str.contains()` method, with the exception of `restaurant_extras`, which needed `[...,'Indian Restaurant', 'English Restaurant',...]` in order to create a legible scatter plot - I will explain more later.

In [None]:
# Shorthand for the specific-category column, reused by the row filters below.
venues = df.loc[:, 'Venue Category']
#  Specific venue categories that make up each general group.
#  They are used to build the new "Venue General Category" column.
restaurant_extras = [
    'Sandwich Place',
    'Diner',
    'Pizza Place',
    'Noodle House',
    'Burger Joint',
    'Indian Restaurant',
    'English Restaurant',
    'Fast Food Restaurant',
    'French Restaurant',
]
cafe_extras = [
    'Coffee Shop',
    'Ice Cream Shop',
    'Café',
    'Bubble Tea Shop',
    'Dessert Shop',
]
bar_extras = [
    'Pub',
    'Nightclub',
    'Brewery',
]
indoor_recreation_extras = [
    'Multiplex',
    'Theater',
    'Arts & Entertainment',
]
outdoor_recreation_extras = [
    'Park',
    'Plaza',
    'Beach',
    'Garden',
    'Other Great Outdoors',
    'Scenic Lookout',
]
educational_extras = [
    'Art Museum',
    'Aquarium',
]
retail_extras = [
    'Clothing Store',
    'Grocery Store',
]
transit_extras = [
    'Train Station',
    'Bus Stop',
    'Platform',
]

**Creating new dataframes**
---
In order to isolate the "Venue Category" values in a way that I can easily rename, I identified 11 general categories:

`['Restaurant Group 1', 'Restaurant Group 2', 'Cafe', 'Bar', 'Indoor Recreation', 'Outdoor Recreation', 'Educational',
                  'Retail', 'Gym', 'Transit', 'Hotel']`

*I had to split the "Restaurants" category into 2 groups. We'll get there! Not ready to explain yet!*

Now I'm ready to assign the rows from the original dataframe to their new dataframes by using logical statements to select specific rows. Mostly, I'm using `.isin()` and `.str.contains()`. 

`restaurant_data_2` is a tricky one - it starts as every row whose category contains "Restaurant", and then we keep only the rows that are **not** also in `restaurant_data_1`, dropping the ones that **are** in both. The restaurants are split into two groups because a single group had too many different categories to color-code clearly.

In [None]:
#  Creating individual dataframes for each group (one scatter-plot panel per group later on).
#  na=False makes str.contains() treat a missing category as "no match", so every
#  mask is strictly boolean and can be used to select rows.
is_restaurant = venues.str.contains('Restaurant', na=False)
in_restaurant_group_1 = venues.isin(restaurant_extras)

restaurant_data_1 = df[in_restaurant_group_1]
#  Group 2 is every other "...Restaurant" row, i.e. those not already in group 1.
#  Selecting with a row mask keeps rows intact: nothing is dropped just because
#  some unrelated column has a missing value.
restaurant_data_2 = df[is_restaurant & ~in_restaurant_group_1]  # The '~' symbol flips the booleans
cafe_data = df[venues.isin(cafe_extras)]
bar_data = df[venues.str.contains('Bar', na=False) | venues.isin(bar_extras)]
indoor_recreation_data = df[venues.isin(indoor_recreation_extras)]
outdoor_recreation_data = df[venues.isin(outdoor_recreation_extras)]
educational_data = df[venues.isin(educational_extras)]
retail_data = df[venues.isin(retail_extras)]
gym_data = df[venues.str.contains('Gym', na=False)]
transit_data = df[venues.isin(transit_extras)]
hotel_data = df[venues.str.contains('Hotel', na=False)]

In [None]:
#  Pair every general-category label with its dataframe, then split the pairs
#  into the two parallel lists that the following cells loop over.
named_frames = [
    ('Restaurant Group 1', restaurant_data_1),
    ('Restaurant Group 2', restaurant_data_2),
    ('Cafe', cafe_data),
    ('Bar', bar_data),
    ('Indoor Recreation', indoor_recreation_data),
    ('Outdoor Recreation', outdoor_recreation_data),
    ('Educational', educational_data),
    ('Retail', retail_data),
    ('Gym', gym_data),
    ('Transit', transit_data),
    ('Hotel', hotel_data),
]
dataframe_list = [frame for _label, frame in named_frames]
dataframe_names = [label for label, _frame in named_frames]

#  Sanity check: both lists must have the same number of entries
if len(dataframe_list) != len(dataframe_names):
    print('ERROR: Number of dataframes is not equal to number of dataframe names')

**Finally creating the new column**
---
Now that we've done all the work above, it's time to loop through each new dataframe and add it to `df['Venue General Category']` with the general category value from `dataframe_names`.

In [None]:
#  Build the 'Venue General Category' column from the per-group dataframes.
#  Step 1: map each specific category to the name of the FIRST group that contains it
#  (setdefault never overwrites, so a category found in two groups keeps the earlier one).
category_to_general = {}
for general_name, group_df in zip(dataframe_names, dataframe_list):
    for specific_category in group_df['Venue Category'].unique():
        category_to_general.setdefault(specific_category, general_name)

#  Step 2: relabel every row in one vectorised pass. Each row is assigned exactly once,
#  so a freshly assigned group name can never be re-matched by a later group.
#  Categories that belong to no group keep their original value.
df['Venue General Category'] = (
    df['Venue Category']
    .map(category_to_general)
    .fillna(df['Venue Category'])
)
#print(df['Venue General Category'].head(10))

#  Checking to make sure I didn't miss any categories
#frames = [restaurant_data_1, restaurant_data_2, cafe_data, bar_data, indoor_recreation_data, outdoor_recreation_data,
#         educational_data, retail_data, gym_data, transit_data, hotel_data]
#all_data = pd.concat(frames)

#list1 = all_data['Venue Category'].unique()
#list2 = df['Venue Category'].unique()
#print(list(set(list2) - set(list1)))


**Our scatterplot**
---
Here's our scatterplot. Using seaborn and matplotlib, I've created a facet grid that contains a scatterplot of venue locations for each of the different general categories. From these plots, we can make a few observations:

* The area surrounding (-1.885, 50.7200) seems to be the most dense area in the city
* The city has many more outdoor recreation activities than indoor recreation activities
* Hotels are distributed mostly around the denser areas in the city.
* The city seems to lack many public transit options

In [None]:
def scatter_plot_facetgrid_by_category(data, x_col, y_col, color_by, frames=None, titles=None, ncols=2):
    """Draw a grid of scatter plots, one panel per category dataframe.

    Every panel shares the same axis limits (taken from ``data``) so that
    point positions can be compared directly between panels.

    Parameters
    ----------
    data : pandas.DataFrame
        Full dataset; used to compute the shared x/y axis limits.
    x_col, y_col : str
        Names of the columns plotted on the x and y axes.
    color_by : str
        Name of the column that sets the point colours (seaborn ``hue``).
    frames : sequence of pandas.DataFrame, optional
        Dataframes to plot, one per panel. Defaults to the notebook-level
        ``dataframe_list``.
    titles : sequence of str, optional
        Panel titles, parallel to ``frames``. Defaults to the notebook-level
        ``dataframe_names``.
    ncols : int, optional
        Number of panel columns in the grid (default 2).

    Returns
    -------
    None
        The panels are drawn on a new current figure; call ``plt.show()`` to display it.
    """
    # Fall back to the notebook-level lists so existing four-argument calls keep working.
    frames = list(dataframe_list if frames is None else frames)
    titles = list(dataframe_names if titles is None else titles)

    sns.set(style='whitegrid')

    # Ceiling division: just enough rows to hold every panel (11 frames, 2 columns -> 6 rows).
    nrows = max(1, -(-len(frames) // ncols))
    # 10 x 10 inches per panel, i.e. (20, 60) for the default 6 x 2 grid.
    # plt.figure() is used rather than plt.subplots() so that no full-size
    # background Axes is created underneath the panels.
    fig = plt.figure(figsize=(10 * ncols, 10 * nrows))

    # Shared limits come from the dataset passed in, not from a global dataframe.
    x_limits = (data[x_col].min(), data[x_col].max())
    y_limits = (data[y_col].min(), data[y_col].max())

    for position, (frame, title) in enumerate(zip(frames, titles), start=1):
        ax = fig.add_subplot(nrows, ncols, position)
        sns.scatterplot(data=frame, x=x_col, y=y_col, hue=color_by,
                        legend='full', palette='Paired', s=150, ax=ax)
        ax.set(title=title, xlim=x_limits, ylim=y_limits)

In [None]:
# One longitude/latitude panel per general category, coloured by the specific venue category.
plot_arguments = dict(
    data=df,
    x_col='Venue Longitude',
    y_col='Venue Latitude',
    color_by='Venue Category',
)
scatter_plot_facetgrid_by_category(**plot_arguments)
plt.show()