# Places Analysis
In this notebook we'll do some analysis of the Google "PlaceVisit" data using pandas. We'll plot our results using Matplotlib.

In [None]:
%matplotlib inline
import pandas as pd
from matplotlib import pyplot as plt
from datetime import date

In [None]:
"""Configuration"""
# Path of the places CSV file that the notebook loads.
PLACES_PATH = 'places.csv'
# Optional date window for the data. Assign a datetime.date to either bound,
# e.g. date(2020, 1, 1); leaving a bound as None disables that filter.
START_DATE = None
END_DATE = None
# Optional: set to True to leave out records marked 'LOW_CONFIDENCE'.
EXCLUDE_LOW_CONFIDENCE = False

In [None]:
# Load the pipe-delimited places file; the visit start/end columns are parsed
# as datetimes so they can be filtered by date and subtracted later on.
places = pd.read_csv(
    PLACES_PATH,
    sep='|',
    encoding='utf-8',
    parse_dates=['start_timestamp', 'end_timestamp'],
)

# Apply each optional filter from the configuration, skipping the unset ones.
if START_DATE:
    # Keep visits that started on or after START_DATE.
    places = places.loc[places['start_timestamp'].dt.date >= START_DATE]
if END_DATE:
    # Keep visits that ended on or before END_DATE.
    places = places.loc[places['end_timestamp'].dt.date <= END_DATE]
if EXCLUDE_LOW_CONFIDENCE:
    places = places.loc[places['confidence'].ne('LOW_CONFIDENCE')]

# Summarise what is left after filtering.
print(f'Data has {places.shape[0]} rows and {places.shape[1]} columns')
places.info()

In [None]:
# Tally the records per confidence level, most frequent first.
# Rows without a confidence value are not counted.
print('Number of records, by level of confidence')
places['confidence'].value_counts(dropna=True)

In [None]:
# Count how often each place name occurs and show the ten most frequent.
print('Top ten places, by number of records')
places['name'].value_counts().head(10)

In [None]:
# Total time spent at each named place, longest first.
# Caveat: the ranking can look odd. Grouping by 'name' leaves out records that
# have no name, and personal addresses (where you live) likely have none.
# Group by 'address' instead to see those.
print('Top ten places, by duration:')
# Length of every visit, stored as a new column on the frame.
places['duration'] = places['end_timestamp'].sub(places['start_timestamp'])
time_spent = (
    places
    .groupby('name')['duration']
    .sum()
    .sort_values(ascending=False)
)
time_spent.head(10)

In [None]:
# Determine the country of each record based on the address.
# The country is taken to be the last comma-separated part of the address, so
# the address is split once at its final comma and the text after it is kept.
# Splitting on whitespace instead would cut multi-word countries down to their
# last word ("United States" -> "States").
# NOTE(review): assumes the address ends with ", <country>" - confirm against
# your own places file.
# Records without an address are skipped and end up with a missing country.
places['country'] = (
    places['address']
    .dropna()
    .str.rsplit(',', n=1)   # split once, at the last comma only
    .str[-1]                # keep the part after that comma
    .str.strip()            # remove the whitespace around it
)
places['country'].value_counts()

In [None]:
# Horizontal bar chart of the number of records per country, with the
# counts sorted in ascending order before plotting.
ax = (
    places['country']
    .value_counts()
    .sort_values()
    .plot(kind='barh')
)
# The title says it all, so both axis labels are left blank.
ax.set_xlabel('')
ax.set_title('Number of PlaceVisits by country')
ax.set_ylabel('')