# Places Analysis
In this notebook we'll do some analysis of the Google "PlaceVisit" data using pandas. We'll plot our results using Matplotlib.

In [12]:
%matplotlib inline
import pandas as pd
from datetime import date

In [45]:
"""Configuration"""
# Path to your places CSV file. The loading cell reads it as a pipe-delimited ('|'), UTF-8 file.
PLACES_PATH = 'places.csv'
# Optional: restrict the analysis to a date window, e.g. date(2020, 1, 1).
# Leave a bound as None to disable it. START_DATE is compared with each visit's
# start date and END_DATE with each visit's end date; both bounds are inclusive.
START_DATE = None
END_DATE = None
# Optional: set to True to drop rows whose 'confidence' value is 'LOW_CONFIDENCE'.
EXCLUDE_LOW_CONFIDENCE = False

In [46]:
# Load the pipe-delimited places export; both timestamp columns are parsed as datetimes.
places = pd.read_csv(PLACES_PATH, sep='|', encoding='utf-8', parse_dates=['start_timestamp', 'end_timestamp'])

# Filter out rows that don't match the configured settings.
# Dates are taken from the timestamps as parsed (no timezone conversion is applied here),
# and both bounds are inclusive.
if START_DATE:
    places = places[places['start_timestamp'].dt.date >= START_DATE]
if END_DATE:
    places = places[places['end_timestamp'].dt.date <= END_DATE]
if EXCLUDE_LOW_CONFIDENCE:
    places = places[places['confidence'] != 'LOW_CONFIDENCE']

# Each filter above yields a selection of the frame read from disk. Take an
# explicit copy so that later cells can add columns to `places` without pandas
# emitting SettingWithCopyWarning. The original row labels are kept on purpose.
places = places.copy()

print(f'Data has {places.shape[0]} rows and {places.shape[1]} columns')
places.info()

Data has 2402 rows and 8 columns
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2402 entries, 0 to 2585
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   lat_e7           2402 non-null   int64              
 1   lon_e7           2402 non-null   int64              
 2   address          2399 non-null   object             
 3   name             1293 non-null   object             
 4   place_id         2402 non-null   object             
 5   start_timestamp  2402 non-null   datetime64[ns, UTC]
 6   end_timestamp    2402 non-null   datetime64[ns, UTC]
 7   confidence       2402 non-null   object             
dtypes: datetime64[ns, UTC](2), int64(2), object(4)
memory usage: 168.9+ KB


Unnamed: 0,lat_e7,lon_e7,address,name,place_id,start_timestamp,end_timestamp,confidence
0,423563340,-710623650,"Tremont St & Park Street &, Winter St, Boston,...",Park Street,ChIJOVayyJxw44kR5912flrB2lo,2020-02-29 16:06:02+00:00,2020-02-29 16:07:11+00:00,MEDIUM_CONFIDENCE
1,423554331,-710640442,"115 Boylston St, Boston, MA 02116, USA",Boston Common,ChIJKyNuvJ1w44kRBE-pe_yDhj8,2020-02-29 16:07:11+00:00,2020-02-29 19:32:15.976000+00:00,MEDIUM_CONFIDENCE
2,423514766,-710606782,"52 Beach St, Boston, MA 02111, USA",Gourmet Dumpling House,ChIJKzmwPXh644kRI5Te9D3YWBE,2020-02-29 19:32:15.976000+00:00,2020-02-29 21:29:18.718000+00:00,MEDIUM_CONFIDENCE
3,424309368,-713112209,"51 Sandy Pond Rd, Lincoln, MA 01773, USA",deCordova Sculpture Park and Museum,ChIJaUcc1Qqb44kRCHDCI4YhVhQ,2020-06-06 18:12:18.624000+00:00,2020-06-06 18:50:50.723000+00:00,HIGH_CONFIDENCE
4,424609243,-711637293,"368 Cambridge Rd, Woburn, MA 01801, USA",Horn Pond Plaza,ChIJf_EUYMF144kRSohLVwws91Y,2020-06-20 18:48:09.499000+00:00,2020-06-20 20:04:03.594000+00:00,HIGH_CONFIDENCE


In [47]:
# Tally how many visit records fall into each confidence level (most frequent first).
print('Number of records, by level of confidence')
confidence_counts = places['confidence'].value_counts()
confidence_counts

Number of records, by level of confidence


HIGH_CONFIDENCE      1565
MEDIUM_CONFIDENCE     837
Name: confidence, dtype: int64

In [41]:
# Rank named places by how many visit records they have and show the first ten.
# Rows without a 'name' are not counted by value_counts().
print('Top ten places, by number of records')
visits_per_place = places['name'].value_counts()
visits_per_place.head(10)

Top ten places, by number of records


Tennis club Augsburg e.V.      135
rutaNatur                       56
GALERIA Augsburg                54
GALERIA (Karstadt) Augsburg     49
Munich Central Station          46
Augsburg                        41
Augsburg Bohus Center           28
QPLIX GmbH                      26
REWE                            26
Königsplatz                     25
Name: name, dtype: int64

In [42]:
# Calculate time spent per place.
# Note: this may provide seemingly strange results.
# Personal addresses (where you live) likely don't have a "name", and
# groupby drops rows whose key is missing, so they won't show up in the
# results. To see them, group by 'address'.
print('Top ten places, by duration:')
# assign() returns a new frame with the extra column. Writing
# places['duration'] = ... directly would trigger SettingWithCopyWarning
# whenever `places` is the result of the row filters applied at load time.
places = places.assign(duration=places['end_timestamp'] - places['start_timestamp'])
# Total visit time per named place, longest first (no in-place mutation).
time_spent = places.groupby('name')['duration'].sum().sort_values(ascending=False)
time_spent.head(10)

Top ten places, by duration:


name
Tennis club Augsburg e.V.             8 days 23:42:13.158000
QPLIX GmbH                            4 days 16:18:07.514000
Hôtel La Bastide Saint Martin         2 days 01:17:25.617000
Hotel Royal Plaza                     2 days 00:59:05.271000
Hôtel Restaurant Campanile Manosque   1 days 22:07:09.385000
Augsburg Bohus Center                 1 days 09:23:16.059000
Outdoorhotel Jäger von Fall           1 days 08:11:10.960000
Residenza Elisabetta                  1 days 07:53:32.441000
Résidence Bellevue                    1 days 02:23:10.201000
Hôtel & Spa Les Mouettes              0 days 22:10:40.309000
Name: duration, dtype: timedelta64[ns]

In [None]:
# TODO: countries