In [2]:
# Importing my dependencies
# Allows me to work with csv files - my source data is in csv
import csv
# My source data will be saved as a pandas dataframe for the purpose of transformation
import pandas as pd
# this will allow me to convert longitude at latitude to zipcodes, for each restaurant
import geopy

In [4]:
from arcgis.geocoding import reverse_geocode
from arcgis.geometry import Geometry
from arcgis.gis import GIS

In [6]:
# reading my csv into a pandas dataframe
restaurant_df = pd.read_csv('../Resources/raw.csv')
restaurant_df = pd.DataFrame(restaurant_df)

In [7]:
# Initial review of csv to dataframe output
restaurant_df.head()

Unnamed: 0,restaurant,address,city,state,country,latitude,longitude,tel,website,price,rating
0,Chipotle Mexican Grill,126 New,San Francisco,'region': 'CA',US,'latitude': 37.787086,'longitude': -122.400212,'tel': '(415) 512-8113',website':,'price': '1','rating': 3.0
1,In-N-Out Burger,333 Jefferson St,San Francisco,'region': 'CA',US,'latitude': 37.8078,'longitude': -122.418383,'tel': '(800) 786-1000',website','price': '3','rating': 4.0
2,Pearls Deluxe Burgers,708 Post St,San Francisco,'region': 'CA',US,'latitude': 37.787739,'longitude': -122.413641,'tel': '(415) 409-6120','email':,'price': '3','rating': 4.0
3,Cafe Du Soleil,200 Fillmore St,San Francisco,'region': 'CA',US,'latitude': 37.771303,'longitude': -122.430158,'tel': '(415) 934-8637','email':,'price': '3','rating': 4.5
4,Cafe Zazo,64 14th St,San Francisco,'region': 'CA',US,'latitude': 37.768642,'longitude': -122.414821,'tel': '(415) 626-5555','email':,'price': '1','rating': 4.0


In [8]:
# Checking the shape of the dataframe
shape = restaurant_df.shape
print (f'The dataframe shape is: {shape}')

The dataframe shape is: (130, 11)


In [9]:
# ISSUE 1: Fixing the rows, which have there respective column headers contaminating the expected values.

# I firstly created a list of those column headers that were impacted, to reference later.
cols = ['state', 'latitude', 'longitude', 'tel', 'price', 'rating']

# A for loop to iterate through my cols list, containing the impacted column headers.
# For every impacted column header, I split on ': ', with expand=True.
# I then took element [1] and replaced the extra ' from around the strings. I then used the replace function to replace to replace the string value.

for col in cols:
    restaurant_df[col] = restaurant_df[col].str.split(': ', expand=True)[1].str.replace("'", '')


In [247]:
restaurant_df.head()

Unnamed: 0,restaurant,address,city,state,country,latitude,longitude,tel,website,price,rating
0,Chipotle Mexican Grill,126 New,San Francisco,CA,US,37.787086,-122.400212,(415) 512-8113,website':,1,3.0
1,In-N-Out Burger,333 Jefferson St,San Francisco,CA,US,37.8078,-122.418383,(800) 786-1000,website',3,4.0
2,Pearls Deluxe Burgers,708 Post St,San Francisco,CA,US,37.787739,-122.413641,(415) 409-6120,'email':,3,4.0
3,Cafe Du Soleil,200 Fillmore St,San Francisco,CA,US,37.771303,-122.430158,(415) 934-8637,'email':,3,4.5
4,Cafe Zazo,64 14th St,San Francisco,CA,US,37.768642,-122.414821,(415) 626-5555,'email':,1,4.0


In [10]:
# ISSUE 2: As the 'website' column values look to simply be the website column header name and the column header of another (presumebly missig column) header named 'email',
# I will drop this column, as it serves no purpose.

restaurant_df = restaurant_df.drop(['website', 'country'], axis=1)

In [249]:
# The team decided that whilst all 4 datasets (one per team member) will be saved as seperate collections in a non-relational database, all the collections homogeneity
# would be a zipcode column.

In [11]:
# Making a new dataframe, with just the longitude and latitude columns, for refernce in my function, that collects zipcodes
coordinates_df = restaurant_df[['latitude', 'longitude']].copy()

In [12]:
# sanity check on nearly created dataframe, containing coordinates of each restaurant
coordinates_df.head()

Unnamed: 0,latitude,longitude
0,37.787086,-122.400212
1,37.8078,-122.418383
2,37.787739,-122.413641
3,37.771303,-122.430158
4,37.768642,-122.414821


In [13]:
# My dataframe does not contain the zipcode for each restaurant, however by using the long and lat column values, in collaboration 'geopy', I was able to append each restaurant's
# zipcode to the dataframe

# 
gis = GIS("http://www.arcgis.com", "wtcforsyth", "!oV.wnnVpeReETN3UykYsggaMhnpzq1")

def get_zip(coordinates_df, lon_field, lat_field):
    location = reverse_geocode((Geometry({"x":float(coordinates_df[lon_field]), "y":float(coordinates_df[lat_field]), "spatialReference":{"wkid": 4326}})))
    return location['address']['Postal']

# storing returned values in an object
zipcodes = coordinates_df.apply(get_zip, axis=1, lat_field='latitude', lon_field='longitude')


In [14]:
# sanity check on object, containing zipcodes
zipcodes.head()

0    94105
1    94133
2    94109
3    94117
4    94103
dtype: object

In [17]:
zipcodes_df = pd.DataFrame(zipcodes, columns=['zipcode'])
zipcodes_df

Unnamed: 0,zipcode
0,94105
1,94133
2,94109
3,94117
4,94103
...,...
125,94102
126,94102
127,94111
128,94109


In [18]:
restaurant_df = restaurant_df.merge(zipcodes_df.zipcode, left_index=True, right_index=True)
restaurant_df.head()

Unnamed: 0,restaurant,address,city,state,latitude,longitude,tel,price,rating,zipcode
0,Chipotle Mexican Grill,126 New,San Francisco,CA,37.787086,-122.400212,(415) 512-8113,1,3.0,94105
1,In-N-Out Burger,333 Jefferson St,San Francisco,CA,37.8078,-122.418383,(800) 786-1000,3,4.0,94133
2,Pearls Deluxe Burgers,708 Post St,San Francisco,CA,37.787739,-122.413641,(415) 409-6120,3,4.0,94109
3,Cafe Du Soleil,200 Fillmore St,San Francisco,CA,37.771303,-122.430158,(415) 934-8637,3,4.5,94117
4,Cafe Zazo,64 14th St,San Francisco,CA,37.768642,-122.414821,(415) 626-5555,1,4.0,94103


In [19]:
column_names = ['restaurant', 'address', 'city', 'state', 'zipcode', 'latitude', 'longitude', 'tel', 'price', 'rating']
restaurant_df = restaurant_df.reindex(columns=column_names)

In [20]:
restaurant_df.head()

Unnamed: 0,restaurant,address,city,state,zipcode,latitude,longitude,tel,price,rating
0,Chipotle Mexican Grill,126 New,San Francisco,CA,94105,37.787086,-122.400212,(415) 512-8113,1,3.0
1,In-N-Out Burger,333 Jefferson St,San Francisco,CA,94133,37.8078,-122.418383,(800) 786-1000,3,4.0
2,Pearls Deluxe Burgers,708 Post St,San Francisco,CA,94109,37.787739,-122.413641,(415) 409-6120,3,4.0
3,Cafe Du Soleil,200 Fillmore St,San Francisco,CA,94117,37.771303,-122.430158,(415) 934-8637,3,4.5
4,Cafe Zazo,64 14th St,San Francisco,CA,94103,37.768642,-122.414821,(415) 626-5555,1,4.0


In [21]:
# Converting to JSON file
restaurant_df.to_json('will_data.json')