# Importing and preparing restaurant data

## Libraries and settings

In [14]:
# Libraries
import os
import re
import time
import fnmatch
import numpy as np
import pandas as pd

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Get current working directory
print(os.getcwd())

/workspaces/data_analytics/1_AT_Scraping


## Importing data

In [15]:
# Show .csv - files in the directory
flist = fnmatch.filter(os.listdir('.'), '*.csv')
for i in flist:
    print(i)

# Read the data to a pandas data frame
df = pd.read_csv('restaurant_data_zuerich.csv', sep=',', encoding='utf-8')[['web-scraper-order', 
                                                                            'restaurant_name',
                                                                            'total_stars',
                                                                            'number_of_reviews',
                                                                            'cuisine_type',
                                                                            'price_class',
                                                                            'raw_description',
                                                                            'opening_times',
                                                                            'adress',
                                                                            'district',
                                                                            'long_lat',	
                                                                            'all_amenities']]

# Show first records of data frame
df.head()

restaurant_data_zuerich.csv
restaurant_data_zuerich_prepared.csv


Unnamed: 0,web-scraper-order,restaurant_name,total_stars,number_of_reviews,cuisine_type,price_class,raw_description,opening_times,adress,district,long_lat,all_amenities
0,1705095756-1,White Elephant,4.3,(30 reviews),Thailändisch,$$$,The White Elephant restaurant has the reputati...,"12:00 PM - 2:00 PM, 6:00 PM - 10:00 PM",Neumühlequai 42 8006 Zürich,Kreis 6,https://maps.googleapis.com/maps/api/staticmap...,Reservationen möglichBestellung zum AbholenKei...
1,1705095762-2,Bamboo Inn,4.0,(2 reviews),Chinesisch,,,"11:30 AM - 2:00 PM, 6:00 PM - 10:00 PM",Culmannstrasse 19 8006 Zürich,Kreis 6,https://maps.googleapis.com/maps/api/staticmap...,Lieferung möglichBestellung zum AbholenFür Kin...
2,1705095768-3,Restaurant Luca²,4.7,(6 reviews),Mediterran,$$$,"Mediterrane Küche, besondere Weine","11:30 AM - 2:00 PM, 6:30 PM - 11:45 PM",Asylstrasse 81 8032 Zürich,Kreis 7,https://maps.googleapis.com/maps/api/staticmap...,Reservationen möglichKein LieferserviceKeine M...
3,1705095776-4,L'Altro,5.0,(7 reviews),Italienisch,$$$,,"11:30 AM - 2:30 PM, 5:30 PM - 12:00 AM (Folgetag)",Sternenstrasse 11 8002 Zürich,Kreis 2,https://maps.googleapis.com/maps/api/staticmap...,Reservationen möglichKeine MitnahmeSitzplätze ...
4,1705095781-5,ROSI,4.0,(2 reviews),Bayerische Küche,,,6:00 PM - 12:00 AM (Folgetag),Sihlfeldstrasse 89 8004 Zürich,Kreis 4,https://maps.googleapis.com/maps/api/staticmap...,


## Count number of rows and columns in the data frame

In [16]:
# Dimension (rows, columns)
print('Dimension:', df.shape)

# Number of rows
print('Number of rows:', df.shape[0])

# Number of columns
print('Number of columns:', df.shape[1])

Dimension: (235, 12)
Number of rows: 235
Number of columns: 12


### (Regex) 
#### Extract longtitude & latitude of restaurants, 
#### Number of reviews to int
#### District to int 


In [17]:
#lat

lat = []
for i in df['long_lat']:
    d1 = re.findall('center=([-\d.]+)%2C[-\d.]+', i)
    try:
        d2 = d1[0].strip()
    except IndexError:
        d2 = None
    lat.append(d2)

# Save as new variable in the pandas data frame
df['lat'] = pd.Series(lat, dtype="float")

# Print first 5 values
print(df['long_lat'].head(5), '\n')
print(df['lat'].head(5))

#long

long = []
for i in df['long_lat']:
    d1 = re.findall(r'center=[-\d.]+%2C([-\d.]+)', i)
    try:
        d2 = d1[0].strip()
    except IndexError:
        d2 = None
    long.append(d2)

# Save as new variable in the pandas data frame
df['long'] = pd.Series(long, dtype="float")

df['number_of_reviews'] = df['number_of_reviews'].str.extract(r'(\d+)').astype(int)
df['district'] = df['district'].str.extract(r'(\d+)')


# Print first 5 values
print(df['long_lat'].head(5), '\n')
print(df['long'].head(5))

df.head(5)


0    https://maps.googleapis.com/maps/api/staticmap...
1    https://maps.googleapis.com/maps/api/staticmap...
2    https://maps.googleapis.com/maps/api/staticmap...
3    https://maps.googleapis.com/maps/api/staticmap...
4    https://maps.googleapis.com/maps/api/staticmap...
Name: long_lat, dtype: object 

0    47.382439
1    47.380480
2    47.366809
3    47.360064
4    47.376481
Name: lat, dtype: float64
0    https://maps.googleapis.com/maps/api/staticmap...
1    https://maps.googleapis.com/maps/api/staticmap...
2    https://maps.googleapis.com/maps/api/staticmap...
3    https://maps.googleapis.com/maps/api/staticmap...
4    https://maps.googleapis.com/maps/api/staticmap...
Name: long_lat, dtype: object 

0    8.540643
1    8.547261
2    8.561675
3    8.534062
4    8.516238
Name: long, dtype: float64


Unnamed: 0,web-scraper-order,restaurant_name,total_stars,number_of_reviews,cuisine_type,price_class,raw_description,opening_times,adress,district,long_lat,all_amenities,lat,long
0,1705095756-1,White Elephant,4.3,30,Thailändisch,$$$,The White Elephant restaurant has the reputati...,"12:00 PM - 2:00 PM, 6:00 PM - 10:00 PM",Neumühlequai 42 8006 Zürich,6,https://maps.googleapis.com/maps/api/staticmap...,Reservationen möglichBestellung zum AbholenKei...,47.382439,8.540643
1,1705095762-2,Bamboo Inn,4.0,2,Chinesisch,,,"11:30 AM - 2:00 PM, 6:00 PM - 10:00 PM",Culmannstrasse 19 8006 Zürich,6,https://maps.googleapis.com/maps/api/staticmap...,Lieferung möglichBestellung zum AbholenFür Kin...,47.38048,8.547261
2,1705095768-3,Restaurant Luca²,4.7,6,Mediterran,$$$,"Mediterrane Küche, besondere Weine","11:30 AM - 2:00 PM, 6:30 PM - 11:45 PM",Asylstrasse 81 8032 Zürich,7,https://maps.googleapis.com/maps/api/staticmap...,Reservationen möglichKein LieferserviceKeine M...,47.366809,8.561675
3,1705095776-4,L'Altro,5.0,7,Italienisch,$$$,,"11:30 AM - 2:30 PM, 5:30 PM - 12:00 AM (Folgetag)",Sternenstrasse 11 8002 Zürich,2,https://maps.googleapis.com/maps/api/staticmap...,Reservationen möglichKeine MitnahmeSitzplätze ...,47.360064,8.534062
4,1705095781-5,ROSI,4.0,2,Bayerische Küche,,,6:00 PM - 12:00 AM (Folgetag),Sihlfeldstrasse 89 8004 Zürich,4,https://maps.googleapis.com/maps/api/staticmap...,,47.376481,8.516238


### Convert the amenities in to a numeric value ( yes = 1, no = 0, NaN = information missing)

In [18]:
# All current amenities are as a single string in the csv file
common_amenities = [
    ("Reservationen möglich", "Keine Reservationen"),
    ("Lieferung möglich", "Kein Lieferservice"),
    ("Sitzplätze im Freien", "Keine Sitzplätze im Freien"),
    ("Für Kinder geeignet", "Nicht für Kinder geeignet"),
    ("Für Gruppen geeignet", "Nicht für Gruppen geeignet"),
    ("TV", "Kein TV")
]

def check_amenities_presence(amenities, amenity_pairs):
    if not isinstance(amenities, (list, str)): 
        return {} 
    results = {}
    for positive, negative in amenity_pairs:
        if positive in amenities:
            results[positive] = 1
        elif negative in amenities:
            results[positive] = 0
        else:
            results[positive] = None
    return results

# Applying the function to the dataset and updating the DataFrame
for amenity_pair in common_amenities:
    positive_amenity = amenity_pair[0]
    # Create a new column for each positive amenity
    df[positive_amenity] = df['all_amenities'].apply(lambda x: check_amenities_presence(x, [amenity_pair]).get(positive_amenity))


#rename columns

df.rename(columns={'Reservationen möglich': 'takes_reservations', 
                   'Lieferung möglich': 'delivery', 
                   'Sitzplätze im Freien': 'outdoor_seating', 
                   'Für Kinder geeignet': 'good_for_kids', 
                   'Für Gruppen geeignet':'good_for_groups', 
                   'TV': 'tv', 
                   'adress':'address'}, 
                   inplace=True)

# Displaying the first few rows of the updated DataFrame
df.head()


Unnamed: 0,web-scraper-order,restaurant_name,total_stars,number_of_reviews,cuisine_type,price_class,raw_description,opening_times,address,district,long_lat,all_amenities,lat,long,takes_reservations,delivery,outdoor_seating,good_for_kids,good_for_groups,tv
0,1705095756-1,White Elephant,4.3,30,Thailändisch,$$$,The White Elephant restaurant has the reputati...,"12:00 PM - 2:00 PM, 6:00 PM - 10:00 PM",Neumühlequai 42 8006 Zürich,6,https://maps.googleapis.com/maps/api/staticmap...,Reservationen möglichBestellung zum AbholenKei...,47.382439,8.540643,1.0,0.0,1.0,1.0,1.0,1.0
1,1705095762-2,Bamboo Inn,4.0,2,Chinesisch,,,"11:30 AM - 2:00 PM, 6:00 PM - 10:00 PM",Culmannstrasse 19 8006 Zürich,6,https://maps.googleapis.com/maps/api/staticmap...,Lieferung möglichBestellung zum AbholenFür Kin...,47.38048,8.547261,,1.0,,1.0,,
2,1705095768-3,Restaurant Luca²,4.7,6,Mediterran,$$$,"Mediterrane Küche, besondere Weine","11:30 AM - 2:00 PM, 6:30 PM - 11:45 PM",Asylstrasse 81 8032 Zürich,7,https://maps.googleapis.com/maps/api/staticmap...,Reservationen möglichKein LieferserviceKeine M...,47.366809,8.561675,1.0,0.0,1.0,0.0,1.0,1.0
3,1705095776-4,L'Altro,5.0,7,Italienisch,$$$,,"11:30 AM - 2:30 PM, 5:30 PM - 12:00 AM (Folgetag)",Sternenstrasse 11 8002 Zürich,2,https://maps.googleapis.com/maps/api/staticmap...,Reservationen möglichKeine MitnahmeSitzplätze ...,47.360064,8.534062,1.0,,1.0,1.0,1.0,
4,1705095781-5,ROSI,4.0,2,Bayerische Küche,,,6:00 PM - 12:00 AM (Folgetag),Sihlfeldstrasse 89 8004 Zürich,4,https://maps.googleapis.com/maps/api/staticmap...,,47.376481,8.516238,,,,,,


### Convert the price class in to a numeric value ( $ = 1)

In [19]:
def convert_dollar_to_number(dollar_str):
    if pd.isna(dollar_str):
        return None
    return len(dollar_str)

# Applying the conversion function to the 'price_class' column
df['price_class'] = df['price_class'].apply(convert_dollar_to_number)

df.head(5)


Unnamed: 0,web-scraper-order,restaurant_name,total_stars,number_of_reviews,cuisine_type,price_class,raw_description,opening_times,address,district,long_lat,all_amenities,lat,long,takes_reservations,delivery,outdoor_seating,good_for_kids,good_for_groups,tv
0,1705095756-1,White Elephant,4.3,30,Thailändisch,3.0,The White Elephant restaurant has the reputati...,"12:00 PM - 2:00 PM, 6:00 PM - 10:00 PM",Neumühlequai 42 8006 Zürich,6,https://maps.googleapis.com/maps/api/staticmap...,Reservationen möglichBestellung zum AbholenKei...,47.382439,8.540643,1.0,0.0,1.0,1.0,1.0,1.0
1,1705095762-2,Bamboo Inn,4.0,2,Chinesisch,,,"11:30 AM - 2:00 PM, 6:00 PM - 10:00 PM",Culmannstrasse 19 8006 Zürich,6,https://maps.googleapis.com/maps/api/staticmap...,Lieferung möglichBestellung zum AbholenFür Kin...,47.38048,8.547261,,1.0,,1.0,,
2,1705095768-3,Restaurant Luca²,4.7,6,Mediterran,3.0,"Mediterrane Küche, besondere Weine","11:30 AM - 2:00 PM, 6:30 PM - 11:45 PM",Asylstrasse 81 8032 Zürich,7,https://maps.googleapis.com/maps/api/staticmap...,Reservationen möglichKein LieferserviceKeine M...,47.366809,8.561675,1.0,0.0,1.0,0.0,1.0,1.0
3,1705095776-4,L'Altro,5.0,7,Italienisch,3.0,,"11:30 AM - 2:30 PM, 5:30 PM - 12:00 AM (Folgetag)",Sternenstrasse 11 8002 Zürich,2,https://maps.googleapis.com/maps/api/staticmap...,Reservationen möglichKeine MitnahmeSitzplätze ...,47.360064,8.534062,1.0,,1.0,1.0,1.0,
4,1705095781-5,ROSI,4.0,2,Bayerische Küche,,,6:00 PM - 12:00 AM (Folgetag),Sihlfeldstrasse 89 8004 Zürich,4,https://maps.googleapis.com/maps/api/staticmap...,,47.376481,8.516238,,,,,,


## Count and identify missing values (if any)

In [20]:
# Count missing values
print(pd.isna(df).sum())

# Identify rows with missing values
df[df.isna().any(axis=1)].head()

web-scraper-order       0
restaurant_name         0
total_stars             0
number_of_reviews       0
cuisine_type            0
price_class            72
raw_description       176
opening_times          25
address                 5
district               13
long_lat                0
all_amenities          28
lat                     0
long                    0
takes_reservations     84
delivery               79
outdoor_seating        72
good_for_kids          73
good_for_groups        68
tv                     81
dtype: int64


Unnamed: 0,web-scraper-order,restaurant_name,total_stars,number_of_reviews,cuisine_type,price_class,raw_description,opening_times,address,district,long_lat,all_amenities,lat,long,takes_reservations,delivery,outdoor_seating,good_for_kids,good_for_groups,tv
1,1705095762-2,Bamboo Inn,4.0,2,Chinesisch,,,"11:30 AM - 2:00 PM, 6:00 PM - 10:00 PM",Culmannstrasse 19 8006 Zürich,6,https://maps.googleapis.com/maps/api/staticmap...,Lieferung möglichBestellung zum AbholenFür Kin...,47.38048,8.547261,,1.0,,1.0,,
3,1705095776-4,L'Altro,5.0,7,Italienisch,3.0,,"11:30 AM - 2:30 PM, 5:30 PM - 12:00 AM (Folgetag)",Sternenstrasse 11 8002 Zürich,2,https://maps.googleapis.com/maps/api/staticmap...,Reservationen möglichKeine MitnahmeSitzplätze ...,47.360064,8.534062,1.0,,1.0,1.0,1.0,
4,1705095781-5,ROSI,4.0,2,Bayerische Küche,,,6:00 PM - 12:00 AM (Folgetag),Sihlfeldstrasse 89 8004 Zürich,4,https://maps.googleapis.com/maps/api/staticmap...,,47.376481,8.516238,,,,,,
5,1705095787-6,Ototo,4.3,7,Sushi,3.0,,5:30 PM - 11:00 PM,Nordstrasse 199 8037 Zürich,10,https://maps.googleapis.com/maps/api/staticmap...,Reservationen möglichLieferung möglichBestellu...,47.392527,8.530738,1.0,1.0,1.0,1.0,1.0,1.0
6,1705095793-7,Chimy’s,4.7,3,Vegetarisch,,,"11:30 AM - 2:00 PM, 6:00 PM - 11:00 PM",Neugasse 76 8005 Zürich,5,https://maps.googleapis.com/maps/api/staticmap...,TV,47.38329,8.528432,,,,,,1.0


## Remove the restaurants with no amenitie info

In [21]:
df = df.dropna(subset=['all_amenities'])

# Dimension (rows, columns)
print('Dimension:', df.shape)

# Number of rows
print('Number of rows:', df.shape[0])

# Number of columns
print('Number of columns:', df.shape[1])

Dimension: (207, 20)
Number of rows: 207
Number of columns: 20


## Count and remove the restaurants with 1 star ratings

In [22]:
one_star_count = df['total_stars'].value_counts().get(1.0, 0)
print("Number of 1-star ratings:", one_star_count)

df = df[df['total_stars'] != 1.0]

print('Number of rows:', df.shape[0])

Number of 1-star ratings: 1
Number of rows: 206


## Count and identify duplicated values (if any)

In [23]:
# Count duplicated values
print(df.duplicated().sum())

# Identify rows with duplicated values, e.g.:
df[df[['web-scraper-order', 'restaurant_name', 'address']].duplicated()]

0


Unnamed: 0,web-scraper-order,restaurant_name,total_stars,number_of_reviews,cuisine_type,price_class,raw_description,opening_times,address,district,long_lat,all_amenities,lat,long,takes_reservations,delivery,outdoor_seating,good_for_kids,good_for_groups,tv


### Save data to file

In [24]:
df.to_csv('restaurant_data_zuerich_prepared.csv', 
          sep=",", 
          encoding='utf-8',
          index=False)