In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd

functions

In [2]:
import re

#Basic String Cleaning:
def clean_str(text):
    search  = ["آ","إ","أ","ة","ى","  "]
    replace = ["ا","ا","ا","ه","ي"," "]
    
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')

    #remove tashkeel
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel,"", text)
    
    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])
        
    # Remove unwanted symbols, keeping / and % and.
    text = re.sub(r'[^\w\s/٪.]', '', text) 
    
    return text.strip() 

import unicodedata

def remove_accents(text):
    if isinstance(text, str):
        return ''.join(
            c for c in unicodedata.normalize('NFKD', text)
            if not unicodedata.combining(c)
        )
    return text


def normalize_arabic(text):
    if not isinstance(text, str):
        return text
    text = re.sub(r'[\u064B-\u0652\u0640\u200f\u200e\u00A0]', '', text)  # remove diacritics, tatweel, NBSP
    return text.strip()

In [3]:
cafes = pd.read_csv('riyadh_cafes.csv')
cafes = cafes.drop('index', axis=1)
cafes = cafes.replace("Null", np.nan)
cafes = cafes.replace("null", np.nan)
cafes = cafes.rename(columns={
    'lan': 'latitude',
    'lon': 'longitude'})
print(cafes.shape)
cafes.head()


(2609, 7)


Unnamed: 0,coffeeName,rating,rating_count,url,24_hours,longitude,latitude
0,Cacti Cafe,4.2,2212,https://www.google.com/maps/place/Cacti+Cafe/d...,True,46.7356133,24.8339855
1,فازا قهوة مختصة,4.3,889,https://www.google.com/maps/place/%D9%81%D8%A7...,False,46.6973369,24.7724577
2,ناريز,3.8,39,https://www.google.com/maps/place/%D9%86%D8%A7...,False,46.8521092,24.8072766
3,Fc Lounge - اف سي لاونج,3.5,539,https://www.google.com/maps/place/Fc+Lounge+-+...,True,46.7680585,24.8131149
4,PEAKS,4.6,25,https://www.google.com/maps/place/PEAKS/data=!...,False,46.6348092,24.742045


Cleaning the coffeeName column

lowercase, strip, remove extra spaces

In [4]:
cafes['coffeeName']  = cafes['coffeeName'].str.lower().str.strip().str.replace(r'\s+', ' ', regex=True).apply(clean_str).apply(remove_accents).apply(normalize_arabic)

In [5]:
cafes[cafes.coffeeName.str.contains("mammabunz")].coffeeName.unique().tolist()

['ماما بنز كافيه  mammabunz cafe', 'mammabunz cafe']

In [6]:
cafes[cafes.coffeeName.str.contains(r"كيان")].coffeeName.unique().tolist()

['كيان كافيه',
 'كيان kyan',
 'كيان  kyan',
 'kyanكيان',
 'كيان كافيهkyan cafa',
 'كنافه كيان',
 'مقهي كيان الافراح',
 'kyan  كيان']

In [7]:
cafes[cafes.coffeeName.str.contains(r"kyan")].coffeeName.unique().tolist()

['kyan',
 'كيان kyan',
 'kyan cafe',
 'كيان  kyan',
 'kyanكيان',
 'كيان كافيهkyan cafa',
 'kyan  كيان',
 'kyan urija branch']

In [16]:
cafes[cafes['coffeeName']=='مقهي كيان الافراح']['url'].iloc[0]

'https://www.google.com/maps/place/%D9%85%D9%82%D9%87%D9%89+%D9%83%D9%8A%D8%A7%D9%86+%D8%A7%D9%84%D8%A7%D9%81%D8%B1%D8%A7%D8%AD%E2%80%AD/data=!4m7!3m6!1s0x3e2f19e71dd342c7:0x58bf808819464d88!8m2!3d24.5734954!4d46.575467!16s%2Fg%2F11rms7ws8r!19sChIJx0LTHecZLz4RiE1GGYiAv1g?authuser=0&hl=en&rclk=1'

Some data cleaning for first few coffee places

In [9]:
arabic_starbucks_variants = [
    'ستار بكس',
    'ستار بوكس',
    'كافي استار بوكس',
    'استار بوكس',
    'استاربوكس'
]
mask_star = cafes['coffeeName'].str.contains('starbucks', case=False, na=False) & ~cafes['coffeeName'].str.contains('reserve', case=False, na=False)
cafes.loc[mask_star, 'coffeeName'] = 'starbucks'
cafes.loc[cafes['coffeeName'].isin(arabic_starbucks_variants), 'coffeeName'] = 'starbucks'

mask_mccafe = (
    cafes['coffeeName'].str.contains(r'\bmccafe\b', case=False, na=False) |
    cafes['coffeeName'].str.contains('ماك كافيه', na=False)
)
cafes.loc[mask_mccafe, 'coffeeName'] = 'mccafe'


mask = cafes['coffeeName'].str.contains(r'(address|عنوان)', case=False, na=False)
cafes.loc[mask, 'coffeeName'] = 'coffee address'

mask_dunkin = cafes['coffeeName'].str.contains(r'dunkin|دانكن', case=False, na=False)
cafes.loc[mask_dunkin, 'coffeeName'] = 'dunkin donuts'

mask_barns = cafes['coffeeName'].str.contains(r'\bbarns\b|بارنز', case=False, na=False)
cafes.loc[mask_barns, 'coffeeName'] = 'barns'

mask_drcafe = cafes['coffeeName'].str.contains(r'dr\.?\s*cafe|د\.?كيف', case=False, na=False)
cafes.loc[mask_drcafe, 'coffeeName'] = 'dr.cafe coffee'


mask_java_cafe = (
    cafes['coffeeName'].str.contains(r'java\s*cafe|جافا\s*كافيه', case=False, na=False) &
    ~cafes['coffeeName'].str.contains(r'java\s*time|جافا\s*تايم', case=False, na=False)
)

cafes.loc[mask_java_cafe, 'coffeeName'] = 'java cafe'

mask_java_time = cafes['coffeeName'].str.contains(r'java\s*time|جافا\s*تايم', case=False, na=False)

cafes.loc[mask_java_time, 'coffeeName'] = 'java time'

mask_dan = cafes['coffeeName'].str.contains(r'\bdan\s*cafe\b|دان\s*كافيه', case=False, na=False)
cafes.loc[mask_dan, 'coffeeName'] = 'dan cafe'
mask_dana = cafes['coffeeName'].str.contains(r'\bdana\s+(cafe|coffee)\b|دانه\s*كافيه', case=False, na=False)
cafes.loc[mask_dana, 'coffeeName'] = 'dana cafe'

mask_mammabunz = cafes['coffeeName'].str.contains(r'mammabunz|ماما\s*بنز', case=False, na=False)
cafes.loc[mask_mammabunz, 'coffeeName'] = 'mammabunz cafe'


  mask = cafes['coffeeName'].str.contains(r'(address|عنوان)', case=False, na=False)
  mask_dana = cafes['coffeeName'].str.contains(r'\bdana\s+(cafe|coffee)\b|دانه\s*كافيه', case=False, na=False)


removing cafe or coffee only if its at the end as a standalone word

In [11]:
cafes['coffeeName'] = cafes['coffeeName'].str.replace(r'\b(coffee|cafe|كافيه)$', '', regex=True).str.strip()
cafes['coffeeName'] = cafes['coffeeName'].str.strip()

more data cleaning

In [12]:
cafes.coffeeName.value_counts()

coffeeName
dunkin donuts                 54
dr.cafe                       53
barns                         51
mccafe                        47
starbucks                     44
                              ..
ريشيو  ratio                   1
ديوانيه ومقهي الوتين           1
quill                          1
شاي السرور                     1
مقهي سوداني maqaa sudaniun     1
Name: count, Length: 1936, dtype: int64

missing coordinates cleaning

In [67]:
missing_coords = cafes[cafes['longitude'].isnull() | cafes['latitude'].isnull()]
missing_coords

Unnamed: 0,coffeeName,rating,rating_count,url,24_hours,longitude,latitude
189,..,3.3,4,https://www.google.com/maps/data=!4m7!3m6!1s0x3e2f05a86b5bc4ef:0x15d0e99b727a01ba!8m2!3d24.6448201!4d46.7259579!16s%2Fg%2F11t3_cm302!19sChIJ78Rba6gFLz4RugF6cpvp0BU?authuser=0&hl=en&rclk=1,False,,
363,استراحة الامير,4.6,8,https://www.google.com/maps/place/%D8%A7%D8%B3%D8%AA%D8%B1%D8%A7%D8%AD%D8%A9+%D8%A7%D9%84%D8%A7%D9%85%D9%8A%D8%B1%E2%80%AD/data=!4m7!3m6!1s0x3e2effb3ee940f6b:0x9d98c2d3272180d!8m2!3d24.8144107!4d46.7653716!16s%2Fg%2F11h3wybjw5!19sChIJaw-U7rP_Lj4RDRhyMi2M2Qk?authuser=0&hl=en&rclk=1,False,,
427,Veloce Cafe,4.5,22,https://www.google.com/maps/place/Veloce+Cafe/data=!4m7!3m6!1s0x3e2efdf26aa8c68b:0x9b8711b1e87fa144!8m2!3d24.7622419!4d46.7227265!16s%2Fg%2F11t2f4_t24!19sChIJi8aoavL9Lj4RRKF_6LERh5s?authuser=0&hl=en&rclk=1,False,,
1452,الايمان,4.1,16,https://www.google.com/maps/place/%D8%A7%D9%84%D8%A7%D9%8A%D9%85%D8%A7%D9%86%E2%80%AD/data=!4m7!3m6!1s0x3e2f117dc77cdca5:0xe3a38d2147aa4b06!8m2!3d24.566332!4d46.6676809!16s%2Fg%2F11gtsw4d2j!19sChIJpdx8x30RLz4RBkuqRyGNo-M?authuser=0&hl=en&rclk=1,False,,


In [51]:
print(missing_coords[['coffeeName', 'url']])
pd.set_option('display.max_colwidth', None)


          coffeeName  \
189               ..   
363   استراحة الامير   
427      Veloce Cafe   
1452         الايمان   

                                                                                                                                                                                                                                                                                            url  
189                                                                                                 https://www.google.com/maps/data=!4m7!3m6!1s0x3e2f05a86b5bc4ef:0x15d0e99b727a01ba!8m2!3d24.6448201!4d46.7259579!16s%2Fg%2F11t3_cm302!19sChIJ78Rba6gFLz4RugF6cpvp0BU?authuser=0&hl=en&rclk=1  
363   https://www.google.com/maps/place/%D8%A7%D8%B3%D8%AA%D8%B1%D8%A7%D8%AD%D8%A9+%D8%A7%D9%84%D8%A7%D9%85%D9%8A%D8%B1%E2%80%AD/data=!4m7!3m6!1s0x3e2effb3ee940f6b:0x9d98c2d3272180d!8m2!3d24.8144107!4d46.7653716!16s%2Fg%2F11h3wybjw5!19sChIJaw-U7rP_Lj4RDRhyMi2M2Qk?authuser=0&hl=en&rclk=1  
427      

Removing bad data --> .. is not a place and url doesnt work

In [84]:
cafes = cafes[cafes['coffeeName'] != '..']
cafes.head()

Unnamed: 0,coffeeName,rating,rating_count,url,24_hours,longitude,latitude
0,Cacti Cafe,4.2,2212,https://www.google.com/maps/place/Cacti+Cafe/data=!4m7!3m6!1s0x3e2efde0d2059f1d:0xfca400b51ca140cc!8m2!3d24.8339855!4d46.7356133!16s%2Fg%2F11mwzvn0zx!19sChIJHZ8F0uD9Lj4RzEChHLUApPw?authuser=0&hl=en&rclk=1,True,46.7356133,24.8339855
1,فازا قهوة مختصة,4.3,889,https://www.google.com/maps/place/%D9%81%D8%A7%D8%B2%D8%A7+%D9%82%D9%87%D9%88%D8%A9+%D9%85%D8%AE%D8%AA%D8%B5%D8%A9%E2%80%AD/data=!4m7!3m6!1s0x3e2efdc6c840b3ed:0x792ad260e10b7e!8m2!3d24.7724577!4d46.6973369!16s%2Fg%2F11mwkdclc9!19sChIJ7bNAyMb9Lj4RfgvhYNIqeQA?authuser=0&hl=en&rclk=1,False,46.6973369,24.7724577
2,ناريز,3.8,39,https://www.google.com/maps/place/%D9%86%D8%A7%D8%B1%D9%8A%D8%B2%E2%80%AD/data=!4m7!3m6!1s0x3e2faacd291e6591:0x3e908558d2f0f63f!8m2!3d24.8072766!4d46.8521092!16s%2Fg%2F11c5_5k_zz!19sChIJkWUeKc2qLz4RP_bw0liFkD4?authuser=0&hl=en&rclk=1,False,46.8521092,24.8072766
3,Fc Lounge - اف سي لاونج,3.5,539,https://www.google.com/maps/place/Fc+Lounge+-+%D8%A7%D9%81+%D8%B3%D9%8A+%D9%84%D8%A7%D9%88%D9%86%D8%AC%E2%80%AD/data=!4m7!3m6!1s0x3e2eff79d7dd770d:0xbd5029a7e12e0c2b!8m2!3d24.8131149!4d46.7680585!16s%2Fg%2F11lgkw1ffk!19sChIJDXfd13n_Lj4RKwwu4acpUL0?authuser=0&hl=en&rclk=1,True,46.7680585,24.8131149
4,PEAKS,4.6,25,https://www.google.com/maps/place/PEAKS/data=!4m7!3m6!1s0x3e2ee380585f0151:0xab784cd32a1e3d85!8m2!3d24.742045!4d46.6348092!16s%2Fg%2F11s0qh89n0!19sChIJUQFfWIDjLj4RhT0eKtNMeKs?authuser=0&hl=en&rclk=1,False,46.6348092,24.742045


clicking url in google maps, filling in correct coordinates for null values

In [90]:
cafes.loc[363 ,['longitude','latitude']] =  46.76535014232869, 24.814398561155688
cafes.loc[427 ,['longitude','latitude']] =  46.7227265, 24.762256513537807 
cafes.loc[1452 ,['longitude','latitude']] = 46.66768090000001, 24.566332000000113

In [48]:
cafes_gpd = gpd.geopandas(cafes, geometry=gpd.points_from_xy(
        cafes['lon'],
          cafes['lan']
          ))

TypeError: 'module' object is not callable