### Web scraping for the Maroof website

In [2]:
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd

### Get store IDs

In [5]:
# import requests
# import pandas as pd
# from concurrent.futures import ThreadPoolExecutor, as_completed

# Paged business-search endpoint; the page offset (skipCount) is filled in
# per request and every page asks for 500 results.
search_url_template = "https://api.thiqah.sa/maroof/public/api/app/business/search?keyword=&businessTypeId=&businessTypeSubCategoryId=&regionId=&cityId=&certificationType=&sortBy=2&sortDirection=2&sorting=&skipCount={}&maxResultCount=500"

# One URL per page: offsets 0, 500, ..., 66000.
api_urls = [search_url_template.format(offset) for offset in range(0, 66001, 500)]

# Key sent in the 'apikey' header of every request.
# NOTE(review): the key is hardcoded in the notebook; consider loading it
# from an environment variable before sharing this file.
api_key = "c1qesecmag8GSbxTHGRjfnMFBzAH7UAN"
headers = dict(apikey=api_key)

# Fetch store information from API
def fetch_store_info(api_url, min_reviews=10, timeout=30):
    """Fetch one page of the business search API and return qualifying stores.

    Parameters
    ----------
    api_url : str
        Fully formed search URL, paging parameters included.
    min_reviews : int, optional
        Minimum ``totalReviews`` a store must have to be kept (default 10).
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises. Without a
        timeout a stalled connection would block its worker thread forever.

    Returns
    -------
    list of dict
        One 12-field record per qualifying store. Empty when the response is
        not HTTP 200 or its body is not valid JSON.

    Raises
    ------
    Exception
        Whatever ``requests.get`` raises (connection error, timeout, ...) is
        not caught here and propagates to the caller.
    """
    response = requests.get(api_url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        print("Done")
    else:
        print(f"Failed to retrieve data from {api_url}")
        return []

    try:
        data = response.json()
    except ValueError as e:
        # Same handling style as get_store_reviews: report and skip the page.
        print(f"Error parsing JSON from {api_url}: {e}")
        return []

    store_info = []

    # The `or` fallbacks matter: dict.get() only uses its default when the key
    # is missing, so a key that is present with a JSON null would otherwise
    # reach the loop / comparison / nested .get() as None and raise.
    for item in data.get('items') or []:
        if (item.get('totalReviews') or 0) >= min_reviews:
            business_type = item.get('businessType') or {}
            store_info.append({
                'Name': item.get('name'),
                'NameAr': item.get('nameAr'),
                'LocalizedName': item.get('localizedName'),
                'BusinessType': business_type.get('name'),
                'OtherTypeName': item.get('otherTypeName'),
                'IsPopularBusiness': item.get('isPopularBusiness'),
                'TotalReviews': item.get('totalReviews'),
                'Rating': item.get('rating'),
                'Id': item.get('id'),
                'ActiveStatus': item.get('activeStatus'),
                'OwnerAllowStatus': item.get('ownerAllowStatus'),
                'CertificationStatus': item.get('certificationStatus')
            })

    return store_info

# Fetch store info in parallel
def fetch_store_info_parallel(api_urls, max_workers=20):
    """Fetch every search page concurrently and merge the store records.

    Parameters
    ----------
    api_urls : iterable of str
        Search-page URLs, each passed to ``fetch_store_info``.
    max_workers : int, optional
        Size of the thread pool (default 20).

    Returns
    -------
    list of dict
        Store records from all pages, concatenated in the order of
        ``api_urls``. Pages whose fetch raised are reported and skipped.
    """
    all_store_info = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit everything up front so the requests still run concurrently.
        submitted = [(url, executor.submit(fetch_store_info, url)) for url in api_urls]
        # Collect in submission order rather than completion order, so the
        # resulting row order is the same on every run.
        for url, future in submitted:
            try:
                result = future.result()
                if result:
                    all_store_info.extend(result)
            except Exception as exc:
                # Name the page that failed so it can be retried.
                print(f"Exception occurred for {url}: {exc}")

    return all_store_info

# Download every search page and gather the qualifying store records.
store_info = fetch_store_info_parallel(api_urls)

# One row per store, one column per record key.
stores_df = pd.DataFrame(data=store_info)

# Last expression of the cell: rich display of the frame.
stores_df


Done
Done
Done
Done
Done
Done


In [19]:
# Sanity check: (rows, columns) of the scraped store table.
stores_df.shape

(66400, 12)

In [20]:
# Persist the store table; the positional index is not written.
stores_df.to_csv(path_or_buf='all_stores.csv', index=False)


In [5]:
# Keep only the stores that have at least 10 reviews.
df10 = stores_df.loc[stores_df['TotalReviews'] >= 10]

In [None]:
# Persist the filtered stores. index=False matches the all_stores.csv export
# and keeps the leftover positional index from becoming an unnamed first column.
df10.to_csv('10_Review_Stores.csv', index=False)

### Get store comments and reviews

In [6]:
# How many stores share each review count, most common count first.
df10['TotalReviews'].value_counts()

TotalReviews
11     107
10      76
12      62
13      58
15      47
      ... 
255      1
967      1
98       1
171      1
656      1
Name: count, Length: 167, dtype: int64

In [4]:
# (rows, columns) of the filtered store table.
# NOTE(review): the saved output below is identical to stores_df.shape and
# this cell's execution count is out of sequence; the output may be stale,
# so re-run after a kernel restart to confirm.
df10.shape

(66400, 12)

In [7]:

# Reviews endpoint for one store; {store_id} is substituted per request.
api_url_template = "https://api.thiqah.sa/maroof/public/api/app/business/{store_id}/reviews?skipCount=0&maxResultCount=10000&sortColumn=&sortDirection=2&onlyReviewsWithComments=false"

# Key sent in the 'apikey' header (same value as in the store-search cell).
# NOTE(review): hardcoded credential; consider loading it from the environment.
api_key = "c1qesecmag8GSbxTHGRjfnMFBzAH7UAN"
headers = {'apikey': api_key}

def get_store_reviews(store_id, timeout=30):
    """Download the reviews of one store.

    Parameters
    ----------
    store_id
        Identifier substituted into ``api_url_template``.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises. Without a
        timeout a stalled connection would block its worker thread forever.

    Returns
    -------
    list of dict or None
        One record per review with the keys ``BusinessId``, ``review_id``,
        ``rating``, ``comment`` and ``creation_date``. ``None`` when the
        response is not HTTP 200 or its body is not valid JSON.

    Raises
    ------
    Exception
        Whatever ``requests.get`` raises (connection error, timeout, ...) is
        not caught here and propagates to the caller.
    """
    print(store_id)
    api_url = api_url_template.format(store_id=store_id)
    print(api_url)

    response = requests.get(api_url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print("Failed to retrieve data.")
        return None

    try:
        data = response.json()
    except ValueError as e:
        print(f"Error parsing JSON: {e}")
        return None

    # `or []` covers an 'items' key that is present but null; dict.get()'s
    # default would not apply in that case.
    return [
        {
            'BusinessId': store_id,
            'review_id': item.get('id'),
            'rating': item.get('rating'),
            'comment': item.get('comment'),
            'creation_date': item.get('creationDate')
        }
        for item in data.get('items') or []
    ]

def fetch_reviews_parallel(store_ids, max_workers=20):
    """Download the reviews of many stores concurrently.

    Parameters
    ----------
    store_ids : iterable
        Store identifiers, each passed to ``get_store_reviews``.
    max_workers : int, optional
        Size of the thread pool (default 20).

    Returns
    -------
    list of dict
        Review records of all stores, concatenated in the order of
        ``store_ids``. Stores whose fetch raised are reported and skipped;
        stores that returned nothing contribute no rows.
    """
    all_reviews = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit everything up front so the requests still run concurrently.
        submitted = [
            (store_id, executor.submit(get_store_reviews, store_id))
            for store_id in store_ids
        ]
        # Collect in submission order rather than completion order, so the
        # resulting row order is the same on every run.
        for store_id, future in submitted:
            try:
                result = future.result()
                if result:
                    all_reviews.extend(result)
            except Exception as exc:
                print(f"{store_id} generated an exception: {exc}")
    return all_reviews


# Download the reviews of every store listed in df10, using the thread pool.
all_reviews = fetch_reviews_parallel(list(df10['Id'].to_numpy()))

# One row per review.
df_reviews = pd.DataFrame(data=all_reviews)

# Print a label, then let the frame render as the cell's output.
print("DataFrame:")
df_reviews


113480
https://api.thiqah.sa/maroof/public/api/app/business/113480/reviews?skipCount=0&maxResultCount=10000&sortColumn=&sortDirection=2&onlyReviewsWithComments=false
79729
https://api.thiqah.sa/maroof/public/api/app/business/79729/reviews?skipCount=0&maxResultCount=10000&sortColumn=&sortDirection=2&onlyReviewsWithComments=false
60202
https://api.thiqah.sa/maroof/public/api/app/business/60202/reviews?skipCount=0&maxResultCount=10000&sortColumn=&sortDirection=2&onlyReviewsWithComments=false
203165
https://api.thiqah.sa/maroof/public/api/app/business/203165/reviews?skipCount=0&maxResultCount=10000&sortColumn=&sortDirection=2&onlyReviewsWithComments=false
91371
https://api.thiqah.sa/maroof/public/api/app/business/91371/reviews?skipCount=0&maxResultCount=10000&sortColumn=&sortDirection=2&onlyReviewsWithComments=false
232851
https://api.thiqah.sa/maroof/public/api/app/business/232851/reviews?skipCount=0&maxResultCount=10000&sortColumn=&sortDirection=2&onlyReviewsWithComments=false
105227
htt

Unnamed: 0,BusinessId,review_id,rating,comment,creation_date
0,60202,364189,1,كنت أتعامل معهم أسعارهم أرخص لكن آخر مرة حصلت ...,2024-05-11T05:56:29.327
1,60202,356203,1,عدم المصداقية في اسعار المنتجات,2024-03-06T08:08:24.19
2,60202,317781,1,المتجر سئ جدا في مصداقيته في بيانات البضاعة وا...,2023-02-18T15:41:20.32
3,60202,312616,1,,2023-01-18T10:17:30.503
4,60202,309154,1,,2023-01-01T03:54:11.487
...,...,...,...,...,...
62879,44461,61446,5,اكثر مايميز المتجر سرعة الرد والتوصيل السريع و...,2018-08-28T19:16:42.617
62880,44461,60746,5,التعامل مع متجر توليب تجربه تستحق اعادتها اكثر...,2018-07-31T02:51:11.543
62881,44461,60443,5,تجربتي معها جمييله وانسانه ذوق واخلاق ومتعاونه...,2018-07-22T16:43:45.897
62882,44461,60180,5,متجر توليب جميل جدا والتعامل اجمل ♥️,2018-07-16T08:09:20.407


In [8]:
# Number of downloaded reviews per store, largest first.
df_reviews['BusinessId'].value_counts()

BusinessId
39976     4596
77092     3333
48246     1813
24519     1553
257509    1204
          ... 
95661       10
91678       10
86949       10
95180       10
60202       10
Name: count, Length: 1020, dtype: int64

In [9]:
# Persist the reviews. index=False matches the all_stores.csv export and keeps
# the positional index from being written as an unnamed first column.
df_reviews.to_csv('10_Reviews.csv', index=False)