# Load Testing Datasets Creation

- First of all we need to import the necessary libraries.

In [1]:
import itertools
import numpy as np
import pandas as pd
from faker import Faker
from geopy.geocoders import Nominatim

# Fix Faker's seed up front so that re-running the notebook from a fresh
# kernel yields the same generated values.
Faker.seed(0) # For reproducibility

- We will read the locations (unique city & state combinations for which we have businesses in the dataset).

In [2]:
# Each CSV contributes only its `locations` column: one Series for the 50k
# subset, one for the 100k subset and one for the full dataset.
locations_50k = pd.read_csv('./data/locations_50k.csv')['locations']
locations_100k = pd.read_csv('./data/locations_100k.csv')['locations']
locations = pd.read_csv('./data/locations_full.csv')['locations']

- We will create an instance of the `Nominatim` class from the `geopy` module, to geocode the locations.

In [3]:
# Geocoding client used below to resolve every location string to coordinates.
geolocator = Nominatim(
    user_agent="radius_query_benchmarking",
    timeout=5,
)

- We will iterate over the locations and geocode each location to retrieve its latitude-longitude pair.
- This will take roughly 10 minutes to complete because the `Nominatim` geocoder enforces a request limit. It is still a convenient choice because, unlike other geocoders, it does not require an API key.

In [4]:
# Build the subset lookups once. `location in series.values` rescans the whole
# array on every iteration; a set answers the same membership question in O(1).
locations_50k_lookup = set(locations_50k)
locations_100k_lookup = set(locations_100k)

center_lat_long_pairs_50k = []
center_lat_long_pairs_100k = []
center_lat_long_pairs = []

# NOTE(review): an exception raised by `geocode` (e.g. a timeout) aborts the
# whole loop - consider wrapping the call with geopy's rate limiter / retry
# helper if that turns out to be a problem.
for location in locations:
    location_data = geolocator.geocode(location)

    # `geocode` returned nothing for this string: report it and move on.
    if location_data is None:
        print(f"Failed to geocode {location}")
        continue

    center_latitude = location_data.latitude
    center_longitude = location_data.longitude
    center = (float(center_latitude), float(center_longitude))

    # Every geocoded location belongs to the full dataset; the subsets only
    # receive the locations listed in their own CSV.
    center_lat_long_pairs.append(center)

    if location in locations_50k_lookup:
        center_lat_long_pairs_50k.append(center)

    if location in locations_100k_lookup:
        center_lat_long_pairs_100k.append(center)

Failed to geocode Nashville-Davidson metropolitan government (balance), TN
Failed to geocode Webster Grvs, MO
Failed to geocode Redingtn Shor, FL
Failed to geocode New Prt Rchy, FL
Failed to geocode Staint Albert, AB
Failed to geocode Indianpolis, IN
Failed to geocode TWN N CNTRY, FL
Failed to geocode Holland Southampton, PA
Failed to geocode Royford, PA
Failed to geocode Twn N Cntry, FL
Failed to geocode Hadden, NJ
Failed to geocode Mehville, MO
Failed to geocode West Deptford Townsh, NJ
Failed to geocode Mc Cordsville, IN
Failed to geocode Creve Couer, MO
Failed to geocode Mount Juliet, TX
Failed to geocode Pennsaulen, NJ
Failed to geocode Thonosassa, FL
Failed to geocode Glenoldan, PA
Failed to geocode Had Twp, NJ
Failed to geocode Westmont - Haddon Towsship, NJ
Failed to geocode Zephryhills, FL
Failed to geocode VC Highlands, NV
Failed to geocode St. Loius, MO
Failed to geocode Belleair Blf, FL
Failed to geocode Cherry Hil, NJ
Failed to geocode Claerwater, FL
Failed to geocode Phon

- We will create an instance of the `Faker` class from the `faker` module, to generate data around the centers of the geocoded locations.

In [5]:
# Notebook-wide Faker instance; the generator functions defined further down
# call its `coordinate`, `random_int` and `randomize_nb_elements` methods.
f = Faker()

- We will generate 15,000 latitude-longitude pairs around the centers of the geocoded locations for each of the datasets, to be used in the load testing.

In [6]:
def generate_stores_load_test_data(center_lat_long_pairs, num_of_samples, fake=None):
    """Generate randomised query parameters for the store-search load test.

    The centers are visited round-robin. For every visit a latitude and a
    longitude are drawn with ``coordinate(center, radius=0.02)`` and a maximum
    distance is drawn with ``random_int(min=1, max=5)``.

    Parameters
    ----------
    center_lat_long_pairs : sequence of (latitude, longitude) tuples
        Centers around which the coordinates are generated.
    num_of_samples : int
        Exact number of samples to produce.
    fake : optional
        Object exposing ``coordinate`` and ``random_int``. Defaults to the
        notebook-level ``f`` instance when omitted.

    Returns
    -------
    tuple of three lists
        ``(latitudes, longitudes, max_distances)``, each holding
        ``num_of_samples`` entries (or none when no centers are given).
    """
    if fake is None:
        fake = f  # fall back to the notebook-level Faker instance

    latitudes = []
    longitudes = []
    max_distances = []

    # `islice` caps the endless cycle at exactly `num_of_samples` items. The
    # earlier `i == num_of_samples` check ran after appending and therefore
    # produced one sample too many.
    centers = itertools.islice(itertools.cycle(center_lat_long_pairs), num_of_samples)
    for lat, long in centers:
        latitudes.append(fake.coordinate(lat, radius=0.02))
        longitudes.append(fake.coordinate(long, radius=0.02))
        max_distances.append(fake.random_int(min=1, max=5))

    return latitudes, longitudes, max_distances

In [7]:
# Store-search parameters for the 50k subset.
(latitudes_50k,
 longitudes_50k,
 max_distances_50k) = generate_stores_load_test_data(
    center_lat_long_pairs_50k, num_of_samples=15_000)

# Store-search parameters for the 100k subset.
(latitudes_100k,
 longitudes_100k,
 max_distances_100k) = generate_stores_load_test_data(
    center_lat_long_pairs_100k, num_of_samples=15_000)

# Store-search parameters for the full dataset.
(latitudes,
 longitudes,
 max_distances) = generate_stores_load_test_data(
    center_lat_long_pairs, num_of_samples=15_000)

- We will create a load testing DataFrame for each of the datasets.

In [8]:
# One row per store-search request against the 50k subset.
search_stores_params_df_50k = pd.DataFrame(
    data={
        'latitude': latitudes_50k,
        'longitude': longitudes_50k,
        'max_distance': max_distances_50k,
    },
)

In [9]:
# One row per store-search request against the 100k subset.
search_stores_params_df_100k = pd.DataFrame(
    data={
        'latitude': latitudes_100k,
        'longitude': longitudes_100k,
        'max_distance': max_distances_100k,
    },
)

In [10]:
# One row per store-search request against the full dataset.
search_stores_params_df = pd.DataFrame(
    data={
        'latitude': latitudes,
        'longitude': longitudes,
        'max_distance': max_distances,
    },
)

- We will split each of `search_stores_params_df_50k`, `search_stores_params_df_100k` & `search_stores_params_df` into 3 parts and save each part as a CSV file.

In [11]:
def stores_load_test_data_to_csv(df, dataset_size, vus=(50, 100, 200)):
    """Split the store-search parameters into three parts and write each to CSV.

    The split is positional and based on the row count: the first part holds
    the first ``len(df) // 3`` rows, the second runs up to row
    ``2 * len(df) // 3`` and the third takes the remainder. The previous
    implementation derived the split points from the quantiles of the index
    *labels*, which gave uneven parts whenever the row count was divisible by
    three (6 rows -> 2/1/3) and wrong parts for any non-default index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; it is not modified.
    dataset_size : str
        Label embedded in the output file names (e.g. ``'50k'``).
    vus : sequence of three values, optional
        Values embedded in the three output file names as ``{value}vu``.

    Returns
    -------
    None
        The files are written to
        ``./datasets/search_stores_params_{dataset_size}_{vu}vu.csv`` without
        the index column.
    """
    num_rows = len(df)
    split_point_1 = num_rows // 3
    split_point_2 = (2 * num_rows) // 3

    # `iloc` makes the positional intent explicit, independent of the index.
    df.iloc[:split_point_1].to_csv(f'./datasets/search_stores_params_{dataset_size}_{vus[0]}vu.csv', index=False)
    df.iloc[split_point_1:split_point_2].to_csv(f'./datasets/search_stores_params_{dataset_size}_{vus[1]}vu.csv', index=False)
    df.iloc[split_point_2:].to_csv(f'./datasets/search_stores_params_{dataset_size}_{vus[2]}vu.csv', index=False)

In [12]:
# Write the three store-search CSV parts for every dataset variant.
stores_load_test_data_to_csv(df=search_stores_params_df_50k, dataset_size='50k')
stores_load_test_data_to_csv(df=search_stores_params_df_100k, dataset_size='100k')
stores_load_test_data_to_csv(df=search_stores_params_df, dataset_size='full')

- We will generate 6,000 latitude-longitude pairs around the centers of the geocoded locations for each of the datasets, together with values for filtering the products by price and calories, to be used in the load testing.

In [13]:
def generate_products_load_test_data(center_lat_long_pairs, num_of_samples, fake=None):
    """Generate randomised query parameters for the product-search load test.

    The centers are visited round-robin. For every visit the function draws a
    latitude and a longitude with ``coordinate(center, radius=0.02)``, a
    maximum distance with ``random_int(min=1, max=5)``, a minimum price with
    ``random_int(min=0, max=3)``, a maximum price with
    ``random_int(min=4, max=7)`` and calorie bounds with
    ``randomize_nb_elements(number=150)`` / ``randomize_nb_elements(number=600)``.

    Parameters
    ----------
    center_lat_long_pairs : sequence of (latitude, longitude) tuples
        Centers around which the coordinates are generated.
    num_of_samples : int
        Exact number of samples to produce.
    fake : optional
        Object exposing ``coordinate``, ``random_int`` and
        ``randomize_nb_elements``. Defaults to the notebook-level ``f``
        instance when omitted.

    Returns
    -------
    tuple of seven lists
        ``(latitudes, longitudes, max_distances, min_prices, max_prices,
        min_calories, max_calories)``, each holding ``num_of_samples`` entries
        (or none when no centers are given).
    """
    if fake is None:
        fake = f  # fall back to the notebook-level Faker instance

    latitudes = []
    longitudes = []
    max_distances = []
    min_prices = []
    max_prices = []
    min_calories = []
    max_calories = []

    # `islice` caps the endless cycle at exactly `num_of_samples` items. The
    # earlier `i == num_of_samples` check ran after appending and therefore
    # produced one sample too many.
    centers = itertools.islice(itertools.cycle(center_lat_long_pairs), num_of_samples)
    for lat, long in centers:
        latitudes.append(fake.coordinate(lat, radius=0.02))
        longitudes.append(fake.coordinate(long, radius=0.02))
        max_distances.append(fake.random_int(min=1, max=5))
        min_prices.append(fake.random_int(min=0, max=3))
        max_prices.append(fake.random_int(min=4, max=7))
        min_calories.append(fake.randomize_nb_elements(number=150))
        max_calories.append(fake.randomize_nb_elements(number=600))

    return (latitudes, longitudes, max_distances,
            min_prices, max_prices, min_calories, max_calories)

In [14]:
# Product-search parameters for the 50k subset.
(latitudes_50k,
 longitudes_50k,
 max_distances_50k,
 min_prices_50k,
 max_prices_50k,
 min_calories_50k,
 max_calories_50k) = generate_products_load_test_data(
    center_lat_long_pairs_50k, num_of_samples=6000)

# Product-search parameters for the 100k subset.
(latitudes_100k,
 longitudes_100k,
 max_distances_100k,
 min_prices_100k,
 max_prices_100k,
 min_calories_100k,
 max_calories_100k) = generate_products_load_test_data(
    center_lat_long_pairs_100k, num_of_samples=6000)

# Product-search parameters for the full dataset.
(latitudes,
 longitudes,
 max_distances,
 min_prices,
 max_prices,
 min_calories,
 max_calories) = generate_products_load_test_data(
    center_lat_long_pairs, num_of_samples=6000)

- We will create a load testing DataFrame for each of the datasets.

In [15]:
# One row per product-search request against the 50k subset.
search_products_params_df_50k = pd.DataFrame(
    data={
        'latitude': latitudes_50k,
        'longitude': longitudes_50k,
        'max_distance': max_distances_50k,
        'min_price': min_prices_50k,
        'max_price': max_prices_50k,
        'min_calories': min_calories_50k,
        'max_calories': max_calories_50k,
    },
)

In [16]:
# One row per product-search request against the 100k subset.
search_products_params_df_100k = pd.DataFrame(
    data={
        'latitude': latitudes_100k,
        'longitude': longitudes_100k,
        'max_distance': max_distances_100k,
        'min_price': min_prices_100k,
        'max_price': max_prices_100k,
        'min_calories': min_calories_100k,
        'max_calories': max_calories_100k,
    },
)

In [17]:
# One row per product-search request against the full dataset.
search_products_params_df = pd.DataFrame(
    data={
        'latitude': latitudes,
        'longitude': longitudes,
        'max_distance': max_distances,
        'min_price': min_prices,
        'max_price': max_prices,
        'min_calories': min_calories,
        'max_calories': max_calories,
    },
)

- We will split each of `search_products_params_df_50k`, `search_products_params_df_100k` & `search_products_params_df` into 3 parts and save each part as a CSV file.

In [18]:
def products_load_test_data_to_csv(df, dataset_size, vus=(25, 50, 100)):
    """Split the product-search parameters into three parts and write each to CSV.

    The split is positional and based on the row count: the first part holds
    the first ``len(df) // 3`` rows, the second runs up to row
    ``2 * len(df) // 3`` and the third takes the remainder. The previous
    implementation derived the split points from the quantiles of the index
    *labels*, which gave uneven parts whenever the row count was divisible by
    three (6 rows -> 2/1/3) and wrong parts for any non-default index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; it is not modified.
    dataset_size : str
        Label embedded in the output file names (e.g. ``'50k'``).
    vus : sequence of three values, optional
        Values embedded in the three output file names as ``{value}vu``.

    Returns
    -------
    None
        The files are written to
        ``./datasets/search_products_params_{dataset_size}_{vu}vu.csv`` without
        the index column.
    """
    num_rows = len(df)
    split_point_1 = num_rows // 3
    split_point_2 = (2 * num_rows) // 3

    # `iloc` makes the positional intent explicit, independent of the index.
    df.iloc[:split_point_1].to_csv(f'./datasets/search_products_params_{dataset_size}_{vus[0]}vu.csv', index=False)
    df.iloc[split_point_1:split_point_2].to_csv(f'./datasets/search_products_params_{dataset_size}_{vus[1]}vu.csv', index=False)
    df.iloc[split_point_2:].to_csv(f'./datasets/search_products_params_{dataset_size}_{vus[2]}vu.csv', index=False)

In [19]:
# Write the three product-search CSV parts for every dataset variant.
products_load_test_data_to_csv(df=search_products_params_df_50k, dataset_size='50k')
products_load_test_data_to_csv(df=search_products_params_df_100k, dataset_size='100k')
products_load_test_data_to_csv(df=search_products_params_df, dataset_size='full')