# GeoGuessr Artificial Intelligence Model 🌏🌎🌍

In [None]:
# imports
import numpy as np
import cv2 as cv 
import os
import matplotlib.pyplot as plt
import pandas as pd
from shapely.geometry import Point, Polygon
import folium

# globals
dataset_rel_path = "./data/geotagged_kaggle/streetviews/"
STANDARD_IMAGE_SIZE = 256
europe_bbox = {
    'lat_min': 36.03,
    'lat_max': 71.13,
    'long_min': -10.72,
    'long_max': 41.31
}

## Data Loading 📷

The used dataset is based on Google Streetview captures tagged with the latitude and longitude (geoTagged). The images are organized into folders based on the country they are taken from (the name of the folder is the country code).

This model works with the countries from Europe (from the [UN's point of view](https://www.worldometers.info/geography/how-many-countries-in-europe/)). Excluded Russia as most of its surface is not in Europe

In [None]:
europe_countries_code_list = ["AL", "AD", "AT", "BY", "BE", "BA", "BG", "HR", "CZ", "DK", "EE", "FI", "FR", "DE", "GR", "VA", "HU", "IS", "IE", "IT", "LV", "LI", "LT", "LU", "MT", "MD", "MC", "ME", "NL", "MK", "NO", "PL", "PT", "RO", "SM", "RS", "SK", "SI", "ES", "SE", "CH", "UA", "GB"]

### Data Insights
Let's take a look at the number of images in each folder.

In [None]:
country_nr_images = {}

def count_files_in_dir(dir_path):
    return len(os.listdir(dir_path))

DATASET_ROOT_PATH = os.path.abspath(dataset_rel_path)

for country_folder in os.listdir(DATASET_ROOT_PATH):
    if (country_folder in europe_countries_code_list):
        country_nr_images[country_folder] = len(os.listdir(os.path.join(DATASET_ROOT_PATH, country_folder)))

plt.bar(country_nr_images.keys(), country_nr_images.values())
plt.xlabel("Country")
plt.xticks(rotation=90)
plt.ylabel("Nr of Images")
plt.show()

print("European countries that are not in the dataset: ")
print([elem for elem in list(country_nr_images.keys()) + europe_countries_code_list if elem not in country_nr_images.keys()])

Let's also plot the report of number of images over the total country surface to observe any imbalances.

In [None]:
european_country_surface_areas = {
    'AL': 28748,  # Albania
    'AD': 468,    # Andorra
    'AT': 83879,  # Austria
    'BY': 207600, # Belarus
    'BE': 30528,  # Belgium
    'BA': 51197,  # Bosnia and Herzegovina
    'BG': 110879, # Bulgaria
    'HR': 56594,  # Croatia
    'CY': 9251,   # Cyprus
    'CZ': 78865,  # Czech Republic
    'DK': 42924,  # Denmark
    'EE': 45227,  # Estonia
    'FI': 338424, # Finland
    'FR': 551695, # France
    'DE': 357022, # Germany
    'GR': 131957, # Greece
    'HU': 93030,  # Hungary
    'IS': 103000, # Iceland
    'IE': 70273,  # Ireland
    'IT': 301340, # Italy
    'LV': 64589,  # Latvia
    'LI': 160,    # Liechtenstein
    'LT': 65300,  # Lithuania
    'LU': 2586,   # Luxembourg
    'MK': 25713,  # North Macedonia
    'MT': 316,    # Malta
    'MD': 33843,  # Moldova
    'MC': 2,      # Monaco
    'ME': 13812,  # Montenegro
    'NL': 41543,  # Netherlands
    'NO': 1487290,# Norway
    'PL': 312696, # Poland
    'PT': 92212,  # Portugal
    'RO': 238397, # Romania
    'RU': 17098242,# Russia
    'SM': 61,     # San Marino
    'RS': 77474,  # Serbia
    'SK': 49037,  # Slovakia
    'SI': 20273,  # Slovenia
    'ES': 505992, # Spain
    'SE': 450295, # Sweden
    'CH': 41284,  # Switzerland
    'UA': 603500, # Ukraine
    'GB': 243610, # United Kingdom
    'VA': 0.44,   # Vatican City
}

filtered_european_country_surface_areas = {key: european_country_surface_areas[key] for key in country_nr_images.keys()}

for country_code in country_nr_images.keys():
    filtered_european_country_surface_areas[country_code] = country_nr_images[country_code] / filtered_european_country_surface_areas[country_code] * 100

plt.bar(country_nr_images.keys(), filtered_european_country_surface_areas.values())
plt.xlabel("Country")
plt.xticks(rotation=90)
plt.ylabel("Nr of Images")
plt.show()

Let's visualize the geographic distribution of images

In [28]:
coordinates_data = {
    'path': [],
    'long': [],
    'lat': []
}
for country_folder in os.listdir(DATASET_ROOT_PATH):
    if (country_folder in europe_countries_code_list):
        curr_path = os.path.join(DATASET_ROOT_PATH, country_folder)
        for filename in os.listdir(curr_path):
            curr_lat, curr_long = filename[: -4].split(',')
            coordinates_data['path'].append(os.path.join(curr_path, filename))
            coordinates_data['lat'].append(float(curr_lat))
            coordinates_data['long'].append(float(curr_long))
all_countries_df = pd.DataFrame(coordinates_data)

In [None]:
my_map = None
# map_center = [45, 15]
# my_map = folium.Map(location=map_center, zoom_start=5)


# # Add markers for each coordinate
# for coordinates in list(zip(all_countries_df.lat, all_countries_df.long)):
#     folium.CircleMarker(location=(coordinates[0], coordinates[1]), radius=1).add_to(my_map)

# # Save the map as an HTML file
# my_map.save("map_with_markers.html")
# my_map

> Questions to solve
> - how to treat areas with no data
> - how to create the areas that determine the prediction
>     - how to ensure it's only land (check geopandas) - not so much of a problem though, as long as the square is not only on sea

### Shuffling and splitting into Test, Validation, and Training sets

> Each folder contents will be split into training, validation and test datasets having the percentage 80%, 10%, 10% respectively.

### Storing the training data in memory
#### Data preprocessing
Preprocessing is strongly tied to the choice of the model used (CNN / Vision transformer / Deep neural networks), therefore it may be adapted later in the implementation

Let us first prepare the input data for being fed into a Convolutional Neural Network. This type of network expects a 4D Tensor. For a colored image, this tensor would have the shape
```
[BATCH_SIZE, IMAGE_HEIGHT, IMAGE_WIDTH, NUMBER_OF_CHANNELS]
```

The images will be preprocessed as follows:
- resize to the STANDARD_SIZE in this case 256 pixels for an edge (square shape)
- normalize the image

In [None]:
# OpenCV understanding on how images are processed

example_img = cv.imread(os.path.join(dataset_rel_path, "CH/45.72656,6.52431.jpg"))
print(type(example_img))
print(np.shape(example_img))

# resize
resized = cv.resize(example_img, (STANDARD_IMAGE_SIZE, STANDARD_IMAGE_SIZE), interpolation=cv.INTER_NEAREST)
print(np.shape(resized))
print(resized[0][0]) # print the color channels B G R

# cv.imshow("original", example_img)
# cv.imshow("resized", resized)
# cv.waitKey(0)

#### Image Labeling, creating the csv dataset, splitting into test, train, validation sets
We need to get from the current state of the data to the data suitable for training, validation and testing.
- *current* state: jpg images aranged into folders and labeled with their latitude and longitude values 
- *desired* state: individual examples consisting of numpy arrays of the required shape, populated with the RGB channels values, and labeled with an integer corresponding to the area on the map where they belong.

Data Structures for the Images:
- x_train, x_validate, x_test: arrays containing the pixel data of the images (will be fed into the neural network)
- y_train, y_validate, y_test: arrays containing the label of each example.

> Note: The label and the example need to be at the same index in the data structures that hold them. Therefore, they need to be shuffled together

##### Obtaining the output labels ⏹
In this method the map of europe will be divided into square shapes, therefore the label of an image is the index of the square into the ordered list of squares. The size of the square is determined by the number of training examples in that square. Therefore if it exceeds the `THRESHOLD` it will be split into 4 (its size is reduced). Obtaining these squares is a subproblem on its own and will be solved in the following part.

In [29]:
INITIAL_SQUARE_EDGE_SIZE = 10
POINTS_IN_SQUARE_TH = 300

all_countries_labeled = []

def isInBbox(point_coords, bottom_left, size):
    if (point_coords[0] > bottom_left[0] and point_coords[0] < bottom_left[0] + size):
        if (point_coords[1] > bottom_left[1] and point_coords[1] < bottom_left[1] + size):
            return True
    return False

def updateLabelForElement(countries_labeled, new_point):
    for index, element in enumerate(countries_labeled):
        if element['path'] == new_point['path']:
            countries_labeled[index]['label'] = new_point['label']
            break
    return countries_labeled

def getSquaresFromArea(countries_labeled, bottom_left, size, points_df, len_squares):
    squares_list = []
    squares_list_len = 0

    bottom_left_x = bottom_left[0]
    while bottom_left_x < (bottom_left[0] + 2 * size):
        bottom_left_y = bottom_left[1]
        while bottom_left_y < (bottom_left[1] + 2 * size):
            count_points = 0
            curr_points_df = []    
            for index, point in all_countries_df.iterrows():
                if isInBbox((point['long'], point['lat']), (bottom_left_x, bottom_left_y), size):
                    count_points += 1
                    curr_points_df.append(point)
                    new_point = {
                        'path': point['path'],
                        'label': len_squares + squares_list_len
                    }
                    countries_labeled = updateLabelForElement(countries_labeled, new_point)

            if count_points >= POINTS_IN_SQUARE_TH:
                # break up the square even more
                print('will break up square ', bottom_left_x, bottom_left_y, size, count_points)
                countries_labeled, squares_list_from_area = (getSquaresFromArea(countries_labeled, (bottom_left_x, bottom_left_y), size / 2, points_df, len_squares + squares_list_len))
                squares_list_len += len(squares_list_from_area)
                squares_list += squares_list_from_area

            elif count_points > 0:
                squares_list.append((bottom_left_x, bottom_left_y, size))
                squares_list_len += 1
            bottom_left_y += size
        bottom_left_x += size
    return countries_labeled, squares_list



squares = []
squares_len = 0

bottom_left_x = europe_bbox['long_min']
while(bottom_left_x < europe_bbox['long_max']):
    bottom_left_y = europe_bbox['lat_min']

    while (bottom_left_y < europe_bbox['lat_max']):
        count_points = 0
        curr_points_df = []
        
        for index, point in all_countries_df.iterrows():
            if isInBbox((point['long'], point['lat']), (bottom_left_x, bottom_left_y), INITIAL_SQUARE_EDGE_SIZE):
                curr_points_df.append(point)
                count_points += 1
                point['label'] = squares_len
                all_countries_labeled.append(point)

        if count_points >= POINTS_IN_SQUARE_TH:
            # break the square even more
            print('will break up square', bottom_left_x, bottom_left_y, INITIAL_SQUARE_EDGE_SIZE, count_points)
            all_countries_labeled, squares_in_area = getSquaresFromArea(all_countries_labeled, (bottom_left_x, bottom_left_y), INITIAL_SQUARE_EDGE_SIZE / 2, pd.DataFrame(curr_points_df), squares_len)
            squares_len += len(squares_in_area)
            squares += squares_in_area

        elif (count_points > 0):
            squares.append((bottom_left_x, bottom_left_y, INITIAL_SQUARE_EDGE_SIZE))
            squares_len += 1
        
        bottom_left_y += INITIAL_SQUARE_EDGE_SIZE
    
    bottom_left_x += INITIAL_SQUARE_EDGE_SIZE
       
labeled_examples_df = pd.DataFrame(all_countries_labeled) # {path to image, label}

will break up square -10.72 36.03 10 744
will break up square  -10.72 36.03 5.0 325
will break up square  -10.72 41.03 5.0 304
will break up square -10.72 46.03 10 678
will break up square  -5.720000000000001 51.03 5.0 338
will break up square -0.7200000000000006 36.03 10 542
will break up square  4.279999999999999 41.03 5.0 319
will break up square -0.7200000000000006 46.03 10 1493
will break up square  -0.7200000000000006 46.03 5.0 384
will break up square  1.7799999999999994 48.53 2.5 324
will break up square  4.279999999999999 46.03 5.0 506
will break up square  4.279999999999999 51.03 5.0 332
will break up square 9.28 36.03 10 383
will break up square 9.28 46.03 10 1384
will break up square  14.28 46.03 5.0 874
will break up square  16.78 48.53 2.5 462
will break up square 19.28 36.03 10 323


In [33]:
print(squares)
labeled_examples_df

[(-10.72, 36.03, 2.5), (-10.72, 38.53, 2.5), (-8.22, 36.03, 2.5), (-8.22, 38.53, 2.5), (-10.72, 41.03, 2.5), (-8.22, 41.03, 2.5), (-8.22, 43.53, 2.5), (-5.720000000000001, 36.03, 5.0), (-5.720000000000001, 41.03, 5.0), (-10.72, 51.03, 5.0), (-5.720000000000001, 46.03, 5.0), (-5.720000000000001, 51.03, 2.5), (-5.720000000000001, 53.53, 2.5), (-3.2200000000000006, 51.03, 2.5), (-3.2200000000000006, 53.53, 2.5), (-10.72, 56.03, 10), (-0.7200000000000006, 36.03, 5.0), (-0.7200000000000006, 41.03, 5.0), (4.279999999999999, 36.03, 5.0), (4.279999999999999, 41.03, 2.5), (4.279999999999999, 43.53, 2.5), (6.779999999999999, 41.03, 2.5), (6.779999999999999, 43.53, 2.5), (-0.7200000000000006, 46.03, 2.5), (-0.7200000000000006, 48.53, 2.5), (1.7799999999999994, 46.03, 2.5), (1.7799999999999994, 48.53, 1.25), (1.7799999999999994, 49.78, 1.25), (3.0299999999999994, 48.53, 1.25), (3.0299999999999994, 49.78, 1.25), (-0.7200000000000006, 51.03, 5.0), (4.279999999999999, 46.03, 2.5), (4.279999999999999,

Unnamed: 0,path,long,lat,label
1681,c:\Facultate\An_4\An_4_Sem_1\Pattern_Recogniti...,-3.82138,36.75474,7
1682,c:\Facultate\An_4\An_4_Sem_1\Pattern_Recogniti...,-4.03624,36.75578,7
1683,c:\Facultate\An_4\An_4_Sem_1\Pattern_Recogniti...,-3.99454,36.82382,7
1684,c:\Facultate\An_4\An_4_Sem_1\Pattern_Recogniti...,-5.80706,36.86894,2
1685,c:\Facultate\An_4\An_4_Sem_1\Pattern_Recogniti...,-5.93936,36.97951,2
...,...,...,...,...
6434,c:\Facultate\An_4\An_4_Sem_1\Pattern_Recogniti...,33.59659,51.83715,58
6435,c:\Facultate\An_4\An_4_Sem_1\Pattern_Recogniti...,33.49617,51.85592,58
6436,c:\Facultate\An_4\An_4_Sem_1\Pattern_Recogniti...,33.46536,51.85798,58
6437,c:\Facultate\An_4\An_4_Sem_1\Pattern_Recogniti...,33.46921,51.86084,58


In [32]:
map_center = [45, 15]
my_map = folium.Map(location=map_center, zoom_start=5)

for index, square in enumerate(squares):
    folium.Rectangle(
        bounds=[[square[1], square[0]], [square[1] + square[2], square[0] + square[2]]], # long lat (y x)
        fill=False,
        color='orange',
        fill_color='orange',
        fill_opacity=0.1,
        popup=f'{index}'
    ).add_to(my_map)

# Add markers for each coordinate
for coordinates in list(zip(labeled_examples_df.lat, labeled_examples_df.long, labeled_examples_df.label)):
    folium.CircleMarker(location=(coordinates[0], coordinates[1]), radius=2, popup=coordinates[2]).add_to(my_map)


my_map.save('map_with_grid.html') # squares appear as rectangles due to map projection

(640, 640, 3)


IndexError: index 0 is out of bounds for axis 0 with size 0

#### Dataset Augmentation
The existing images will be added variations including shifted, rotated, zoomed-in/out images, brightened/darkened