In [20]:
cities = ['Barcelona',
 'Paris',
 'Rome',
 'Amsterdam',
 'Munich',
 'London',
 'Prague',
 'Madrid',
 'Florence',
 'Vienna',
 'Berlin',
 'Lisbon',
 'Budapest',
 'Brussels',
 'Dublin',
 'Zurich',
 'Milan']

In [11]:
len(cities)

17

In [12]:
import pandas as pd
import numpy as n

In [13]:
df = pd.read_csv("../data/airport-codes.csv")
df = df[df.type.isin(['medium_airport', 'large_airport'])]
eu_df = df[df['continent'] == 'EU'][['iata_code', 'name',"coordinates"]]
eu_df.reset_index(inplace=True)
eu_df

Unnamed: 0,index,iata_code,name,coordinates
0,13631,JAM,Bezmer Air Base,"42.4548988342, 26.3521995544"
1,13718,AEY,Akureyri Airport,"65.66000366210938, -18.07270050048828"
2,13729,EGS,Egilsstaðir Airport,"65.2833023071289, -14.401399612426758"
3,13745,HFN,Hornafjörður Airport,"64.295601, -15.2272"
4,13749,HZK,Húsavík Airport,"65.952301, -17.426001"
...,...,...,...,...
1029,73020,,Soltsy-2 Air Base,"58.139545, 30.33042"
1030,73022,,Severomorsk-3 Naval Air Base,"68.866669, 33.716667"
1031,73024,,Fedotovo Naval Air Base,"59.190372, 39.123459"
1032,73132,LNX,Smolensk North Airport,"54.824, 32.025"


In [15]:
geo = {}

for city in cities:
    match = eu_df[eu_df['name'].str.contains(city, case=False, na=False)]
    if not match.empty:
        geo[city] = match['coordinates'].iloc[0]

geo
    

{'Barcelona': '41.2971, 2.07846',
 'Paris': '49.454399, 2.11278',
 'Rome': '53.745098, -2.88306',
 'Amsterdam': '52.308601, 4.76389',
 'Munich': '48.353802, 11.7861',
 'London': '51.874698638916016, -0.36833301186561584',
 'Prague': '50.1008, 14.26',
 'Madrid': '40.471926, -3.56264',
 'Vienna': '48.110298, 16.5697',
 'Berlin': '52.362247, 13.500672',
 'Lisbon': '38.7813, -9.13592',
 'Budapest': '47.42976, 19.261093',
 'Brussels': '50.901402, 4.48444',
 'Dublin': '53.428713, -6.262121',
 'Milan': '45.673901, 9.70417'}

In [34]:
def convert_city_coordinates(cities):
    formatted_cities = {}
    for city, coords in cities.items():
        lat, lng = map(float, coords.split(','))
        formatted_cities[city] = {"label": city, "value": {"lat": lat, "lng": lng}}
    return formatted_cities

# Convert and store the formatted city data
formatted_city_data = convert_city_coordinates(geo)
formatted_city_data

{'Barcelona': {'label': 'Barcelona',
  'value': {'lat': 41.2971, 'lng': 2.07846}},
 'Paris': {'label': 'Paris', 'value': {'lat': 49.454399, 'lng': 2.11278}},
 'Rome': {'label': 'Rome', 'value': {'lat': 53.745098, 'lng': -2.88306}},
 'Amsterdam': {'label': 'Amsterdam',
  'value': {'lat': 52.308601, 'lng': 4.76389}},
 'Munich': {'label': 'Munich', 'value': {'lat': 48.353802, 'lng': 11.7861}},
 'London': {'label': 'London',
  'value': {'lat': 51.874698638916016, 'lng': -0.36833301186561584}},
 'Prague': {'label': 'Prague', 'value': {'lat': 50.1008, 'lng': 14.26}},
 'Madrid': {'label': 'Madrid', 'value': {'lat': 40.471926, 'lng': -3.56264}},
 'Vienna': {'label': 'Vienna', 'value': {'lat': 48.110298, 'lng': 16.5697}},
 'Berlin': {'label': 'Berlin', 'value': {'lat': 52.362247, 'lng': 13.500672}},
 'Lisbon': {'label': 'Lisbon', 'value': {'lat': 38.7813, 'lng': -9.13592}},
 'Budapest': {'label': 'Budapest',
  'value': {'lat': 47.42976, 'lng': 19.261093}},
 'Brussels': {'label': 'Brussels',
  '

In [21]:
import wikipedia
from transformers import pipeline

embedder = pipeline('feature-extraction', model='sentence-transformers/all-MiniLM-L6-v2')

def get_wikipedia_intro(city_name):
    """
    Fetch the introductory paragraph of a city from Wikipedia.
    """
    try:
        # Get the summary (introductory text) of the page
        summary = wikipedia.summary(city_name, sentences=5)  # Adjust the number of sentences as needed
        return summary
    except wikipedia.exceptions.PageError:
        print(f"No page found for {city_name}")
        return None
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Disambiguation error for {city_name}, taking first option: {e.options[0]}")
        return wikipedia.summary(e.options[0], sentences=5)

def get_embeddings(text):
    """
    Generate embeddings for the given text using a pre-trained model.
    """
    if text:
        # Extract the embeddings
        embeddings = embedder(text)
        # embeddings[0] because the pipeline outputs a list with one element per input
        return embeddings[0][0]  # Return the first vector for the first sentence
    return None


city_embeddings = {}

for city in cities:
    intro_text = get_wikipedia_intro(city)
    print(intro_text)
    if intro_text:
        city_embeddings[city] = get_embeddings(intro_text)



Barcelona (  BAR-sə-LOH-nə, Catalan: [bəɾsəˈlonə] , Spanish: [baɾθeˈlona] ) is a city on the northeastern coast of Spain. It is the capital and largest city of the autonomous community of Catalonia, as well as the second-most populous municipality of Spain. With a population of 1.6 million within city limits, its urban area extends to numerous neighbouring municipalities within the province of Barcelona and is home to around 4.8 million people, making it the fifth most populous urban area in the European Union after Paris, the Ruhr area, Madrid and Milan. It is one of the largest metropolises on the Mediterranean Sea, located on the coast between the mouths of the rivers Llobregat and Besòs, bounded to the west by the Serra de Collserola mountain range.
Founded as a Roman city, in the Middle Ages Barcelona became the capital of the County of Barcelona.
Paris is the capital and most populous city of France. With an official estimated population of 2,102,650 residents as of 1 January 202



  lis = BeautifulSoup(html).find_all('li')


Disambiguation error for Florence, taking first option: Flounce (fabric)
In sewing and dressmaking, a ruffle, frill, or furbelow is a strip of fabric, lace or ribbon tightly gathered or pleated on one edge and applied to a garment, bedding, or other textile as a form of trimming.
Ruffles can be made from a single layer of fabric (which may need a hem) or a doubled layer. Plain ruffles are usually cut on the straight grain.
Ruffles may be gathered by using a gathering stitch, or by passing the fabric through a mechanical ruffler, which is an attachment available for some sewing machines.
A flounce is a particular type of fabric manipulation that creates a similar look but with less bulk.
Vietnam,  officially the Socialist Republic of Viet Nam (SRV), is a country at the eastern edge of mainland Southeast Asia, with an area of about 331,000 square kilometres (128,000 sq mi) and a population of over 100 million, making it the world's fifteenth-most populous country. Vietnam shares land bor

In [17]:
!pip install wikipedia

Defaulting to user installation because normal site-packages is not writeable
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25ldone
[?25h  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11680 sha256=fdbb30dcafb899ebc28b1ec2953b0b9cb7256966c1171479097cd8c0f0b6d289
  Stored in directory: /home/ubuntu/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [22]:
city_embeddings

{'Barcelona': [0.17555880546569824,
  0.23884736001491547,
  0.26594510674476624,
  0.10994534194469452,
  -0.2602458894252777,
  -0.12218531966209412,
  -0.005205793771892786,
  -0.1291397362947464,
  -0.25531670451164246,
  -0.2622792720794678,
  -0.1013539731502533,
  -0.2548556625843048,
  -0.026747720316052437,
  -0.048426996916532516,
  -0.09531357139348984,
  -0.020018095150589943,
  -0.24401503801345825,
  -0.3054732084274292,
  0.11202769726514816,
  0.10053955018520355,
  0.020671483129262924,
  -0.0782637894153595,
  -0.29278820753097534,
  -0.03510715067386627,
  -0.22521758079528809,
  -0.06129991263151169,
  0.06887275725603104,
  0.02174471504986286,
  -0.0068862950429320335,
  -0.8090458512306213,
  -0.0502358078956604,
  -0.1675177812576294,
  0.39161446690559387,
  -0.1617032289505005,
  -0.12176281958818436,
  -0.15518620610237122,
  0.037837184965610504,
  0.2581728994846344,
  -0.11254798620939255,
  0.3750268220901489,
  -0.005941856186836958,
  0.0272107701748609

In [23]:
interests = ["hiking","architecture","opera","beach","skiing","history","clubbing","football","concerts","pizza","beer"]

In [26]:
interest_embeddings = {}

interest_embedder = pipeline('feature-extraction', model='distilbert-base-uncased')

def get_interest_embeddings(text):
    embeddings = interest_embedder(text)
    # The output is a list of lists, one for each token in the text
    # We take the mean to get a single vector for the text
    return [sum(col) / len(col) for col in zip(*embeddings[0])]

for interest in interests:
    interest_embeddings[interest] = get_interest_embeddings(interest)
    
interest_embeddings

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

{'hiking': [-7.209173361460368,
  -7.350905100504558,
  -7.270342191060384,
  -7.347108523050944,
  -7.351463635762532,
  -7.22862450281779,
  -7.448452949523926,
  -7.308438618977864,
  -7.296233495076497,
  -7.276466369628906,
  -7.22110382715861,
  -7.360222816467285,
  -7.084738890329997,
  -7.269523620605469,
  -7.356494426727295,
  -7.386868476867676,
  -7.251752853393555,
  -7.417946179707845,
  -7.1552886962890625,
  -7.228587468465169,
  -7.2339348793029785,
  -7.367562452952067,
  -7.215031147003174,
  -7.303136666615804,
  -7.2494150797526045,
  -7.320038000742595,
  -7.207589626312256,
  -7.212511698404948,
  -7.203223546346028,
  -7.35700782140096,
  -7.1683400472005205,
  -7.186025460561116,
  -7.2957695325215655,
  -7.4035922686258955,
  -7.16868257522583,
  -7.149410088857015,
  -7.361929257710774,
  -7.418829917907715,
  -7.33476193745931,
  -7.284185886383057,
  -7.466246445973714,
  -7.358289082845052,
  -7.277089595794678,
  -7.196466445922852,
  -7.164074420928955,

In [31]:
def get_embeddings(text, embedder):
    """
    Generate embeddings for the given text using the specified embedder and average across all tokens.
    """
    embeddings = embedder(text)
    # The output is a list of lists, one for each token in the text
    # We take the mean to get a single vector for the text
    return [sum(col) / len(col) for col in zip(*embeddings[0])]

# Using the same model for both for consistency
common_embedder = pipeline('feature-extraction', model='sentence-transformers/all-MiniLM-L6-v2')

# Generating embeddings for cities
city_embeddings = {}
for city in cities:
    intro_text = get_wikipedia_intro(city)
    if intro_text:
        city_embeddings[city] = get_embeddings(intro_text, common_embedder)

# Generating embeddings for interests
interest_embeddings = {}
for interest in interests:
    interest_embeddings[interest] = get_embeddings(interest, common_embedder)




  lis = BeautifulSoup(html).find_all('li')


Disambiguation error for Florence, taking first option: Flounce (fabric)


In [32]:
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(vec1, vec2):
    """
    Calculate the cosine similarity between two vectors.
    Args:
    vec1 (list): First vector.
    vec2 (list): Second vector.

    Returns:
    float: Cosine similarity between the two vectors.
    """
    return dot(vec1, vec2) / (norm(vec1) * norm(vec2))

def find_similarities(cities, interests, city_embeddings, interest_embeddings):
    similarities = {}
    for city in cities:
        for interest in interests:
            sim_score = cosine_similarity(city_embeddings[city], interest_embeddings[interest])
            similarities[(city, interest)] = sim_score
    return similarities

similarities = find_similarities(city_embeddings.keys(), interest_embeddings.keys(), city_embeddings, interest_embeddings)

for pair, sim in similarities.items():
    print(f"Similarity between {pair[0]} and {pair[1]}: {sim:.2f}")

Similarity between Barcelona and hiking: -0.07
Similarity between Barcelona and architecture: 0.10
Similarity between Barcelona and opera: 0.08
Similarity between Barcelona and beach: 0.22
Similarity between Barcelona and skiing: -0.03
Similarity between Barcelona and history: -0.03
Similarity between Barcelona and clubbing: 0.11
Similarity between Barcelona and football: 0.04
Similarity between Barcelona and concerts: 0.01
Similarity between Barcelona and pizza: 0.12
Similarity between Barcelona and beer: -0.04
Similarity between Paris and hiking: -0.02
Similarity between Paris and architecture: 0.09
Similarity between Paris and opera: 0.08
Similarity between Paris and beach: 0.14
Similarity between Paris and skiing: -0.05
Similarity between Paris and history: -0.01
Similarity between Paris and clubbing: -0.07
Similarity between Paris and football: 0.02
Similarity between Paris and concerts: 0.06
Similarity between Paris and pizza: 0.02
Similarity between Paris and beer: -0.01
Similar

In [33]:
print(len(city_embeddings['Barcelona']))  
print(len(interest_embeddings['beer']))

384
384


In [35]:
similarities

{('Barcelona', 'hiking'): -0.07356697701118468,
 ('Barcelona', 'architecture'): 0.10132157119581477,
 ('Barcelona', 'opera'): 0.0750066973931789,
 ('Barcelona', 'beach'): 0.22013704457133224,
 ('Barcelona', 'skiing'): -0.02694621132750901,
 ('Barcelona', 'history'): -0.027294328159019297,
 ('Barcelona', 'clubbing'): 0.11107487504935376,
 ('Barcelona', 'football'): 0.042780815338316375,
 ('Barcelona', 'concerts'): 0.010298471553267753,
 ('Barcelona', 'pizza'): 0.12080471090685345,
 ('Barcelona', 'beer'): -0.0434480307824868,
 ('Paris', 'hiking'): -0.021730465969136047,
 ('Paris', 'architecture'): 0.09486664462158673,
 ('Paris', 'opera'): 0.07902341608888479,
 ('Paris', 'beach'): 0.14146735937106836,
 ('Paris', 'skiing'): -0.04848326415004869,
 ('Paris', 'history'): -0.01041568185702632,
 ('Paris', 'clubbing'): -0.06741052443472036,
 ('Paris', 'football'): 0.021670761732746967,
 ('Paris', 'concerts'): 0.06398347958080762,
 ('Paris', 'pizza'): 0.022419092276659416,
 ('Paris', 'beer'): -0.