# Unsupervised Track Clustering

## Imports

In [43]:
import pandas as pd
from bs4 import BeautifulSoup
import nltk
from nltk.metrics import edit_distance

import numpy as np
import requests
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sarahamiraslani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Data Sourcing

1. Fetch data from Wikipedia. 

In [2]:
# Fetch and parse the webpage content
url = "https://en.wikipedia.org/wiki/List_of_Formula_One_circuits"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# Locate the table with the specified caption
caption = soup.find("caption", text="Formula One circuits\n")
table = caption.find_parent("table")

# Extract the table data
headers = [header.text.strip() for header in table.find_all("th")]
rows = []
for row in table.find_all("tr")[1:]:  # Skip the header row
    cells = row.find_all(["td", "th"])
    row_data = [cell.text.strip() for cell in cells]
    rows.append(row_data)

# Convert the data into a pandas DataFrame
wiki_circuits = pd.DataFrame(rows, columns=headers).drop(columns=["Map"])
print("Wikipedia Circuits")
wiki_circuits.head()

Wikipedia Circuits


Unnamed: 0,Circuit,Type,Direction,Location,Country,Last length used,Turns,Grands Prix,Season(s),Grands Prix held
0,Adelaide Street Circuit,Street circuit,Clockwise,Adelaide,Australia,3.780 km (2.349 mi),16,Australian Grand Prix,1985–1995,11
1,Ain-Diab Circuit,Road circuit,Clockwise,Casablanca,Morocco,7.618 km (4.734 mi),18,Moroccan Grand Prix,1958,1
2,Aintree Motor Racing Circuit,Road circuit,Clockwise,Aintree,United Kingdom,4.828 km (3.000 mi),12,British Grand Prix,"1955, 1957, 1959, 1961–1962",5
3,Albert Park Circuit *,Street circuit,Clockwise,Melbourne,Australia,5.278 km (3.280 mi),16,Australian Grand Prix,"1996–2019, 2022–2024",27
4,Algarve International Circuit,Race circuit,Clockwise,Portimão,Portugal,4.653 km (2.891 mi),15,Portuguese Grand Prix,2020–2021,2


2. Pull data from Ergast API (pre-loaded from Kaggle)

In [3]:
ergast_circuits = pd.read_csv("../data/raw/circuits.csv")
print("Ergast API Circuits")
ergast_circuits.head()

Ergast API Circuits


Unnamed: 0,circuitId,circuitRef,name,location,country,lat,lng,alt,url
0,1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,http://en.wikipedia.org/wiki/Melbourne_Grand_P...
1,2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,http://en.wikipedia.org/wiki/Sepang_Internatio...
2,3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,http://en.wikipedia.org/wiki/Bahrain_Internati...
3,4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,http://en.wikipedia.org/wiki/Circuit_de_Barcel...
4,5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,http://en.wikipedia.org/wiki/Istanbul_Park


1. Match track names from the Wikipedia table using text similarity metrics.

- The Levenshtein distance is a text similarity measure that compares two words and returns a numeric value representing the distance between them. The distance reflects the total number of single-character edits required to transform one word into another.

- Jaccard Similarity 
- 
- Cosine similarity

In [54]:
# === Data Prep ===

# Combine the values from both columns
combined_circuits = list(ergast_circuits["name"].values) + list(
    wiki_circuits["Circuit"].values
)

tokenized_corpus = [
    nltk.word_tokenize(sentence.lower()) for sentence in combined_circuits
]
all_words = [word for sentence in tokenized_corpus for word in sentence]
word_freq = Counter(all_words)

num_common_words = 5
common_words = {word for word, count in word_freq.items() if count >= num_common_words}
print(f"Common words: {common_words}")


def remove_common_words(text, common_words):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in common_words]
    return " ".join(filtered_words)


def modify_name(name, common_words):
    return (
        name.str.lower()
        .str.strip()
        .apply(lambda x: remove_common_words(x, common_words))
        # hard-coded removal of common words based on domain knowledge
        .str.replace("grand", "")
        .str.replace("prix", "")
        .str.replace("internacional", "")
        .str.replace("internazionale", "")
        .str.replace("international", "")
        .str.strip()
    )

# Apply the function to both DataFrames
ergast_circuits["modified_name"] = modify_name(ergast_circuits["name"], common_words)
wiki_circuits["modified_name"] = modify_name(wiki_circuits["Circuit"], common_words)

Common words: {'circuito', 'autodromo', 'international', 'street', 'de', 'park', 'raceway', 'autódromo', 'circuit', '*'}


Levenstein

In [55]:
def find_lev_best_match(name, candidates):
    """Function to find the best match based on edit distance"""
    distances = candidates.apply(lambda x: edit_distance(name, x))
    return candidates.loc[distances.idxmin()], distances.min()

# Example applied to Albert Park
print(find_lev_best_match("albert", ergast_circuits["modified_name"]))

# Apply the levenstein function to the entire DataFrame
ergast_circuits[["best_wiki_lev_match", "lev_distance"]] = ergast_circuits[
    "modified_name"
].apply(lambda x: pd.Series(find_lev_best_match(x, wiki_circuits["modified_name"])))

ergast_circuits.head()

('albert', 0)


Unnamed: 0,circuitId,circuitRef,name,location,country,lat,lng,alt,url,modified_name,best_wiki_match,lev_distance,best_wiki_jacc_match,jacc_similarity,best_wiki_lev_match
0,1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,http://en.wikipedia.org/wiki/Melbourne_Grand_P...,albert,albert,0,albert,1.0,albert
1,2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,http://en.wikipedia.org/wiki/Sepang_Internatio...,sepang,sepang,0,sepang,1.0,sepang
2,3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,http://en.wikipedia.org/wiki/Bahrain_Internati...,bahrain,bahrain,0,bahrain,1.0,bahrain
3,4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,http://en.wikipedia.org/wiki/Circuit_de_Barcel...,barcelona-catalunya,barcelona-catalunya,0,barcelona-catalunya,1.0,barcelona-catalunya
4,5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,http://en.wikipedia.org/wiki/Istanbul_Park,istanbul,avus,6,intercity istanbul,0.5,avus


In [57]:
ergast_circuits[["best_wiki_lev_match", "lev_distance"]] = ergast_circuits[
    "modified_name"
].apply(lambda x: pd.Series(find_lev_best_match(x, wiki_circuits["modified_name"])))

ergast_circuits

Unnamed: 0,circuitId,circuitRef,name,location,country,lat,lng,alt,url,modified_name,best_wiki_match,lev_distance,best_wiki_jacc_match,jacc_similarity,best_wiki_lev_match
0,1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.84970,144.96800,10,http://en.wikipedia.org/wiki/Melbourne_Grand_P...,albert,albert,0,albert,1.0,albert
1,2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.73800,18,http://en.wikipedia.org/wiki/Sepang_Internatio...,sepang,sepang,0,sepang,1.0,sepang
2,3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.03250,50.51060,7,http://en.wikipedia.org/wiki/Bahrain_Internati...,bahrain,bahrain,0,bahrain,1.0,bahrain
3,4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57000,2.26111,109,http://en.wikipedia.org/wiki/Circuit_de_Barcel...,barcelona-catalunya,barcelona-catalunya,0,barcelona-catalunya,1.0,barcelona-catalunya
4,5,istanbul,Istanbul Park,Istanbul,Turkey,40.95170,29.40500,130,http://en.wikipedia.org/wiki/Istanbul_Park,istanbul,avus,6,intercity istanbul,0.5,avus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,75,portimao,Autódromo Internacional do Algarve,Portimão,Portugal,37.22700,-8.62670,108,http://en.wikipedia.org/wiki/Algarve_Internati...,do algarve,algarve,3,internacional do rio janeiro,0.4,algarve
73,76,mugello,Autodromo Internazionale del Mugello,Mugello,Italy,43.99750,11.37190,255,http://en.wikipedia.org/wiki/Mugello_Circuit,del mugello,del mugello,0,internazionale del mugello,1.0,del mugello
74,77,jeddah,Jeddah Corniche Circuit,Jeddah,Saudi Arabia,21.63190,39.10440,15,http://en.wikipedia.org/wiki/Jeddah_Street_Cir...,jeddah corniche,jeddah corniche,0,jeddah corniche,1.0,jeddah corniche
75,78,losail,Losail International Circuit,Al Daayen,Qatar,25.49000,51.45420,12,http://en.wikipedia.org/wiki/Losail_Internatio...,losail,lusail,1,adelaide,0.0,lusail


Jaccard

In [47]:
def jaccard_similarity(str1, str2):
    set1 = set(str1.split())
    set2 = set(str2.split())
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return len(intersection) / len(union)


def find_jacc_best_match(name, candidates):
    """Function to find the best match based on Jaccard similarity"""
    similarities = candidates.apply(lambda x: jaccard_similarity(name, x))
    return candidates.loc[similarities.idxmax()], similarities.max()

# Example applied to Albert Park
print(find_jacc_best_match("albert", ergast_circuits["modified_name"]))

ergast_circuits[["best_wiki_jacc_match", "jacc_similarity"]] = ergast_circuits[
    "modified_name"
].apply(lambda x: pd.Series(find_jacc_best_match(x, wiki_circuits["modified_name"])))

('albert', 1.0)


In [48]:
ergast_circuits

Unnamed: 0,circuitId,circuitRef,name,location,country,lat,lng,alt,url,modified_name,best_wiki_match,lev_distance,best_wiki_jacc_match,jacc_similarity
0,1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.84970,144.96800,10,http://en.wikipedia.org/wiki/Melbourne_Grand_P...,albert,albert,0,albert,1.0
1,2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.73800,18,http://en.wikipedia.org/wiki/Sepang_Internatio...,sepang,sepang,0,sepang,1.0
2,3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.03250,50.51060,7,http://en.wikipedia.org/wiki/Bahrain_Internati...,bahrain,bahrain,0,bahrain,1.0
3,4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57000,2.26111,109,http://en.wikipedia.org/wiki/Circuit_de_Barcel...,barcelona-catalunya,barcelona-catalunya,0,barcelona-catalunya,1.0
4,5,istanbul,Istanbul Park,Istanbul,Turkey,40.95170,29.40500,130,http://en.wikipedia.org/wiki/Istanbul_Park,istanbul,avus,6,intercity istanbul,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,75,portimao,Autódromo Internacional do Algarve,Portimão,Portugal,37.22700,-8.62670,108,http://en.wikipedia.org/wiki/Algarve_Internati...,internacional do algarve,internacional do rio janeiro,10,internacional do rio janeiro,0.4
73,76,mugello,Autodromo Internazionale del Mugello,Mugello,Italy,43.99750,11.37190,255,http://en.wikipedia.org/wiki/Mugello_Circuit,internazionale del mugello,internazionale del mugello,0,internazionale del mugello,1.0
74,77,jeddah,Jeddah Corniche Circuit,Jeddah,Saudi Arabia,21.63190,39.10440,15,http://en.wikipedia.org/wiki/Jeddah_Street_Cir...,jeddah corniche,jeddah corniche,0,jeddah corniche,1.0
75,78,losail,Losail International Circuit,Al Daayen,Qatar,25.49000,51.45420,12,http://en.wikipedia.org/wiki/Losail_Internatio...,losail,lusail,1,adelaide,0.0


In [None]:
# Create a new DataFrame to store the matches
matches = []

for idx, row in ergast_circuits.iterrows():
    best_match = find_best_match(row["modified_name"], wiki_circuits["Circuit"])

    matches.append(
        {
            "circuit_id": row["circuitId"],
            "circuit_name": row["name"],
            "wiki_id": wiki_circuits.loc[
                wiki_circuits["Circuit"] == best_match, "Circuit"
            ].values[0],
            "wiki_name": best_match,
        }
    )

matched_df = pd.DataFrame(matches)

In [None]:
matched_df

4. Impute missing number of turn values with K nearest neighbors imputer.