In [1]:
from transformers import GPT2Tokenizer, GPT2Model
import torch
import torchvision
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression

In [2]:
import pandas as pd
# Load the city table from the working directory. The preview of this file
# shows leftover "Unnamed: 0*" columns next to city_st/lat/lng/population —
# presumably row indexes written by earlier to_csv calls (TODO confirm).
cities = pd.read_csv('cities.csv')

In [3]:
# Preview the first five rows to see which columns the file provides.
cities.head()

Unnamed: 0.1,Unnamed: 0,city_st,lat,lng,population,lat_prompt,lng_prompt,distance_km
0,0,"New York, NY",40.6943,-73.9249,18713220,40.7128,-74.006,7.139212
1,1,"Los Angeles, CA",34.1139,-118.4068,12750807,34.0522,-118.2437,16.513283
2,2,"Chicago, IL",41.8373,-87.6862,8604203,41.8781,-87.6298,6.511529
3,3,"Miami, FL",25.7839,-80.2102,6445545,25.7617,-80.1918,3.080308
4,4,"Dallas, TX",32.7936,-96.7662,5743938,32.7767,-96.797,3.438239


In [4]:
# Keep only the columns used from here on. The explicit .copy() makes `cities`
# an independent frame, so adding new columns to it afterwards does not trigger
# pandas' SettingWithCopyWarning.
cities = cities[["city_st", "lat", "lng", "population"]].copy()

In [5]:
# Load the pretrained GPT-2 tokenizer (its files are downloaded on first use).
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Turn each "City, ST" string into its list of GPT-2 token ids; the bound
# method is handed to apply directly, one call per row.
cities['city_tokens'] = cities['city_st'].apply(tokenizer.encode)

# Show the first rows, now with the token-id column.
cities.head()

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Unnamed: 0,city_st,lat,lng,population,city_tokens
0,"New York, NY",40.6943,-73.9249,18713220,"[3791, 1971, 11, 6645]"
1,"Los Angeles, CA",34.1139,-118.4068,12750807,"[28903, 5652, 11, 7257]"
2,"Chicago, IL",41.8373,-87.6862,8604203,"[25705, 11, 14639]"
3,"Miami, FL",25.7839,-80.2102,6445545,"[41191, 11, 9977]"
4,"Dallas, TX",32.7936,-96.7662,5743938,"[40540, 11, 15326]"


In [6]:
# Load the pretrained 'gpt2' checkpoint (weights are downloaded on first use).
# get_gpt2_embedding below reads this module-level `model`.
model = GPT2Model.from_pretrained('gpt2')
def get_gpt2_embedding(tokens) -> np.ndarray:
    """Return one fixed-size embedding for a sequence of GPT-2 token ids.

    The ids are passed through the module-level ``model`` as a batch of one,
    and the resulting ``last_hidden_state`` is averaged over the token axis.

    Args:
        tokens: Non-empty sequence of integer token ids, e.g. the output of
            ``tokenizer.encode``.

    Returns:
        NumPy array with the mean of ``last_hidden_state`` across all token
        positions, with the batch dimension removed.

    Raises:
        ValueError: If ``tokens`` contains no ids.
    """
    token_ids = torch.tensor(tokens)
    if token_ids.numel() == 0:
        # An empty list turns into an element-less float tensor; fail here
        # with a clear message instead of somewhere inside the model.
        raise ValueError("tokens must contain at least one token id")

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        inputs = token_ids.unsqueeze(0)  # add a leading batch dimension of 1
        outputs = model(inputs)
        # Average over dim 1 (token positions), then drop only the batch axis.
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()
    return embedding
# Compute the mean-pooled GPT-2 embedding for every row's token list and keep
# the resulting arrays in a new column.
cities['city_embeddings'] = cities['city_tokens'].map(get_gpt2_embedding)

# Show the first rows, now with the embedding column.
cities.head()

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Unnamed: 0,city_st,lat,lng,population,city_tokens,city_embeddings
0,"New York, NY",40.6943,-73.9249,18713220,"[3791, 1971, 11, 6645]","[-0.17170806, -0.37608987, -0.17624038, -0.017..."
1,"Los Angeles, CA",34.1139,-118.4068,12750807,"[28903, 5652, 11, 7257]","[-0.024157278, -0.20168556, -0.23296228, 0.115..."
2,"Chicago, IL",41.8373,-87.6862,8604203,"[25705, 11, 14639]","[-0.16458298, -0.39560756, -0.29781523, 0.0899..."
3,"Miami, FL",25.7839,-80.2102,6445545,"[41191, 11, 9977]","[0.12791924, -0.2964219, -0.37983736, 0.084224..."
4,"Dallas, TX",32.7936,-96.7662,5743938,"[40540, 11, 15326]","[0.13252692, -0.21221308, -0.3111354, 0.007279..."


In [7]:
# Persist the table with tokens and embeddings. index=False keeps the
# positional row index out of the file; otherwise it is written as an unnamed
# first column that comes back as "Unnamed: 0" on the next pd.read_csv.
# NOTE(review): the NumPy arrays in 'city_embeddings' are written as their text
# repr, so a reader has to parse them back — confirm that is acceptable.
cities.to_csv("city_gpt2_embeddings.csv", index=False)