In [2]:
import pandas as pd
df = pd.read_csv("cities.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,city_st,lat,lng,population,lat_prompt,lng_prompt,distance_km
0,0,"New York, NY",40.6943,-73.9249,18713220,40.7128,-74.006,7.139212
1,1,"Los Angeles, CA",34.1139,-118.4068,12750807,34.0522,-118.2437,16.513283
2,2,"Chicago, IL",41.8373,-87.6862,8604203,41.8781,-87.6298,6.511529
3,3,"Miami, FL",25.7839,-80.2102,6445545,25.7617,-80.1918,3.080308
4,4,"Dallas, TX",32.7936,-96.7662,5743938,32.7767,-96.797,3.438239


In [10]:
from geopy.distance import geodesic
import numpy as np

def calculate_distance(lat1, lon1, lat2, lon2):
    return geodesic((lat1, lon1), (lat2, lon2)).km

def generate_random_triplets(df, num_triplets=1000):
    columns = ['City1', 'City1_long', 'City1_lat',
               'City2', 'City2_long', 'City2_lat',
               'City3', 'City3_long', 'City3_lat']
    data = {col: [] for col in columns}
    
    while len(data['City1']) < num_triplets:
        sample = df.sample(n=3).reset_index(drop=True)
        
        distance1_2 = calculate_distance(sample.at[0, 'lat'], sample.at[0, 'lng'],
                                         sample.at[1, 'lat'], sample.at[1, 'lng'])
        distance1_3 = calculate_distance(sample.at[0, 'lat'], sample.at[0, 'lng'],
                                         sample.at[2, 'lat'], sample.at[2, 'lng'])
        
        if abs(distance1_2 - distance1_3) > 100:
            for i in range(3):
                city = sample.at[i, 'city_st']
                
                data[f'City{i+1}'].append(city)
                data[f'City{i+1}_long'].append(sample.at[i, 'lng'])
                data[f'City{i+1}_lat'].append(sample.at[i, 'lat'])
    
    triplet_df = pd.DataFrame(data)
    return triplet_df

triplets = generate_random_triplets(df)
triplets.head()


Unnamed: 0,City1,City1_long,City1_lat,City2,City2_long,City2_lat,City3,City3_long,City3_lat
0,"Goldsboro, NC",-77.972,35.3778,"St. Cloud, MN",-94.1718,45.5339,"Stamford, CT",-73.5583,41.1035
1,"Woodbury, MN",-92.923,44.9056,"Medford, MA",-71.1087,42.4234,"Des Plaines, IL",-87.9009,42.0345
2,"Muncie, IN",-85.395,40.1989,"Roswell, GA",-84.3513,34.0391,"Houma, LA",-90.7058,29.5799
3,"Glendora, CA",-117.8468,34.1449,"Lafayette, LA",-92.0323,30.2084,"Dublin, CA",-121.8963,37.7161
4,"San Juan, PR",-66.0636,18.4037,"Rome, GA",-85.1862,34.2661,"Federal Way, WA",-122.3358,47.3091


In [11]:
triplets.to_csv("triplets.csv")

In [12]:
prompts = triplets.apply(lambda row: f"Is {row['City1']} closer to {row['City2']} or {row['City3']}?", axis=1)
prompts = pd.DataFrame(prompts, columns=['Prompt'])
prompts.head()

Unnamed: 0,Prompt
0,"Is Goldsboro, NC closer to St. Cloud, MN or St..."
1,"Is Woodbury, MN closer to Medford, MA or Des P..."
2,"Is Muncie, IN closer to Roswell, GA or Houma, LA?"
3,"Is Glendora, CA closer to Lafayette, LA or Dub..."
4,"Is San Juan, PR closer to Rome, GA or Federal ..."


In [13]:
prompts.to_csv('prompts.csv')

In [28]:
# now we manually let GPT populate our prompts.csv file, as we read it in again
prompts = pd.read_csv('prompts.csv')

In [29]:
prompts.head()

Unnamed: 0,Prompt,Response
0,Is Goldsboro NC closer to St. Cloud MN or Stam...,"Stamford, CT"
1,Is Woodbury MN closer to Medford MA or Des Pla...,"Des Plaines, IL"
2,Is Muncie IN closer to Roswell GA or Houma LA?,"Roswell, GA"
3,Is Glendora CA closer to Lafayette LA or Dubli...,"Dublin, CA"
4,Is San Juan PR closer to Rome GA or Federal Wa...,"Federal Way, WA"


In [30]:
ultimate = pd.concat([triplets, prompts], axis=1)
ultimate.head()

Unnamed: 0,City1,City1_long,City1_lat,City2,City2_long,City2_lat,City3,City3_long,City3_lat,Prompt,Response
0,"Goldsboro, NC",-77.972,35.3778,"St. Cloud, MN",-94.1718,45.5339,"Stamford, CT",-73.5583,41.1035,Is Goldsboro NC closer to St. Cloud MN or Stam...,"Stamford, CT"
1,"Woodbury, MN",-92.923,44.9056,"Medford, MA",-71.1087,42.4234,"Des Plaines, IL",-87.9009,42.0345,Is Woodbury MN closer to Medford MA or Des Pla...,"Des Plaines, IL"
2,"Muncie, IN",-85.395,40.1989,"Roswell, GA",-84.3513,34.0391,"Houma, LA",-90.7058,29.5799,Is Muncie IN closer to Roswell GA or Houma LA?,"Roswell, GA"
3,"Glendora, CA",-117.8468,34.1449,"Lafayette, LA",-92.0323,30.2084,"Dublin, CA",-121.8963,37.7161,Is Glendora CA closer to Lafayette LA or Dubli...,"Dublin, CA"
4,"San Juan, PR",-66.0636,18.4037,"Rome, GA",-85.1862,34.2661,"Federal Way, WA",-122.3358,47.3091,Is San Juan PR closer to Rome GA or Federal Wa...,"Federal Way, WA"


In [33]:
def check_response_correctness(row):
    city1_coords = (row['City1_lat'], row['City1_long'])
    city2_coords = (row['City2_lat'], row['City2_long'])
    city3_coords = (row['City3_lat'], row['City3_long'])

    distance_city1_city2 = geodesic(city1_coords, city2_coords).kilometers
    distance_city1_city3 = geodesic(city1_coords, city3_coords).kilometers

    response_city, response_state = row['Response'].split(',')
    response_coords = city2_coords if response_city.strip() == row['City2'].strip() else city3_coords

    is_correct = response_coords == (city2_coords if distance_city1_city2 < distance_city1_city3 else city3_coords)

    return 1 if is_correct else 0

ultimate['correct'] = ultimate.apply(check_response_correctness, axis=1)
ultimate.head()

Unnamed: 0,City1,City1_long,City1_lat,City2,City2_long,City2_lat,City3,City3_long,City3_lat,Prompt,Response,correct
0,"Goldsboro, NC",-77.972,35.3778,"St. Cloud, MN",-94.1718,45.5339,"Stamford, CT",-73.5583,41.1035,Is Goldsboro NC closer to St. Cloud MN or Stam...,"Stamford, CT",1
1,"Woodbury, MN",-92.923,44.9056,"Medford, MA",-71.1087,42.4234,"Des Plaines, IL",-87.9009,42.0345,Is Woodbury MN closer to Medford MA or Des Pla...,"Des Plaines, IL",1
2,"Muncie, IN",-85.395,40.1989,"Roswell, GA",-84.3513,34.0391,"Houma, LA",-90.7058,29.5799,Is Muncie IN closer to Roswell GA or Houma LA?,"Roswell, GA",0
3,"Glendora, CA",-117.8468,34.1449,"Lafayette, LA",-92.0323,30.2084,"Dublin, CA",-121.8963,37.7161,Is Glendora CA closer to Lafayette LA or Dubli...,"Dublin, CA",1
4,"San Juan, PR",-66.0636,18.4037,"Rome, GA",-85.1862,34.2661,"Federal Way, WA",-122.3358,47.3091,Is San Juan PR closer to Rome GA or Federal Wa...,"Federal Way, WA",0


In [36]:
total_correct = ultimate['correct'].sum()
total_responses = len(df)
percentage_correct = (total_correct / total_responses) * 100
print(percentage_correct)

48.99598393574297


In [37]:
ultimate.to_csv('ultimate.csv')

In [47]:
def calculate_distance(lat1, lon1, lat2, lon2):
    return geodesic((lat1, lon1), (lat2, lon2)).km

def calc_per(df, distance):

    num_correct = 0
    
    for _, row in df.iterrows():
        distance1_2 = calculate_distance(row['City1_lat'], row['City1_long'],
                                         row['City2_lat'], row['City2_long'])
        distance1_3 = calculate_distance(row['City3_lat'], row['City3_long'],
                                         row['City3_lat'], row['City3_long'])
        
        if abs(distance1_2 - distance1_3) > distance and row['correct']:
            num_correct += 1
    
    return num_correct / len(df)

print(calc_per(ultimate, 0))
print(calc_per(ultimate, 100))
print(calc_per(ultimate, 200))
print(calc_per(ultimate, 300))
print(calc_per(ultimate, 500))
print(calc_per(ultimate, 1000))
print(calc_per(ultimate, 2000))

0.488
0.488
0.487
0.487
0.484
0.469
0.351


In [2]:
# now we try something new
import pandas as pd
ultimate2 = pd.read_csv("ultimate.csv")
ultimate2.head()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0.1,Unnamed: 0,City1,City1_long,City1_lat,City2,City2_long,City2_lat,City3,City3_long,City3_lat,Prompt,Response,correct
0,0,"Goldsboro, NC",-77.972,35.3778,"St. Cloud, MN",-94.1718,45.5339,"Stamford, CT",-73.5583,41.1035,Is Goldsboro NC closer to St. Cloud MN or Stam...,"Stamford, CT",1
1,1,"Woodbury, MN",-92.923,44.9056,"Medford, MA",-71.1087,42.4234,"Des Plaines, IL",-87.9009,42.0345,Is Woodbury MN closer to Medford MA or Des Pla...,"Des Plaines, IL",1
2,2,"Muncie, IN",-85.395,40.1989,"Roswell, GA",-84.3513,34.0391,"Houma, LA",-90.7058,29.5799,Is Muncie IN closer to Roswell GA or Houma LA?,"Roswell, GA",0
3,3,"Glendora, CA",-117.8468,34.1449,"Lafayette, LA",-92.0323,30.2084,"Dublin, CA",-121.8963,37.7161,Is Glendora CA closer to Lafayette LA or Dubli...,"Dublin, CA",1
4,4,"San Juan, PR",-66.0636,18.4037,"Rome, GA",-85.1862,34.2661,"Federal Way, WA",-122.3358,47.3091,Is San Juan PR closer to Rome GA or Federal Wa...,"Federal Way, WA",0


In [2]:
import pandas as pd
ultimate2 = pd.read_csv("ultimate2.csv")
ultimate2.head()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,City1,City1_long,City1_lat,City2,City2_long,City2_lat,City3,City3_long,City3_lat,Prompt,Response,correct
0,"Goldsboro, NC",-77.972,35.3778,"St. Cloud, MN",-94.1718,45.5339,"Stamford, CT",-73.5583,41.1035,Is Goldsboro NC closer to St. Cloud MN or Stam...,"Stamford, CT",1
1,"Woodbury, MN",-92.923,44.9056,"Medford, MA",-71.1087,42.4234,"Des Plaines, IL",-87.9009,42.0345,Is Woodbury MN closer to Medford MA or Des Pla...,"Des Plaines, IL",1
2,"Muncie, IN",-85.395,40.1989,"Roswell, GA",-84.3513,34.0391,"Houma, LA",-90.7058,29.5799,Is Muncie IN closer to Roswell GA or Houma LA?,"Roswell, GA",0
3,"Glendora, CA",-117.8468,34.1449,"Lafayette, LA",-92.0323,30.2084,"Dublin, CA",-121.8963,37.7161,Is Glendora CA closer to Lafayette LA or Dubli...,"Dublin, CA",1
4,"San Juan, PR",-66.0636,18.4037,"Rome, GA",-85.1862,34.2661,"Federal Way, WA",-122.3358,47.3091,Is San Juan PR closer to Rome GA or Federal Wa...,"Rome, GA",0


In [3]:
from geopy.distance import geodesic

def calculate_distances(row):
    city1_coords = (row['City1_lat'], row['City1_long'])
    city2_coords = (row['City2_lat'], row['City2_long'])
    city3_coords = (row['City3_lat'], row['City3_long'])

    distance_to_city2 = geodesic(city1_coords, city2_coords).kilometers
    distance_to_city3 = geodesic(city1_coords, city3_coords).kilometers

    closer_city = row['City2'] if distance_to_city2 < distance_to_city3 else row['City3']

    return closer_city

ultimate2['Closer_City'] = ultimate2.apply(calculate_distances, axis=1)
ultimate2['correct'] = ultimate2['Response'] == ultimate2['Closer_City']
ultimate2.head()


Unnamed: 0,City1,City1_long,City1_lat,City2,City2_long,City2_lat,City3,City3_long,City3_lat,Prompt,Response,correct,Closer_City
0,"Goldsboro, NC",-77.972,35.3778,"St. Cloud, MN",-94.1718,45.5339,"Stamford, CT",-73.5583,41.1035,Is Goldsboro NC closer to St. Cloud MN or Stam...,"Stamford, CT",True,"Stamford, CT"
1,"Woodbury, MN",-92.923,44.9056,"Medford, MA",-71.1087,42.4234,"Des Plaines, IL",-87.9009,42.0345,Is Woodbury MN closer to Medford MA or Des Pla...,"Des Plaines, IL",True,"Des Plaines, IL"
2,"Muncie, IN",-85.395,40.1989,"Roswell, GA",-84.3513,34.0391,"Houma, LA",-90.7058,29.5799,Is Muncie IN closer to Roswell GA or Houma LA?,"Roswell, GA",True,"Roswell, GA"
3,"Glendora, CA",-117.8468,34.1449,"Lafayette, LA",-92.0323,30.2084,"Dublin, CA",-121.8963,37.7161,Is Glendora CA closer to Lafayette LA or Dubli...,"Dublin, CA",True,"Dublin, CA"
4,"San Juan, PR",-66.0636,18.4037,"Rome, GA",-85.1862,34.2661,"Federal Way, WA",-122.3358,47.3091,Is San Juan PR closer to Rome GA or Federal Wa...,"Rome, GA",True,"Rome, GA"


In [6]:
def calculate_distance(lat1, lon1, lat2, lon2):
    return geodesic((lat1, lon1), (lat2, lon2)).km

def calc_per(df, distance):
    skip = 0
    num_correct = 0
    
    for _, row in df.iterrows():
        distance1_2 = calculate_distance(row['City1_lat'], row['City1_long'],
                                         row['City2_lat'], row['City2_long'])
        distance1_3 = calculate_distance(row['City1_lat'], row['City1_long'],
                                         row['City3_lat'], row['City3_long'])
        
        if abs(distance1_2 - distance1_3) > distance:
            if row['correct']:
                num_correct += 1
        else:
            skip += 1
    
    return num_correct / (len(df) - skip)
print(calc_per(ultimate2, 0))
print(calc_per(ultimate2, 100))
print(calc_per(ultimate2, 200))
print(calc_per(ultimate2, 300))
print(calc_per(ultimate2, 400))
print(calc_per(ultimate2, 500))
print(calc_per(ultimate2, 600))
print(calc_per(ultimate2, 700))
print(calc_per(ultimate2, 800))
print(calc_per(ultimate2, 900))
print(calc_per(ultimate2, 1000))

    

0.557
0.557
0.5612353567625133
0.5662921348314607
0.5667074663402693
0.5684210526315789
0.5688202247191011
0.5688622754491018
0.5780998389694042
0.5899653979238755
0.5978062157221207


In [5]:
ultimate2.to_csv("ultimate2.csv")