In [1]:
import pandas as pd
import re

In [2]:
data = pd.read_json(r'tests\data.json')

In [3]:
data.head(2)

Unnamed: 0,city,content
0,Altamonte-Springs,"Only 15 minutes north of downtown Orlando , 30..."
1,Apopka,"The city of Apopka , situated about 25 minutes..."


In [4]:
df = data.copy()

In [5]:
df['city'] = df['city'].apply(lambda x: re.sub('-', ' ', str(x)))

In [6]:
df.head(10)

Unnamed: 0,city,content
0,Altamonte Springs,"Only 15 minutes north of downtown Orlando , 30..."
1,Apopka,"The city of Apopka , situated about 25 minutes..."
2,Bartow,"Aptly nicknamed the “City of Oaks and Azaleas,..."
3,Celebration,"The picture-perfect community of Celebration ,..."
4,Clermont,"Just 35 minutes west of Orlando , the Lake Cou..."
5,Dunnellon,Situated along some of Florida’s most beloved ...
6,Eatonville,Known as the childhood home of renowned Harlem...
7,Kissimmee,"A trip to Kissimmee , just 25 minutes southwes..."
8,Lake Buena Vista,The three-square mile city of Lake Buena Vista...
9,Lake Mary,"Only 18 miles north of Orlando , the peaceful ..."


## Adding location coordenates

In [7]:
from geopy.geocoders import Nominatim   # Nominatim is a tool to convert address into latitude and longitude

In [8]:
geolocator = Nominatim(user_agent="my_geocoder")

In [9]:
def get_lat_long(city):
    try:
        location = geolocator.geocode(f"{city}, Florida, USA")
        return location.latitude, location.longitude
    except:
        return None, None


In [10]:
df['latitud'], df['longitud'] = zip(*df['city'].apply(get_lat_long))  # zip(*df['city'].apply(get_lat_long)) is used to unpack the tuple
df.head()

Unnamed: 0,city,content,latitud,longitud
0,Altamonte Springs,"Only 15 minutes north of downtown Orlando , 30...",28.661915,-81.38859
1,Apopka,"The city of Apopka , situated about 25 minutes...",28.673281,-81.511647
2,Bartow,"Aptly nicknamed the “City of Oaks and Azaleas,...",27.896379,-81.843157
3,Celebration,"The picture-perfect community of Celebration ,...",28.319057,-81.54087
4,Clermont,"Just 35 minutes west of Orlando , the Lake Cou...",28.555191,-81.766949


In [11]:
df['content'][4]

"Just 35 minutes west of Orlando , the Lake County city of Clermont is the place to go for all kinds of outdoor adventures, family-friendly attractions, and a pedestrian- and bike-friendly downtown brimming with restaurants, shops, parks, and festivals on the waterfront. Nicknamed “Choice of Champions,” and known as a training grounds for professional and Olympic athletes, Clermont is home to the state-of-the-art National Training Center , and the Great Florida Triathlon , which draws athletes from all over the world, is held at Waterfront Park on the shores of Lake Minneola. Clermont, a designated Trail Town , is also where outdoor enthusiasts can find everything from boating, fishing, and paddling on the Clermont Chain of Lakes to glamping and camping at Lake Louisa State Park . The Citrus Tower is a Central Florida landmark that affords visitors a bird's-eye view of Clermont’s rolling landscape, and Presidents Hall of Fame is a truly unique roadside attraction featuring replicas of 

In [12]:
def extract_uppercase_words(text):
    # Encontrar el índice inicial del texto a extraer
    indice_inicio = text.find('Plan your trip') + len('Plan your trip')
    texto_extraido = text[indice_inicio:] 
    pattern = r'\b[A-Z][A-Z ,&]+\b'
    matches = re.findall(pattern, texto_extraido)
    return matches

In [13]:
#Things to do collumn
df['things_to_do'] = df['content'].apply(extract_uppercase_words)

In [14]:
df.tail()

Unnamed: 0,city,content,latitud,longitud,things_to_do
22,Wauchula,"Wauchula , roughly 50 miles east of Bradenton ...",27.547259,-81.811471,"[WATERWAYS , FAMILY FUN , CULTURE & HISTORY , ..."
23,Wildwood,"Wildwood , located 50 miles northwest of Orlan...",28.865286,-82.039429,"[WATERWAYS , ARTS & ENTERTAINMENT , OUTDOOR AD..."
24,Winter Garden,"Winter Garden , just 30 minutes west of downto...",28.565665,-81.585674,"[FAMILY FUN , MUSEUMS, ARTS & ENTERTAINMENT , ..."
25,Winter Haven,"Winter Haven , situated an hour southwest of O...",28.022243,-81.732857,"[FAMILY FUN , LEGOLAND , MUSEUMS, ARTS & ENTER..."
26,Winter Park,"Situated just north of Orlando , Winter Park i...",28.597771,-81.351026,"[FAMILY FUN , MUSEUMS, ARTS & ENTERTAINMENT , ..."


In [15]:
def crear_diccionarios(lista_palabras, texto):
    """
    Crea una lista de diccionarios a partir de una lista de palabras clave y un texto.

    Args:
        lista_palabras: Una lista de palabras clave.
        texto: El texto completo.

    Returns:
        Una lista de diccionarios, donde cada diccionario contiene una palabra clave y el texto asociado.
    """

    diccionarios = []
    for palabra in lista_palabras:
        # Patrón para buscar la palabra clave seguida de cualquier texto hasta la siguiente letra mayúscula o el final de la cadena
        pattern = rf"{palabra}(.*?)(?=[A-Z][A-Z]|$)"
        match = re.search(pattern, texto)
        if match:
            diccionarios.append({palabra: match.group(1).strip()})
    return diccionarios

In [16]:
df['t_d'] = df.apply(lambda row: crear_diccionarios(row['things_to_do'], row['content']), axis=1)


In [17]:
df['t_d']

0     [{'FAMILY FUN ': 'Visit the area’s kid-friendl...
1     [{'WATERWAYS ': 'Explore the area’s waterways....
2     [{'WATERWAYS ': 'The famed Peace River begins ...
3     [{'FAMILY FUN ': 'Ride bikes by the water, str...
4     [{'WATERWAYS ': 'Clermont is home to a number ...
5     [{'WATERWAYS ': 'Swim, paddle, and explore the...
6     [{'MUSEUMS, ARTS & ENTERTAINMENT ': 'Learn abo...
7     [{'FAMILY FUN ': 'Explore the theme parks and ...
8     [{'THEME PARKS ': 'The area’s theme parks are ...
9     [{'FAMILY FUN ': 'Visit kid-friendly attractio...
10    [{'FAMILY FUN ': 'Spend time outside and check...
11    [{'WATERWAYS ': 'Fish, boat, and paddle on the...
12    [{'FAMILY FUN ': 'Visit the children’s museum,...
13    [{'WATERWAYS ': 'Take a guided eco-tour or pad...
14    [{'FAMILY FUN ': 'See wild manatees, swim in t...
15    [{'THEME PARKS ': 'Florida is epicenter of the...
16    [{'FL': '.'}, {'WATERWAYS ': 'Fish, boat, and ...
17    [{'FAMILY FUN ': 'Check out family-friendl

In [18]:
df.head()

Unnamed: 0,city,content,latitud,longitud,things_to_do,t_d
0,Altamonte Springs,"Only 15 minutes north of downtown Orlando , 30...",28.661915,-81.38859,"[FAMILY FUN , MUSEUMS, ARTS & ENTERTAINMENT , ...",[{'FAMILY FUN ': 'Visit the area’s kid-friendl...
1,Apopka,"The city of Apopka , situated about 25 minutes...",28.673281,-81.511647,"[WATERWAYS , FAMILY FUN , MUSEUMS, ARTS & ENTE...",[{'WATERWAYS ': 'Explore the area’s waterways....
2,Bartow,"Aptly nicknamed the “City of Oaks and Azaleas,...",27.896379,-81.843157,"[WATERWAYS , MUSEUMS, ARTS & ENTERTAINMENT , O...",[{'WATERWAYS ': 'The famed Peace River begins ...
3,Celebration,"The picture-perfect community of Celebration ,...",28.319057,-81.54087,"[FAMILY FUN , ENTERTAINMENT , OUTDOOR ADVENTUR...","[{'FAMILY FUN ': 'Ride bikes by the water, str..."
4,Clermont,"Just 35 minutes west of Orlando , the Lake Cou...",28.555191,-81.766949,"[WATERWAYS , FAMILY FUN , MUSEUMS, ARTS & ENTE...",[{'WATERWAYS ': 'Clermont is home to a number ...


In [19]:
df_activities = df.explode('t_d').reset_index(drop=True)

In [21]:
df_activities = df_activities[['city', 't_d']]


In [22]:
df_activities

Unnamed: 0,city,t_d
0,Altamonte Springs,{'FAMILY FUN ': 'Visit the area’s kid-friendly...
1,Altamonte Springs,"{'MUSEUMS, ARTS & ENTERTAINMENT ': 'Discover O..."
2,Altamonte Springs,{'OUTDOOR ADVENTURES ': 'Explore the parks and...
3,Altamonte Springs,{'WATERWAYS ': 'Fish and paddle the waterways....
4,Altamonte Springs,{'BEACHES ': 'Take a trip to east coast beache...
...,...,...
296,Winter Park,{'SHOPPING ': 'Explore the upscale shops of tr...
297,Winter Park,{'PET': '-'}
298,Winter Park,{'FRIENDLY ADVENTURES ': 'Visit the pet-friend...
299,Winter Park,{'SPORTS ': 'Hit the links at the city’s 18-ho...


In [24]:
df_activities.to_csv('activities.csv', index=False, encoding='utf-8')