In [25]:
import pandas as pd
import geopandas as gpd
import os
from datetime import datetime
import re
import numpy as np
import folium
from shapely.geometry import Point
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import sys
import networkx as nx
from itertools import combinations
from fuzzywuzzy import fuzz
import re
import html
    
from together import Together  # pip install together
sys.path.insert(1, '../')
from Functions import get_gentrification_scores, map_static, get_gentrification_scores_categorical

def clean_text(text):
    """Return ``text`` with HTML entities decoded, tags removed and whitespace collapsed.

    Parameters
    ----------
    text : object
        Value to clean. Anything that is not a ``str`` (e.g. NaN/None) yields "".

    Returns
    -------
    str
        The cleaned text, stripped of leading/trailing whitespace.
    """
    if not isinstance(text, str):
        return ""
    text = html.unescape(text)  # Convert things like &amp; to &
    # Line breaks become a space so the words on either side stay separated.
    # Case-insensitive so that <BR> / <Br /> are handled like <br>.
    text = re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'<[^>]+>', '', text)  # Remove any other HTML tags
    text = re.sub(r'\s+', ' ', text)  # Collapse repeated whitespace
    return text.strip()

def deduplicate_with_logging(df, lsoa_name, threshold=75, verbose=False):
    """Drop near-duplicate rows of ``df`` based on fuzzy similarity of the 'text' column.

    Every pair of rows is compared with ``fuzz.token_set_ratio``; pairs scoring at
    least ``threshold`` are linked, and each connected group of linked rows is
    reduced to the row with the smallest index label. Because groups are connected
    components, two rows can end up in the same group through a chain of
    similar rows even if they are not directly similar to each other.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to deduplicate; must have a 'text' column.
    lsoa_name : str
        Name used only in the optional duplicate report.
    threshold : int, default 75
        Minimum ``token_set_ratio`` score for two rows to count as duplicates.
    verbose : bool, default False
        If True, print each group of duplicates that was found.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when it has fewer than two rows, otherwise the rows kept
        after deduplication (one per group).
    """
    if len(df) < 2:
        return df  # Nothing to compare

    G = nx.Graph()
    G.add_nodes_from(df.index)

    # Link every pair of rows whose texts are similar enough
    for i, j in combinations(df.index, 2):
        if fuzz.token_set_ratio(df.at[i, 'text'], df.at[j, 'text']) >= threshold:
            G.add_edge(i, j)

    groups = list(nx.connected_components(G))

    if verbose:
        for group in groups:
            if len(group) > 1:
                print(f"\nDuplicates in {lsoa_name}:")
                for idx in sorted(group):
                    text_snippet = str(df.at[idx, 'text'])[:120].replace('\n', ' ')
                    print(f" - {text_snippet}")

    # Keep the smallest index label from each group
    return df.loc[[min(group) for group in groups]]

In [26]:
# City whose "airbnb-<city>" data folder is read by the cells below
city = "manchester"

In [27]:
# Neighbourhood boundaries shipped with the Airbnb data for the selected city
neighbourhoods = gpd.read_file(
    f"../../../data/AirbnbData/airbnb-{city}/neighbourhoods.geojson"
)

In [28]:
# Load the 2011 LSOA boundary shapefile
lsoas = gpd.read_file('../../../data/SpatialData/LSOAs_2011/LSOA_2011_EW_BSC_V4.shp')

# Area names used to pick out LSOAs: any LSOA whose LSOA11NMW value contains
# one of these names (joined into a single regex alternation) is kept
manc_lads = [
    'Manchester', 'Rochdale', 'Bolton', 'Bury', 'Wigan',
    'Oldham', 'Trafford', 'Salford', 'Tameside', 'Stockport',
]
pattern = '|'.join(manc_lads)
manchester_lsoas = lsoas.loc[lsoas['LSOA11NMW'].str.contains(pattern)]

# Work in British National Grid (EPSG:27700) so that distances are in metres
manchester_lsoas = manchester_lsoas.to_crs(epsg=27700)

# Approximate Manchester city centre; Point takes (longitude, latitude) in WGS84,
# and the point is then projected to the same CRS as the LSOAs
city_centre_wgs84 = Point(-2.2426, 53.4808)
city_centre_point = (
    gpd.GeoSeries([city_centre_wgs84], crs='EPSG:4326')
    .to_crs(epsg=27700)
    .iloc[0]
)

# Keep the LSOAs that touch a 6000 m (6 km) circle around the city centre
buffer = city_centre_point.buffer(6000)
central_lsoas = manchester_lsoas.loc[manchester_lsoas.intersects(buffer)]

### Join together 4 sets of listings, removing duplicates

In [29]:
# Read the four 2024 listing snapshots (March, June, September, December)
listings_mar, listings_jun, listings_sept, listings_dec = (
    pd.read_csv(snapshot_path)
    for snapshot_path in (
        f"../../../data/AirbnbData/airbnb-{city}/listings_mar24.csv.gz",
        f"../../../data/AirbnbData/airbnb-{city}/listings_jun24.csv.gz",
        f"../../../data/AirbnbData/airbnb-{city}/listings_sept24.csv.gz",
        f"../../../data/AirbnbData/airbnb-{city}/listings_dec24.csv.gz",
    )
)

In [30]:
# Record which snapshot each row came from
listings_mar["source"] = "Mar"
listings_jun["source"] = "Jun"
listings_sept["source"] = "Sept"
listings_dec["source"] = "Dec"

# Stack the four snapshots into one frame with a fresh 0..n-1 index
combined = pd.concat(
    [listings_mar, listings_jun, listings_sept, listings_dec],
    ignore_index=True,
)
print(len(combined))

# Successive de-duplication passes, each keeping the first occurrence and
# printing the number of rows that remain: first by listing id, then by exact
# coordinates, then by identical neighbourhood overview + description
unique_listings = combined
for dedupe_keys in (
    ['id'],
    ['latitude', 'longitude'],
    ['neighborhood_overview', 'description'],
):
    unique_listings = unique_listings.drop_duplicates(subset=dedupe_keys, keep='first').copy()
    print(len(unique_listings))

26192
10109
9668
8102


### Add text columns combining description and neighbourhood overview

In [31]:
# 'text' = description + " " + neighbourhood overview (a missing part counts as
# an empty string); rows where both parts are missing get NaN instead
unique_listings['text'] = np.where(
    unique_listings[['description', 'neighborhood_overview']].notna().any(axis=1),
    unique_listings['description'].fillna('') + " " + unique_listings['neighborhood_overview'].fillna(''),
    np.nan,
)

### Delete listings with no textual description

In [32]:
# Keep only listings that have some text. The result is copied so that columns
# added to it later are assigned to an independent frame rather than to a
# slice of the previous one (avoids pandas' SettingWithCopyWarning).
unique_listings = unique_listings[unique_listings['text'].notna()].copy()
print(len(unique_listings))

8101


### Add spatial information to listings

In [33]:
# Build point geometries from the longitude/latitude columns in one vectorised
# call instead of constructing a Point per row with DataFrame.apply
unique_listings['geometry'] = gpd.points_from_xy(
    unique_listings['longitude'], unique_listings['latitude'])
unique_listings_gdf = gpd.GeoDataFrame(unique_listings, geometry='geometry', crs="EPSG:4326")

# Reproject to the CRS of the LSOA polygons, then attach the name of the LSOA
# each listing falls within (left join: listings outside every LSOA are kept)
unique_listings_gdf = unique_listings_gdf.to_crs(manchester_lsoas.crs)
unique_listings_gdf = gpd.sjoin(
    unique_listings_gdf,
    manchester_lsoas[['LSOA11NM', 'geometry']],
    how='left',
    predicate='within')
print(len(unique_listings_gdf))

8101


### Keep only those in central Manchester

In [34]:
# Circle of radius 6000 around the city-centre point (same value as used for the LSOAs)
buffer = city_centre_point.buffer(6000)
# Keep only the listings whose point intersects that circle
unique_listings_gdf = unique_listings_gdf.loc[unique_listings_gdf.intersects(buffer)]
print(len(unique_listings_gdf))

5262


### Get a dataframe containing just one entry per LSOA (with textual descriptions combined)

In [35]:
# Build one record per LSOA: remove near-duplicate listings within the LSOA,
# then join the remaining texts into a single string
results = []
for lsoa, group in unique_listings_gdf.groupby("LSOA11NM"):
    # Check for duplicated listings
    deduped = deduplicate_with_logging(group, lsoa)
    # Combine all text entries into one string
    combined_text = " ".join(deduped['text'].dropna().astype(str))
    # The group key is the LSOA11NM value shared by every row in the group
    results.append({'LSOA11NM': lsoa, 'text': combined_text})

# Create the frame once from the collected records; naming the columns keeps
# the expected schema even when there are no groups at all
deduplicated_listings = pd.DataFrame(results, columns=['LSOA11NM', 'text'])
print(len(deduplicated_listings))

309


### Clean weird characters out of text

In [17]:
# Run clean_text over every combined LSOA text.
# (To eyeball the effect, print deduplicated_listings.loc[i, 'text'] next to
# clean_text(deduplicated_listings.loc[i, 'text']) for a few rows before this line.)
deduplicated_listings['text'] = deduplicated_listings['text'].map(clean_text)

## The prompt

In [18]:
# Instruction text passed to the LLM along with the listing text.
# It contains no placeholders, so it is a plain (non-f) string.
prompt_explanatory = """
You are an expert in urban studies with a deep understanding of gentrification and its portrayal in public discourse. I will provide you with Airbnb 
listings, including a description of the property and a neighborhood overview. Your task is to analyze these texts and assess the gentrification status 
of the area based on how the neighborhood is presented.

Focus primarily on the neighborhood overview and description, ignoring property-specific details like the number of bedrooms, amenities, or decor. 

Consider the following:
- Direct mentions of local attractions, businesses, or community features that suggest development or revitalization.
- Language that highlights cultural hotspots, boutique shops, trendy cafes, or artisanal markets.
- Descriptions that emphasize diversity, safety, or the presence of creative communities, as these can signal gentrification dynamics.
- Listings that avoid mentioning the neighborhood or speak only broadly about the city may imply that the immediate area lacks desirable features or is 
not a selling point. This absence of detail should inform your assessment.

Assign one of the following categories:
- "Established": A well-known, desirable area with stable appeal and little active change.
- "Gentrifying": Signs of recent or ongoing transformation, such as new businesses or cultural shifts.
- "Emerging": Early indicators of gentrification potential, like creative spaces or gradual commercial interest.
- "Undeveloped": Lacking indicators of gentrification, often reflected in vague or absent neighborhood descriptions.

If a listing genuinely lacks sufficient information to make any assessment (e.g., the text is too sparse), assign a score of 'NA' and briefly explain.

Provide your answer strictly in the format:
'1. Category. Reasoning.', without any additional explanation or commentary. Only provide one score per listing'
"""

# The prompt actually used when querying the LLM
prompt = prompt_explanatory

## Decide whether to run LLM
Decide whether to run the LLM or load a file of scores that has been previously calculated and saved

In [19]:
# Look in this city's data folder for previously saved score files
base_dir = os.path.expanduser(f"../../../data/AirbnbData/airbnb-{city}/")
base_filename = "airbnb_gentrification_scores_one_per_lsoa"
file_extension = ".gpkg"

# Saved files are named "<base_filename>_NN<file_extension>" (NN = two digits).
# The literal parts are escaped so that "." in the extension only matches a dot.
# NOTE(review): the saving step in this notebook writes "<base_filename>_NN.csv",
# which this ".gpkg" pattern will not match - confirm which format is intended.
saved_file_pattern = re.compile(
    f"{re.escape(base_filename)}_\\d{{2}}{re.escape(file_extension)}$")

# Sorted, so the last entry has the highest two-digit counter
matching_files = [
    f for f in sorted(os.listdir(base_dir))
    if saved_file_pattern.match(f)]
matching_files

[]

In [20]:
# Load the last (highest-numbered) matching file if there is one; otherwise
# leave df as None so that later cells know nothing was loaded
if not matching_files:
    print("No matching files found. 'df' will not be loaded.")
    df = None
else:
    latest_file = matching_files[-1]
    df = gpd.read_file(base_dir + latest_file)
    print(f"Loaded file: {latest_file}")

No matching files found. 'df' will not be loaded.


In [21]:
# Path of the log file for this session, named after the time this cell was run
LOG_FILE = os.path.join("../logs", datetime.now().strftime("%Y-%m-%d-%H%M%S.log"))


def log(msg):
    """Append ``msg`` to LOG_FILE exactly as given (no newline is added).

    The log directory is created on first use if it does not exist, and the
    file is written as UTF-8 so that non-ASCII text does not depend on the
    platform's default encoding.
    """
    os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(msg)

In [24]:
# Decide whether to query the LLM.
# FORCE_RUN_LLM = True always re-runs it (this matches the previous hard-coded
# override); set it to False to reuse a scores file loaded earlier, if any.
FORCE_RUN_LLM = True
RUN_LLM = FORCE_RUN_LLM or df is None
if not RUN_LLM:
    print("Have already loaded a gentrification file, will not re-run the LLM.")

if RUN_LLM:
    # NOTE(review): this function is called below but is not in the notebook's
    # top-level "from Functions import ..." line, so a fresh kernel would raise
    # NameError. It is assumed to live in Functions next to
    # get_gentrification_scores_categorical - confirm.
    from Functions import get_gentrification_scores_categorical_one_per_lsoa

    print("Running LLM")
    # Get the API key from a file (first line, surrounding whitespace removed)
    with open('../together.ai_key.txt', 'r') as f:
        api_key = f.readline().strip()

    client = Together(api_key=api_key)

    # Work on a copy so the per-LSOA text frame itself is left untouched
    df = deduplicated_listings.copy()

    print(f"Will query the LM for {len(df)} items")

    assert len(df) < 11000, "Too many tweets to process in one go. Please reduce the number of tweets."

    # Ensure the index is consecutive and ascending, so that the label-based
    # slices below select exactly one batch each
    df = df.reset_index(drop=True)
    # Columns that will receive the results
    df['gentrification_prediction'] = None
    df['explanation'] = None

    # Batch processing
    batch_size = 1
    n_batches = -(-len(df) // batch_size)  # ceiling division: a final partial batch still counts
    for i in range(0, len(df), batch_size):
        # .loc slicing is inclusive at both ends, hence the "- 1"
        batch_tweets = df.loc[i:i + batch_size - 1, :]

        print(f"Submitting batch {i // batch_size + 1} of {n_batches}...")
        ids, sentiments, explanations = get_gentrification_scores_categorical_one_per_lsoa(
            batch_tweets, prompt, client, batch_index=i, max_tokens=15000)

        # Update the DataFrame with the predictions
        df.loc[ids, 'gentrification_prediction'] = sentiments
        df.loc[ids, 'explanation'] = explanations

    print("Finished querying LLM. Now saving file")

    # Find the first unused "<base_filename>_NN.csv" name so that earlier
    # results are never overwritten
    counter = 1
    while True:
        filename = f"{base_filename}_{counter:02d}.csv"
        filepath = os.path.join(base_dir, filename)
        if not os.path.exists(filepath):
            break
        counter += 1
    df.to_csv(filepath)
    print(f"File saved as: {filepath}")

Have already loaded a gentrification file, will not re-run the LLM.
Running LLM
Will query the LM for 309 items
Submitting batch 1 of 309...
Submitting batch 2 of 309...
Submitting batch 3 of 309...
Submitting batch 4 of 309...
Submitting batch 5 of 309...
Submitting batch 6 of 309...
Submitting batch 7 of 309...
Submitting batch 8 of 309...
Submitting batch 9 of 309...
Submitting batch 10 of 309...
Submitting batch 11 of 309...
Submitting batch 12 of 309...
Submitting batch 13 of 309...
Submitting batch 14 of 309...
Submitting batch 15 of 309...
Submitting batch 16 of 309...
Submitting batch 17 of 309...
Submitting batch 18 of 309...
Submitting batch 19 of 309...
Submitting batch 20 of 309...
Submitting batch 21 of 309...
Submitting batch 22 of 309...
Submitting batch 23 of 309...
Submitting batch 24 of 309...
Submitting batch 25 of 309...
Submitting batch 26 of 309...
Submitting batch 27 of 309...
Submitting batch 28 of 309...
Submitting batch 29 of 309...
Submitting batch 30 of 309.