# (Try to) Extract Gentrification-Relevant Information from public data using a Large Language Model

_Adapted version of the script to run a gentrification prediction for some Twitter data that Lex is analysing for an AGILE conference submission_

## Libraries

In [26]:
import os
import re

import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx
from shapely.geometry import Point
from datetime import datetime

from sklearn.metrics import cohen_kappa_score
from statsmodels.stats.inter_rater import fleiss_kappa


#from kaggle.api.kaggle_api_extended import KaggleApi  # pip install kaggle
from together import Together  # pip install together


# Pandas display settings that make wide DataFrames easier to read while debugging:
#   - display.width: use a wider output before wrapping
#   - display.max_colwidth: None means column contents are never truncated
#   - display.max_columns: None means every column is shown
_display_options = {
    'display.width': 1000,
    'display.max_colwidth': None,
    'display.max_columns': None,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

# Log file named after the time at which this cell was run
LOG_FILE = os.path.join("logs", datetime.now().strftime("%Y-%m-%d-%H%M%S.log"))


def log(msg):
    """
    Append a message to the log file.

    The message is written exactly as given (no newline is added), so callers
    are responsible for their own line endings.

    Parameters
    ----------
    msg : str
        The text to append to ``LOG_FILE``.
    """
    # Create the log directory on first use; without this, opening the file
    # raises FileNotFoundError when the 'logs' directory does not exist yet.
    log_dir = os.path.dirname(LOG_FILE)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # Tweets contain emoji and other non-ASCII text, so write UTF-8 explicitly
    # rather than relying on the platform's default encoding.
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(msg)



## Function to get gentrification scores from tweets / etc

In [27]:
# Default system prompt used in the Twitter part.
# It is the default value of the `system_prompt` argument of `get_gentrification_scores`,
# which appends the numbered tweets to it and sends the result to the model.
# The prompt asks for one '<number>. <score>' line per tweet, with scores from 1 to 5.
# NOTE: the text is sent to the model as-is, so editing the wording (even to tidy it up)
# changes what the model sees and may change the scores it returns.
_system_prompt = """
You have a deep understanding of neighbourhood character and how it is experienced discussed in public discourse.
I will provide you with some Twitter / X posts ("tweets").
Your task is to analyse each one text and determine the extent to which each Tweet suggests that the neighbourhood is experiencing change.
Specifically:
Read the posts closely and identify any words, phrases, or implications that might indicate signs of neighbourhood change, changing demographics, or neighbourhood ‘revitalisation’.
Consider both explicit and implicit cues. Explicit cues directly mention new businesses or rising prices, while implicit cues might reflect subtle neighbourhood changes.
Assign a score from 1 to 5, where 1 means not suggestive of change and 5 means highly suggestive.
Do not explain any reasoning.
Provide your answer strictly in the format ‘1. Score’, ‘2. Score’, ‘3. Score’, etc., without any additional explanation or commentary.
"""

In [28]:
# Matches one line of the model's reply, e.g. "3. 4", "3: 4" or "3. Score: 4 because ...".
# Compiled once here rather than on every call.
_SCORE_LINE_PATTERN = re.compile(r'''
        ^\s*              # Start of line, allow leading whitespace
        (\d+)             # Capture Group 1: The line number
        \s*[.:]\s*        # A dot or colon with optional whitespace
        (?:Score:\s+)?    # Optionally match "Score:" followed by one or more spaces
        (\d+)             # Capture Group 2: The score (one or more digits)
        (?:\s*(.*))?      # Optional text after the score (Group 3)
        $                 # End of the line
    ''', re.VERBOSE)


def _parse_score_reply(assistant_reply, n_tweets):
    """
    Parse the model's reply into tweet numbers and scores.

    Parameters
    ----------
    assistant_reply : str
        The raw text returned by the model.
    n_tweets : int
        The number of tweets that were sent in the batch.

    Returns
    -------
    ids : list of int
        1-based tweet numbers (positions within the batch) that were parsed successfully.
    scores : list of int
        The score (1 to 5) for each entry in ``ids``.
    error_count : int
        The number of lines that could not be parsed or contained invalid values.
    """
    # Despite being told not to, sometimes the reply starts with 'Here are the scores:'
    # or 'Here are the analyses:'. Remove it, and any whitespace at the start or end.
    assistant_reply = re.sub(r'^\s*Here are the scores:\s*', '', assistant_reply).strip()
    assistant_reply = re.sub(r'^\s*Here are the analyses:\s*', '', assistant_reply).strip()

    ids = []
    scores = []
    error_count = 0

    # Analyse the reply line-by-line
    for line_no, raw_line in enumerate(assistant_reply.split('\n')):
        # Ignore lines that are empty once they have been stripped
        line = raw_line.strip()
        if not line:
            continue

        match = _SCORE_LINE_PATTERN.match(line)
        if match is None:
            # Anything after an unparseable line cannot be trusted, so stop here
            msg = f"\n*********************\n" \
                  f"Warning: Invalid format on line {line_no}: '{line}'.\n" \
                  f"The full response was: \n{assistant_reply}\n" \
                  f"*********************\n"
            print(msg)
            log(msg)
            error_count += 1
            break

        index = int(match.group(1))
        score = int(match.group(2))
        log(f"{line_no} {line}\n\t{index},{score}\n")

        # Check the tweet number BEFORE storing anything: a number beyond the end of
        # the batch would otherwise be mapped onto a row that belongs to the next batch.
        if index > n_tweets:
            msg = f"Line {line_no} refers to tweet {index}, but the batch only has {n_tweets} tweets. " \
                  f"Assuming remaining lines are junk and ignoring them."
            print(msg)
            log(msg + "\n")
            break

        # A tweet number below 1 would map onto a row before the start of the batch
        if index < 1:
            msg = f"Warning: Tweet number {index} out of range on line {line_no}: '{line}'"
            print(msg)
            log(msg + "\n")
            error_count += 1
            continue

        # Validate the score range
        if 1 <= score <= 5:
            ids.append(index)
            scores.append(score)
        else:
            msg = f"Warning: Score {score} out of range on line {line_no}: '{line}'"
            print(msg)
            log(msg + "\n")
            error_count += 1

    return ids, scores, error_count


def get_gentrification_scores(batch_tweets, batch_index=0, system_prompt=_system_prompt, log_file=LOG_FILE, max_tokens=200):
    """
    Retrieves gentrification scores for a batch of tweets using the Together AI API.

    The tweets are numbered from 1, appended to the system prompt and sent in a
    single request through the module-level ``client``. The reply is expected to
    contain one '<tweet number>. <score>' line per tweet.

    Parameters
    ----------
    batch_tweets : pandas.DataFrame
        A DataFrame containing the tweets for the current batch.
        It must include a 'text' column with the tweet content.
    batch_index : int
        An optional starting index of the current batch.
        This is used to align the predicted scores with the original DataFrame indices.
    system_prompt : str
        An optional system prompt to be sent to the Together AI API.
        The tweet texts will be appended to this prompt.
    log_file : str
        Currently unused by this function; log messages are written through the
        module-level ``log`` helper. Kept so that existing callers keep working.
    max_tokens : int
        The maximum length of the model's output.

    Returns
    -------
    df_ids : list of int
        A list of DataFrame indices, one per scored tweet
        (``batch_index`` + position of the tweet within the batch).
    scores : list of int
        The predicted score for each entry in ``df_ids``, from 1 (not suggestive of
        change) to 5 (highly suggestive). If any line of the reply could not be
        parsed, every tweet in the batch is given a score of -1 instead.
    """
    n_tweets = len(batch_tweets)
    if n_tweets == 0:
        # Nothing to score, so do not spend an API call on an empty batch
        return [], []

    # Prepare the numbered list of tweets (numbering starts at 1)
    tweet_list = "\n".join(f"{position + 1}. {tweet}"
                           for position, tweet in enumerate(batch_tweets.text.values))

    # The full prompt is the instructions followed by the tweets
    full_prompt = f"{system_prompt}\n\n{tweet_list}"
    messages = [
        {
            "role": "system",
            "content": full_prompt
        }
    ]

    # Call the API using parameters that ChatGPT recommends for this task
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
        messages=messages,
        max_tokens=max_tokens,  # max length of output (in case I get the prompt wrong and it talks for ages...)
        temperature=0.2,  # lower for more deterministic
        top_p=0.9,  # only consider tokens within a cumulative probability of 90%
        top_k=40,  # only consider the 40 most probable next tokens
        repetition_penalty=1,
        stop=["<|eot_id|>", "<|eom_id|>"],
        stream=False  # Set stream to False to get the full response
    )

    # Extract the assistant's reply
    assistant_reply = response.choices[0].message.content.strip()

    # Useful to have a full log for debugging etc
    log(f"{datetime.now().strftime('%Y-%m-%d-%H%M%S')}\n"
        f"**MESSAGE**\n{messages}\n"
        f"**RESPONSE**\n{assistant_reply}\n\n")

    # Parse the tweet numbers and scores out of the reply
    ids, scores, error_count = _parse_score_reply(assistant_reply, n_tweets)

    if error_count > 0:
        # There was an error, so do not trust any part of the reply: set all scores
        # to -1 and assume tweet numbers from 1 to len(batch_tweets).
        scores = [-1] * n_tweets
        ids = list(range(1, n_tweets + 1))

    # Convert 1-based tweet numbers within the batch into DataFrame indices
    df_ids = [batch_index + tweet_number - 1 for tweet_number in ids]

    # Internal consistency check: ids and scores are always built together
    assert len(df_ids) == len(scores), f"Length of ids ({len(df_ids)}) does not match length of scores ({len(scores)})."
    return df_ids, scores


## Open Twitter data

We currently use a historical archive of tweets that the researchers downloaded using various APIs.

In [29]:
# Location of the gzipped CSV of tweets, relative to this notebook
tweets_path = os.path.join("..", "data", "uk_tweets", "tweets_to_nick.csv.gz")

# Fields are quoted with double quotes; spaces directly after a delimiter are skipped
tweets_df = pd.read_csv(tweets_path, quotechar='"', skipinitialspace=True)

In [30]:
# Print the column names, then show the DataFrame itself as the cell output
print(tweets_df.columns)
tweets_df

Index(['ID', 'tweet_id', 'text', 'lsoa', 'lon', 'lat', 'year', 'month', 'day'], dtype='object')


Unnamed: 0,ID,tweet_id,text,lsoa,lon,lat,year,month,day
0,1,549920567307292673,If I dyed it black nimmy would actually murder me,E01010579,-1.838765,53.858337,2014,12,30
1,2,549919548229832705,So I miss my black hair already,E01010579,-1.838363,53.858491,2014,12,30
2,3,549748079256412160,@TomLeach95 sort of is good enough for me,E01010579,-1.838475,53.858450,2014,12,30
3,4,549746428806189056,@TomLeach95 found him guys don't panic,E01010579,-1.838604,53.858397,2014,12,30
4,5,549745180736516096,Where's my Keighley boy at,E01010579,-1.838533,53.858424,2014,12,30
...,...,...,...,...,...,...,...,...,...
92406,92407,1102277364875124736,How perfect for a casino themed night? Mega QL playing cards with arch for a student ball at Aria Suites. #leedsballoons #casinoballoons #studentnight #playingcards… https://t.co/7LCNavxarn,E01011668,-1.556857,53.814544,2019,3,3
92407,92408,1100050610949443584,Our Butterfly Moments board to encourage meaningful activities and engagement with our residents #happyliving #wellbeing #feelinggood https://t.co/8NQbj5sx2t,E01011668,,,2019,2,25
92408,92409,1092163960961404929,“Mummy I feel like Cinderella leaving the ball at midnight!”.......ooops! ❤️❤️#allinagoodcause #sikhsoldierstatue @ Aria Suite Leeds https://t.co/3i6Jtj3A7s,E01011668,-1.556857,53.814544,2019,2,3
92409,92410,1089199016443871232,A Dj always needs a dope host so big up to my broski and comedian Icy Jones aka @Icy_Jones_Comic hosting as I Dj at our friends Pedro and his wife Angel wedding in Leeds\r\n...\r\nSUBSCRIBE TO… https://t.co/lnRsnH2M0E,E01011668,-1.556857,53.814544,2019,1,26


## Use the Together.AI API to batch classify the Tweets as how likely they are to be related to gentrification.

A function that takes a batch of tweets and uses the Together API to classify them. The `_system_prompt` variable contains the prompt that will be sent to the API; individual tweets are appended to this.

The LLM requires some parameter values. Here are the settings that ChatGPT recommended (the defaults, which I decided not to use, are shown in brackets):

1.	max_tokens:
	-	Recommendation: 50 (200)
	-	Explanation: Since you’re expecting short responses like '1. Score', a small max_tokens value ensures concise outputs without unnecessary verbosity.
	-    _(I actually chose a larger value because the prompt has been designed to get the LLM to stop long before max_tokens is reached, and this way it is hopefully less likely to break if we increase the batch size.)_
2.	temperature:
	-	Recommendation: 0.2 (0.7)
	-	Explanation: A lower temperature makes the model’s output more deterministic, which is ideal for scoring tasks where consistency is important.
3.	top_p:
	-	Recommendation: 0.9 (0.7)
	-	Explanation: This value balances the randomness and coherence of the output by considering tokens with a cumulative probability up to 90%.
4.	top_k:
	-	Recommendation: 40 (50)
	-	Explanation: Limits the model to consider the top 40 probable next tokens, which helps in generating relevant responses.
5.	repetition_penalty:
	-	Recommendation: 1.1 (1)
	-	Explanation: Slightly penalizes repeated tokens to prevent the model from producing redundant information.

**Note**: The cell below needs updating now that the `get_gentrification_scores` function returns the prediction as well as its score (used later). This means the Twitter LLM prompt may need updating too (or maybe not, if the regex that parses the LLM output is flexible enough), but I can't be bothered to go back and fix this as this work is redundant now anyway (moving on from this Twitter test).

In [31]:
# Get the API key from a file (the key is expected on the first line)
with open('together.ai_key.txt', 'r') as f:
    api_key = f.readline().strip()

# Client used by `get_gentrification_scores` to call the Together AI API
client = Together(api_key=api_key)

# List of tweets to classify (can sample if I want to)
#df = tweets_df.sample(200).copy()
df = tweets_df.copy()

print(f"Will query the LM for {len(df)} tweets")

# Ensure the index is consecutive and ascending, so that the row labels match
# the positions that `get_gentrification_scores` computes from `batch_index`
df = df.reset_index(drop=True)
# To store the results
df['gentrification_prediction'] = None

# Batch processing
batch_size = 20
# Round up so that a final, partially-filled batch is included in the total
n_batches = (len(df) + batch_size - 1) // batch_size
for batch_number, i in enumerate(range(0, len(df), batch_size), start=1):
    # Get the batch of tweets (.loc slicing is inclusive, hence the -1)
    batch_tweets = df.loc[i:i + batch_size - 1, :]

    # Report progress using the nominal batch size: the last batch may be shorter,
    # so its own length must not be used to work out the batch number or total
    print(f"Submitting batch {batch_number} of {n_batches}...")

    # Get the scores using the function
    ids, sentiments = get_gentrification_scores(batch_tweets, batch_index=i)

    # Update the DataFrame with the predictions
    df.loc[ids, 'gentrification_prediction'] = sentiments

print("Finished")

Will query the LM for 92411 tweets
Submitting batch 1 of 4620...
Submitting batch 2 of 4620...
Submitting batch 3 of 4620...
Submitting batch 4 of 4620...
Submitting batch 5 of 4620...
Submitting batch 6 of 4620...
Submitting batch 7 of 4620...
Submitting batch 8 of 4620...
Submitting batch 9 of 4620...
Submitting batch 10 of 4620...
Submitting batch 11 of 4620...
Submitting batch 12 of 4620...
Submitting batch 13 of 4620...
Submitting batch 14 of 4620...
Submitting batch 15 of 4620...
Submitting batch 16 of 4620...
Submitting batch 17 of 4620...
Submitting batch 18 of 4620...
Submitting batch 19 of 4620...
Submitting batch 20 of 4620...
Submitting batch 21 of 4620...
Submitting batch 22 of 4620...
Submitting batch 23 of 4620...
Submitting batch 24 of 4620...
Submitting batch 25 of 4620...
Submitting batch 26 of 4620...
Submitting batch 27 of 4620...
Submitting batch 28 of 4620...
Submitting batch 29 of 4620...
Submitting batch 30 of 4620...
Submitting batch 31 of 4620...
Submitting ba

See how well that worked

In [32]:
# Show each tweet's text next to the score it was given
df[['text', 'gentrification_prediction']]

Unnamed: 0,text,gentrification_prediction
0,If I dyed it black nimmy would actually murder me,1
1,So I miss my black hair already,1
2,@TomLeach95 sort of is good enough for me,1
3,@TomLeach95 found him guys don't panic,1
4,Where's my Keighley boy at,1
...,...,...
92406,How perfect for a casino themed night? Mega QL playing cards with arch for a student ball at Aria Suites. #leedsballoons #casinoballoons #studentnight #playingcards… https://t.co/7LCNavxarn,2
92407,Our Butterfly Moments board to encourage meaningful activities and engagement with our residents #happyliving #wellbeing #feelinggood https://t.co/8NQbj5sx2t,1
92408,“Mummy I feel like Cinderella leaving the ball at midnight!”.......ooops! ❤️❤️#allinagoodcause #sikhsoldierstatue @ Aria Suite Leeds https://t.co/3i6Jtj3A7s,2
92409,A Dj always needs a dope host so big up to my broski and comedian Icy Jones aka @Icy_Jones_Comic hosting as I Dj at our friends Pedro and his wife Angel wedding in Leeds\r\n...\r\nSUBSCRIBE TO… https://t.co/lnRsnH2M0E,2


In [33]:
# Save the tweets together with their predicted scores to a CSV file in the current directory
df.to_csv("./lex_tweets_genrification.csv")

In [35]:
# Count how many tweets received each score
df['gentrification_prediction'].value_counts()

gentrification_prediction
1    82052
2     5909
3     2052
4     2036
5      362
Name: count, dtype: int64