# (Try to) Extract Gentrification-Relevant Tweets using a Large Language Model

Apply an LLM to a dataset of historic tweets to try to extract those that might be indicative of some kind of gentrification.

## Libraries

In [2]:
import os
import re

import numpy as np
import pandas as pd

#from kaggle.api.kaggle_api_extended import KaggleApi  # pip install kaggle
from together import Together  # pip install together

# Access to Hugging Face data
#conda install huggingface::datasets
#conda install huggingface::huggingface_hub
#from huggingface_hub import login
#from datasets import load_dataset


# Easier display options for debugging:
# widen the console output and lift the limits on column width and column
# count, so long tweet texts and every column are printed without truncation.
for _option, _value in (
    ('display.width', 1000),         # characters per output line
    ('display.max_colwidth', None),  # None = never truncate cell contents
    ('display.max_columns', None),   # None = show every column
):
    pd.set_option(_option, _value)

## Open test data

Currently uses a historic library of tweets that the researchers downloaded using various APIs.

In [26]:
# Load the sample of historic UK tweets into a DataFrame.
sample_csv_path = os.path.join("..", "data", "uk_tweets", "uk_tweets-2017-02-15.sample.csv")
tweets_df = pd.read_csv(
    sample_csv_path,
    quotechar='"',
    # Skip blanks that follow a delimiter (the raw header appears to have a
    # space after each comma, which would otherwise end up in the column names).
    skipinitialspace=True,
)

In [28]:
# List the parsed column names, then show the frame itself
# (a bare final expression is rendered by the notebook as a table).
print(tweets_df.columns)
tweets_df

Index(['id', 'user-id', 'user-screen_name', 'geo-coordinates-0', 'geo-coordinates-1', 'coordinates-coordinates-0', 'coordinates-coordinates-1', 'place-full_name', 'created_at', 'text', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'Second'], dtype='object')


Unnamed: 0,id,user-id,user-screen_name,geo-coordinates-0,geo-coordinates-1,coordinates-coordinates-0,coordinates-coordinates-1,place-full_name,created_at,text,Year,Month,Day,Hour,Minute,Second
0,772840590551678977,11800052,jasonmackenzie,,,,,Birmingham_ England,Mon Sep 05 16:55:30 +0000 2016,Jane_ you will bring excellent insights and valuable expertise to the @CIPR_UK council. Go for it! https://t.co/RXR555Frmj,2016,9,5,16,55,30
1,775996824461148160,81434063,didlix,,,,,Lewisham_ London,Wed Sep 14 09:57:14 +0000 2016,@jnnfrrss oh it is in certain ways. Makes wonderful cheese and onion seasoning for example. :),2016,9,14,9,57,14
2,723175917120442368,19284077,HenryGJeffreys,,,,,City of London_ London,Thu Apr 21 15:45:49 +0000 2016,@Freddygray31 @millsswift oh good_ cos I worry about him_,2016,4,21,15,45,49
3,697925095877177344,3927633898,luke_allana,,,,,Whitley Bay_ England,Thu Feb 11 23:28:04 +0000 2016,Hearing someone mention someone and just thinking 'is she still alive?',2016,2,11,23,28,4
4,728891147230298112,189255563,Pitchside_Mark,,,,,Tottenham_ London,Sat May 07 10:16:06 +0000 2016,Kirkland held some bloody licks,2016,5,7,10,16,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,700402173042098176,307405406,StephenMBrooks1,,,,,Hinckley_ England,Thu Feb 18 19:31:05 +0000 2016,Join me this time tomorrow night https://t.co/RVhCAt7LGQ for 2 hours of the best in #Motown_ soul &amp; blues #UKsopro https://t.co/LUoEh5qsfT,2016,2,18,19,31,5
9996,704421706845298689,260410615,Gweneirawyn,,,,,Wales_ United Kingdom,Mon Feb 29 21:43:17 +0000 2016,@pennybel Night night Val.Varied day Hospital am_Cleaning pm and line dancing tonight xx,2016,2,29,21,43,17
9997,815206457817911296,219659354,Irishbhoy1916,,,,,Kildare_ Ireland,Sat Dec 31 14:42:20 +0000 2016,@Coral @jirabob They will gain lots as well with jokes this funny :):),2016,12,31,14,42,20
9998,816050346464280576,3376804072,mikethomas501,,,,,Wakefield_ England,Mon Jan 02 22:35:38 +0000 2017,@ZaclangtonXXX not me,2017,1,2,22,35,38


## Use the Together.AI API to score batches of tweets by how likely they are to relate to gentrification

A function that takes a batch of tweets and uses the Together API to classify them. The `_system_prompt` variable contains the prompt that will be sent to the API; individual tweets are appended to this.

In [35]:
_system_prompt = """
Analyze the following tweets to determine their relevance to gentrification. 
Consider indicators such as mentions of urban development, demographic shifts, displacement concerns, socioeconomic changes, and cultural transformations. 
Assign a score from 1 to 5, where 1 means not suggestive of gentrification and 5 means highly suggestive.
Provide your answer in the format '1. Score', '2. Score', etc.
Reply with one line per tweet containing only the tweet number and its score (for example '1. 3'); do not include explanations or any other text.
"""
#Provide a brief explanation citing specific words or phrases from the tweet that support your classification."""

# One answer entry in the model's reply. Accepted shapes include '3. 4', '3) 4',
# '3. Score: 4', '**3.** 4' and 'Tweet 3: 4'. Entries are expected at the start
# of a line, or after a comma/semicolon when the model puts them all on one line.
_SCORE_ENTRY_RE = re.compile(
    r"""
    (?:^|[,;])            # start of a line, or a list separator
    [ \t>*_#-]*           # optional indentation / markdown decoration
    (?:tweet[ \t]*)?      # optional literal 'Tweet' label
    (\d+)                 # tweet number within the batch (1-based)
    [ \t]*[.):][ \t*_]*   # separator after the number
    (?:score[ \t*_]*[:=-]?[ \t*_]*)?   # optional literal 'Score' label
    ([1-5])               # the score itself
    (?![0-9]|\.[0-9])     # reject the first digit of '12' or of '3.5'
    """,
    re.IGNORECASE | re.MULTILINE | re.VERBOSE,
)


def _parse_scores(reply, n_tweets, batch_index=0):
    """Turn the model's reply into parallel lists of DataFrame indices and scores.

    Entries whose tweet number falls outside ``1..n_tweets`` are dropped, and
    only the first entry for each tweet number is kept, so a chatty or confused
    reply cannot assign scores to rows outside the current batch.
    """
    first_score_by_position = {}
    for number, score in _SCORE_ENTRY_RE.findall(reply):
        position = int(number)
        if 1 <= position <= n_tweets and position not in first_score_by_position:
            first_score_by_position[position] = int(score)

    ids = [batch_index + position - 1 for position in first_score_by_position]
    scores = list(first_score_by_position.values())
    return ids, scores


def get_gentrification_scores(batch_tweets, batch_index=0, system_prompt=_system_prompt):
    """
    Retrieves gentrification scores for a batch of tweets using the Together AI API.

    The API is called through the module-level ``client`` object, which must
    have been created before this function is called.

    Parameters
    ----------
    batch_tweets : pandas.DataFrame
        A DataFrame containing the tweets for the current batch.
        It must include a 'text' column with the tweet content.
    batch_index : int
        An optional starting index of the current batch.
        This is used to align the predicted scores with the original DataFrame indices.
    system_prompt : str
        An optional system prompt to be sent to the Together AI API.
        The numbered tweet texts will be appended to this prompt.

    Returns
    -------
    ids : list of int
        DataFrame indices (``batch_index`` + position in the batch) of the tweets
        for which a score could be parsed from the reply. Tweets the model
        skipped, or answered in an unrecognised format, are simply absent.
    scores : list of int
        The gentrification score for each entry of ``ids``, from
        1 (not suggestive of gentrification) to 5 (highly suggestive).
    """
    # Collapse internal whitespace so a tweet containing line breaks cannot be
    # mistaken for several numbered entries.
    tweets = [" ".join(str(tweet).split()) for tweet in batch_tweets["text"].tolist()]
    if not tweets:
        # Nothing to classify: avoid a pointless API call.
        return [], []

    # Prepare the numbered list of tweets (1-based, matching the answer format)
    tweet_list = "\n".join(f"{position}. {tweet}"
                           for position, tweet in enumerate(tweets, start=1))

    # Create the full prompt: instructions followed by the tweets
    full_prompt = f"{system_prompt}\n\n{tweet_list}"

    # Prepare the messages
    messages = [
        {
            "role": "system",
            "content": full_prompt
        }
    ]

    # Call the API
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
        messages=messages,
        max_tokens=300,
        temperature=0.7,
        top_p=0.7,
        top_k=50,
        repetition_penalty=1,
        stop=["<|eot_id|>", "<|eom_id|>"],
        truncate=130560,
        stream=False  # Set stream to False to get the full response
    )

    # Extract the assistant's reply (guarding against an empty message)
    assistant_reply = (response.choices[0].message.content or "").strip()

    # Pull out the '<tweet number>. <score>' entries, ignoring any surrounding
    # explanation, and map the tweet numbers back to DataFrame indices.
    ids, scores = _parse_scores(assistant_reply, len(tweets), batch_index)
    assert len(ids) == len(scores)

    return ids, scores



# Read the API key from a local file rather than hard-coding it in the notebook
with open('together.ai_key.txt', 'r') as f:
    api_key = f.readline().strip()

client = Together(api_key=api_key)

# Tweets to classify (only a few for now). The sample is seeded so that
# re-running the notebook scores the same tweets, and capped at the frame size
# so a very small input file does not raise.
df = tweets_df.sample(n=min(10, len(tweets_df)), random_state=42).copy()
# Ensure the index is consecutive and ascending: the batch loop below relies on
# index labels 0..len(df)-1 to map scores back to rows.
df = df.reset_index(drop=True)
# To store the results (stays None for tweets the model did not score)
df['gentrification_prediction'] = None

# Batch processing
batch_size = 5
n_batches = -(-len(df) // batch_size)  # ceiling division: count a short final batch too
for batch_number, i in enumerate(range(0, len(df), batch_size), start=1):
    # Get the batch of tweets (.loc slicing is inclusive at both ends)
    batch_tweets = df.loc[i:i + batch_size - 1, :]

    print(f"Submitting batch {batch_number} of {n_batches}...")

    # Get the scores using the function
    ids, scores = get_gentrification_scores(batch_tweets, batch_index=i)

    for idx, score in zip(ids, scores):
        print(f"\t{idx}: {score}")

    # Update the DataFrame with the predictions (skip if nothing could be parsed)
    if ids:
        df.loc[ids, 'gentrification_prediction'] = scores

print("Finished")

Submitting batch 1 of 2...
Submitting batch 2 of 2...


KeyboardInterrupt: 

Index(['id', ' user-id', ' user-screen_name', ' geo-coordinates-0', ' geo-coordinates-1', ' coordinates-coordinates-0', ' coordinates-coordinates-1', ' place-full_name', ' created_at', ' text', ' Year', ' Month', ' Day', ' Hour', ' Minute', ' Second', 'gentrification_prediction'], dtype='object')

See how well that worked

In [30]:
# Show each tweet next to the score it was given
df[['text', 'gentrification_prediction']]

Unnamed: 0,text,gentrification_prediction
0,@johngale11 is finishing 7_200 miles Sep 17th Tower Bridge @elisecdowning @mrwaynerussell @TimMcKenna5 @rob77771955 @OCEANSUPHEAD,1.0
1,@BigPieMan1 @TheGamesDead inbox me,1.0
2,@flynn_hilary @CanavanLouise @LakelandUK of course not 😊 this is mine 😘 https://t.co/r2VbwtaPgU,1.0
3,Now on BIRSt: @Allison_Blaes brings us her episode of The School Run! #birstsfinalweekend,1.0
4,_Call me mystic Mac because I predict deese tings!_ #NotRetired https://t.co/6msIpcwscE,1.0
5,@Samanthah444Sh playing arsenal,1.0
6,I was doing so well on this diet and now recently I've gone back to how I use to be,1.0
7,Such a good evening with my girl @FayeLeHuray...... Food_ wine and a catch up. __Newley single and slaying Ibiza 2016 😉,1.0
8,@Becca_Varley I love you too chummy😘💕,1.0
9,@BAUProfileQueen aw thanks xx Hope you have a good sleep xx,1.0
