# LLM Experiments

Early experiments with LLMs

## Libraries

In [1]:
import os
import re

import numpy as np
import pandas as pd

from kaggle.api.kaggle_api_extended import KaggleApi
from together import Together

# Display settings that make wide DataFrames easier to read while debugging.
display_options = {
    'display.width': 1000,         # wider output before rows wrap
    'display.max_colwidth': None,  # never truncate the contents of a column
    'display.max_columns': None,   # show every column instead of eliding some
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

## Get test data

Download the [Kaggle Sentiment140](https://www.kaggle.com/datasets/kazanova/sentiment140) dataset: a large collection of tweets, each labelled with a sentiment polarity, which makes it useful for experimenting. Polarity is 0 for negative, 2 for neutral and 4 for positive.

In [2]:
# Directory (relative to this notebook) that holds the downloaded dataset files
data_dir = '../data/tweet_data'
# Kaggle dataset identifier handed to the Kaggle API when downloading
dataset = 'kazanova/sentiment140'

def download_kaggle_dataset(dataset, data_dir):
    """Download and unzip a Kaggle dataset into ``data_dir`` unless it is already populated.

    Parameters
    ----------
    dataset : str
        Kaggle dataset identifier, e.g. ``'kazanova/sentiment140'``.
    data_dir : str
        Directory that should hold the extracted files; created if missing.

    Notes
    -----
    "Already downloaded" only means that ``data_dir`` contains at least one
    entry; the entries themselves are not inspected. The outcome is reported
    with ``print`` and nothing is returned.
    """
    # Make sure the target directory is there before listing it
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # Any existing entry counts as a previous download, so skip the API call
    if os.listdir(data_dir):
        print("Dataset already exists in the directory.")
        return

    kaggle_client = KaggleApi()
    kaggle_client.authenticate()
    kaggle_client.dataset_download_files(dataset, path=data_dir, unzip=True)
    print("Dataset downloaded and extracted.")

# Fetch the dataset (the download is skipped when the data directory is not empty)
download_kaggle_dataset(dataset, data_dir)

# Column names and dtypes are supplied explicitly because the CSV is read with header=None
tweets_csv_path = os.path.join(data_dir, 'training.1600000.processed.noemoticon.csv')
tweet_columns = ["polarity", "id", "date", "query", "user", "text"]
tweet_dtypes = {"polarity": int, "id": int, "date": str, "query": str, "user": str, "text": str}

tweets_df = pd.read_csv(
    tweets_csv_path,
    header=None,
    names=tweet_columns,
    dtype=tweet_dtypes,
    encoding='latin1',
    index_col=False,  # Ensure the df is given an ascending, consecutive index
)

tweets_df

Dataset already exists in the directory.


Unnamed: 0,polarity,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best feeling ever
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interviews! â« http://blip.fm/~8bmta
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me for details
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! Tupac Amaru Shakur


## Use the Together.AI API to batch-classify the tweets

In [3]:
# Read the Together.AI API key from the first line of a local text file
with open('together.ai_key.txt', 'r') as key_file:
    first_line = key_file.readline()
api_key = first_line.strip()  # drop the trailing newline / surrounding whitespace

# Client used for the chat-completion requests
client = Together(api_key=api_key)

# Tweets to classify (only a few for now). A fixed random_state makes the
# sample the same on every run, so results can be compared between runs.
df = tweets_df.sample(200, random_state=42).copy()
# Ensure the index is consecutive and ascending
df = df.reset_index(drop=True)
# Column for the results. It is created with object dtype because it will hold
# string labels; starting from a float NaN column would force pandas to upcast
# on assignment, which is deprecated. NaN marks tweets not classified yet.
df['sentiment'] = pd.Series(np.nan, index=df.index, dtype=object)

# Classify the tweets in batches: each request sends a numbered list of tweets,
# and the numbered answers in the reply are written back onto the matching rows.
batch_size = 20
# Ceiling division, so an exact multiple of batch_size does not count an extra batch
n_batches = (len(df) + batch_size - 1) // batch_size
# Matches answer lines such as "3. Positive" (case-insensitive)
sentiment_pattern = re.compile(r"(\d+)\.\s*(Positive|Negative|Neutral)", re.IGNORECASE)

for i in range(0, len(df), batch_size):
    batch_number = i // batch_size + 1

    # Get the tweets in this batch. Positional slicing excludes the end point;
    # label-based df.loc[i:i+batch_size] includes it and would send one tweet
    # too many, overlapping the next batch.
    batch_tweets = df.iloc[i:i + batch_size]
    tweet_list = "\n".join(f"{idx+1}. {tweet}"
                           for idx, tweet in enumerate(batch_tweets.text.values))
    print(tweet_list)

    # Create the system prompt
    system_prompt = (
        "Classify the sentiment (positive, negative, or neutral) of each of the following texts. "
        "Provide your answer in the format '1. Sentiment', '2. Sentiment', etc.\n\n"
        f"{tweet_list}"
    )

    # Prepare the messages
    messages = [
        {
            "role": "system",
            "content": system_prompt
        }
    ]

    print(f"Submitting batch {batch_number} of {n_batches}...")
    # Call the API (docs here: https://docs.together.ai/reference/chat-completions-1)
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
        messages=messages,
        max_tokens=300,
        temperature=0.7,
        top_p=0.7,
        top_k=50,
        repetition_penalty=1,
        stop=["<|eot_id|>", "<|eom_id|>"],
        truncate=130560,
        stream=False  # Set stream to False to get the full response
    )

    # Extract the assistant's reply
    assistant_reply = response.choices[0].message.content.strip()
    print("... reply received.")

    # Use the regular expression to extract (number, sentiment) pairs
    matches = sentiment_pattern.findall(assistant_reply)

    # Check the numbering: keep only answers whose number refers to a tweet in
    # this batch, and only the first answer given for each number. Anything
    # else would be written to a row outside the batch or to a missing label.
    batch_sentiments = {}
    for number, sentiment in matches:
        position = int(number) - 1
        if 0 <= position < len(batch_tweets):
            batch_sentiments.setdefault(position, sentiment.capitalize())

    if len(batch_sentiments) != len(batch_tweets):
        print(f"Warning: parsed {len(batch_sentiments)} sentiments for "
              f"{len(batch_tweets)} tweets in batch {batch_number}; "
              "tweets without an answer are left unclassified.")

    # Put those estimates back into the df, translating positions within the
    # batch into the batch's own row labels
    if batch_sentiments:
        ids = batch_tweets.index[list(batch_sentiments.keys())]
        df.loc[ids, 'sentiment'] = list(batch_sentiments.values())

1. Downtown mpls...oh its been a while! How you have changed 
2. Watching Notorious for the third time... I wish biggie was still around 
3. is wacthing alizee en concert 2004 
4. Finishing new media class at UCLA so this is an assignment... 
5. Is going underground so I'm unable to tweet, I will resume when I'm overground! 
6. Hello  I like twitter much of the time, but I also like that you don't like it.
7. Drinking a margarita &amp; watching the game...come on Lakers! PS...I'm S0 not ready to go home 
8. Buying a mothers day card at 2am the night before is a Baddd idea. Relegated to cards addressed to &quot;Tia&quot; and &quot;Ama&quot; 
9. I missed the call from my boo.  Sad face 
10. @kaboro  you know my and my love for all things soil
11. @amberpatrick i've been checking your tweets for updates often, my friend...you so seem to be a pinch tweetless...so not like you 
12. ouchhh my sunburn stings like a bitch! 
13. loving the summer feeling!!! still cant believe it 
14. @jaysonstr

In [4]:
# Display the sampled tweets, including the 'sentiment' column
df

Unnamed: 0,polarity,id,date,query,user,text,sentiment
0,0,1956260377,Thu May 28 21:30:18 PDT 2009,NO_QUERY,reddog187,Downtown mpls...oh its been a while! How you have changed,Neutral
1,0,2202639245,Tue Jun 16 22:08:47 PDT 2009,NO_QUERY,ddlovato,Watching Notorious for the third time... I wish biggie was still around,Positive
2,4,2189877215,Tue Jun 16 01:08:45 PDT 2009,NO_QUERY,alizeefans,is wacthing alizee en concert 2004,Neutral
3,4,2059431404,Sat Jun 06 16:55:21 PDT 2009,NO_QUERY,DiLaCa,Finishing new media class at UCLA so this is an assignment...,Neutral
4,4,2016121389,Wed Jun 03 06:12:23 PDT 2009,NO_QUERY,its_sb,"Is going underground so I'm unable to tweet, I will resume when I'm overground!",Neutral
...,...,...,...,...,...,...,...
195,0,2236722706,Fri Jun 19 04:23:20 PDT 2009,NO_QUERY,Evita_Galore,Don't have the Job at Beaty Shots,Negative
196,0,2203480910,Tue Jun 16 23:50:10 PDT 2009,NO_QUERY,nrgins,looking for excuses to not work. running out of them.....,Negative
197,4,2188266095,Mon Jun 15 21:35:12 PDT 2009,NO_QUERY,PICKCHICK,@hxcbunnee I vote for Grapes of Wrath! Have a great night! Pick Chick,Positive
198,0,1752156622,Sat May 09 21:11:03 PDT 2009,NO_QUERY,DataVonTeese,@stark23x I's have the HARDEST time finding a bathroom in Deafsville,Neutral


See how well that worked by comparing each estimated sentiment with the tweet's labelled polarity.

In [5]:
def check_sentiment_estimate(row):
    """Score one row: 1 if the estimated sentiment agrees with the polarity label, else 0.

    Parameters
    ----------
    row : object with ``polarity`` and ``sentiment`` attributes
        ``polarity`` is the numeric label (0, 2 or 4) and ``sentiment`` the
        estimated label string.

    Returns
    -------
    int
        1 for the pairs 0/"Negative", 2/"Neutral" and 4/"Positive";
        0 for every other combination.
    """
    label_for_polarity = {0: "Negative", 2: "Neutral", 4: "Positive"}
    expected_label = label_for_polarity.get(row.polarity)
    # An unknown polarity has no expected label and can never count as correct
    return int(expected_label is not None and row.sentiment == expected_label)

# Score every row: 1 where the estimate matches the polarity label, 0 otherwise
df['correct'] = df.apply(check_sentiment_estimate, axis=1)
accuracy = df['correct'].mean()
print("Accuracy:", accuracy)
# Show each tweet next to its label, estimate and score
df[['text', 'polarity', 'sentiment', 'correct']]


Accuracy: 0.57


Unnamed: 0,text,polarity,sentiment,correct
0,Downtown mpls...oh its been a while! How you have changed,0,Neutral,0
1,Watching Notorious for the third time... I wish biggie was still around,0,Positive,0
2,is wacthing alizee en concert 2004,4,Neutral,0
3,Finishing new media class at UCLA so this is an assignment...,4,Neutral,0
4,"Is going underground so I'm unable to tweet, I will resume when I'm overground!",4,Neutral,0
...,...,...,...,...
195,Don't have the Job at Beaty Shots,0,Negative,1
196,looking for excuses to not work. running out of them.....,0,Negative,1
197,@hxcbunnee I vote for Grapes of Wrath! Have a great night! Pick Chick,4,Positive,1
198,@stark23x I's have the HARDEST time finding a bathroom in Deafsville,0,Neutral,0
