# LLM Experiments

Early experiments with LLMs

## Libraries

In [1]:
import os
import pandas as pd

from kaggle.api.kaggle_api_extended import KaggleApi
from together import Together


## Get test data

Download the Kaggle Sentiment140 dataset (a load of tweets with sentiment; useful for experimenting).

In [2]:
# Kaggle identifier of the dataset to fetch
dataset = "kazanova/sentiment140"
# Relative path of the folder that holds the extracted dataset files
data_dir = "../data/tweet_data"

# Function to download dataset if not already downloaded
def download_kaggle_dataset(dataset, data_dir):
    """Download and unzip a Kaggle dataset into ``data_dir`` unless it is already there.

    The dataset counts as present when ``data_dir`` contains at least one
    non-hidden entry. Hidden entries (names starting with ``.``, for example
    ``.gitkeep`` or ``.ipynb_checkpoints``) are ignored so that a placeholder
    file does not suppress the download.

    Args:
        dataset: Kaggle dataset identifier handed to
            ``KaggleApi.dataset_download_files`` (e.g. ``'kazanova/sentiment140'``).
        data_dir: Directory that receives the extracted files; it is created
            (including parents) when it does not exist yet.

    Returns:
        None. A one-line status message is printed in both cases.
    """
    # exist_ok=True makes the separate os.path.exists() check unnecessary
    os.makedirs(data_dir, exist_ok=True)

    # Only visible entries are evidence that a dataset was extracted here
    dataset_files = [entry for entry in os.listdir(data_dir) if not entry.startswith('.')]
    if dataset_files:
        print("Dataset already exists in the directory.")
        return

    api = KaggleApi()
    api.authenticate()
    api.dataset_download_files(dataset, path=data_dir, unzip=True)
    print("Dataset downloaded and extracted.")

# Fetch the dataset files into data_dir (the helper skips the download when files are present)
download_kaggle_dataset(dataset, data_dir)

# Location of the CSV inside the data directory
csv_path = os.path.join(data_dir, 'training.1600000.processed.noemoticon.csv')

# header=None below means no header row is read, so names and dtypes are given explicitly
column_names = ["polarity", "id", "date", "query", "user", "text"]
column_dtypes = {
    "polarity": int,
    "id": int,
    "date": str,
    "query": str,
    "user": str,
    "text": str,
}

tweets_df = pd.read_csv(
    csv_path,
    header=None,
    names=column_names,
    dtype=column_dtypes,
    encoding='latin1',
)

tweets_df

Dataset already exists in the directory.


Unnamed: 0,polarity,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


## Example code to access the Together.AI API

This loops over the tweets individually and asks a Llama model to classify each one.

In [13]:
# Import the Together client
from together import Together

# Read the API key from the first line of a local text file
with open('together.ai_key.txt', 'r') as key_file:
    api_key = key_file.readline().strip()

client = Together(api_key=api_key)

# Example tweets to classify
tweets = [
    "Played pub for the first time in a long time and loved it. Will post clips tonight!",
    "I hate when it rains all day.",
    "It's an average day, nothing special happening.",
    # Add more tweets as needed
]

# Instruction sent unchanged with every request
system_message = {
    "role": "system",
    "content": "Classify the sentiment (positive, negative, or neutral) of the following text. Reply with one word."
}

# Request settings that are identical for every tweet
request_options = dict(
    model="meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
    # model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    max_tokens=5,
    temperature=0.7,
    top_p=0.7,
    top_k=50,
    repetition_penalty=1,
    stop=["<|eot_id|>", "<|eom_id|>"],
    truncate=130560,
    stream=False,  # Non-streaming: the full response comes back in one object
)

# One API call per tweet
for tweet in tweets:
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": tweet}],
    }
    response = client.chat.completions.create(
        messages=[system_message, user_message],
        **request_options,
    )

    # The reply text is taken from the first returned choice
    sentiment = response.choices[0].message.content.strip()

    # Tweet, its label, then a blank separator line
    print(f"Tweet: {tweet}", f"Sentiment: {sentiment}", "", sep="\n")

Tweet: Played pub for the first time in a long time and loved it. Will post clips tonight!
Sentiment: Positive

Tweet: I hate when it rains all day.
Sentiment: Negative

Tweet: It's an average day, nothing special happening.
Sentiment: Neutral



This version sends all the tweets to the model in a single batched request (not _tested_)

In [15]:
# Imports used by this cell (kept together at the top rather than mid-cell)
import re

from together import Together

# Get the API key from a file (first line, surrounding whitespace removed)
with open('together.ai_key.txt', 'r') as f:
    api_key = f.readline().strip()

client = Together(api_key=api_key)

# List of tweets to classify
tweets = [
    "Played pub for the first time in a long time and loved it. Will post clips tonight!",
    "I hate when it rains all day.",
    "It's an average day, nothing special happening.",
    # Add more tweets as needed
]

# Construct the prompt by numbering each tweet (1-based, one tweet per line)
tweet_list = "\n".join([f"{idx+1}. {tweet}" for idx, tweet in enumerate(tweets)])

# Create the system prompt: instructions followed by the numbered tweets
# NOTE(review): the tweets travel inside the system message and no user message
# is sent; confirm this is acceptable for every model you plan to try.
system_prompt = (
    "Classify the sentiment (positive, negative, or neutral) of each of the following texts. "
    "Provide your answer in the format '1. Sentiment', '2. Sentiment', etc.\n\n"
    f"{tweet_list}"
)

# Prepare the messages
messages = [
    {
        "role": "system",
        "content": system_prompt
    }
]

# Size the reply budget to the batch so a longer tweet list is not cut off
# mid-answer (which would leave the tail reported as "Not Classified").
# The per-tweet and overhead figures are rough estimates; 150 stays the floor,
# so small batches behave exactly as before.
TOKENS_PER_TWEET = 8
REPLY_OVERHEAD_TOKENS = 32
max_reply_tokens = max(150, REPLY_OVERHEAD_TOKENS + TOKENS_PER_TWEET * len(tweets))

# Call the API once for the whole batch
response = client.chat.completions.create(
    model="meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
    messages=messages,
    max_tokens=max_reply_tokens,
    temperature=0.7,
    top_p=0.7,
    top_k=50,
    repetition_penalty=1,
    stop=["<|eot_id|>", "<|eom_id|>"],
    truncate=130560,
    stream=False  # Set stream to False to get the full response
)

# Extract the assistant's reply
assistant_reply = response.choices[0].message.content.strip()

# Print the assistant's reply
print("Assistant's Reply:")
print(assistant_reply)

# Parse the assistant's reply to map sentiments to tweets.
# Keys are 0-based tweet positions; values are capitalized labels.
sentiments = {}

# Each match is a (number, label) pair such as ("1", "Positive")
matches = re.findall(r"(\d+)\.\s*(Positive|Negative|Neutral)", assistant_reply, re.IGNORECASE)

for match in matches:
    idx, sentiment = match
    # The reply numbers tweets from 1; convert to the 0-based list position
    sentiments[int(idx)-1] = sentiment.capitalize()

# Print the results; tweets with no parsed label are shown as "Not Classified"
for idx, tweet in enumerate(tweets):
    sentiment = sentiments.get(idx, "Not Classified")
    print(f"Tweet: {tweet}")
    print(f"Sentiment: {sentiment}")
    print()

Assistant's Reply:
Here are the sentiment classifications:

1. Positive
2. Negative
3. Neutral
Tweet: Played pub for the first time in a long time and loved it. Will post clips tonight!
Sentiment: Positive

Tweet: I hate when it rains all day.
Sentiment: Negative

Tweet: It's an average day, nothing special happening.
Sentiment: Neutral

