## Classification of emotions in the Go emotions dataset


In [1]:
#imports
import pandas as pd
import numpy  as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
import os

from langchain_ibm import WatsonxLLM
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams

In [2]:
from typing import Literal, Any
from copy import deepcopy

from typing_extensions import TypedDict

import matplotlib.pyplot as plt
import numpy as np
from pydantic import BaseModel, Field
from IPython.display import Image, display
from tqdm import tqdm

from langchain.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain.prompts import PromptTemplate
from langchain_ibm import WatsonxEmbeddings
from langchain_ibm import WatsonxLLM
from langgraph.graph import START, StateGraph
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams


In [3]:
import litellm
from litellm import completion

import instructor
from instructor import Mode

Downloading the dataset

In [4]:
# Create a directory for the data
data_dir = Path('data')
data_dir.mkdir(parents=True, exist_ok=True)


In [5]:


#URLs for the Go emotions dataset files
urls = {
    'goemotions_1': 'https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv',
    'goemotions_2': 'https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv',
    'goemotions_3': 'https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_3.csv'
}

# Download each file
for name, url in urls.items():
    target_path = data_dir / f"{name}.csv"
    
    if not target_path.exists():
        print(f"Downloading {name} dataset...")
        response = requests.get(url)
        
        if response.status_code == 200:
            with open(target_path, 'wb') as f:
                f.write(response.content)
            print(f"Successfully downloaded {name} dataset to {target_path}")
        else:
            print(f"Failed to download {name} dataset. Status code: {response.status_code}")
    else:
        print(f"{name} dataset already exists at {target_path}")

Downloading goemotions_1 dataset...


NameError: name 'requests' is not defined

loading the data and combining the datasets

In [6]:
# Load the goemotions_1.csv file into a pandas DataFrame
goemotions_1_df = pd.read_csv("data/goemotions1.csv")
goemotions_2_df = pd.read_csv("data/goemotions2.csv")
goemotions_3_df = pd.read_csv("data/goemotions3.csv")


# Combine the datasets into a single DataFrame
combined_df = pd.concat([goemotions_1_df, goemotions_2_df, goemotions_3_df], ignore_index=True)



data preprocessing and cleaning

In [7]:
# Print a nicely formatted list of all features in combined_df
print("Features in combined_df:")
for feature in combined_df.columns:
    print(f"- {feature}")

Features in combined_df:
- text
- id
- author
- subreddit
- link_id
- parent_id
- created_utc
- rater_id
- example_very_unclear
- admiration
- amusement
- anger
- annoyance
- approval
- caring
- confusion
- curiosity
- desire
- disappointment
- disapproval
- disgust
- embarrassment
- excitement
- fear
- gratitude
- grief
- joy
- love
- nervousness
- optimism
- pride
- realization
- relief
- remorse
- sadness
- surprise
- neutral


Removal of features with no predictive value

In [8]:
# Remove the specified features from combined_df
features_to_remove = ['author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear']
combined_df = combined_df.drop(columns=features_to_remove)

#remove duplicates
#full obeservation duplicates  
combined_df = combined_df.drop_duplicates()
#text duplicates
combined_df = combined_df.drop_duplicates(subset='text')


Checking new Dataframe 

In [9]:
#cheking to see if the features are removed or not
print("Features in combined_df:")
for feature in combined_df.columns:
    print(f"- {feature}")

#see shape of combined_df
print("Shape of combined_df:", combined_df.shape)

Features in combined_df:
- text
- id
- admiration
- amusement
- anger
- annoyance
- approval
- caring
- confusion
- curiosity
- desire
- disappointment
- disapproval
- disgust
- embarrassment
- excitement
- fear
- gratitude
- grief
- joy
- love
- nervousness
- optimism
- pride
- realization
- relief
- remorse
- sadness
- surprise
- neutral
Shape of combined_df: (57732, 30)


Splitting the dataset into train test & validation

In [10]:

# Split the data into train (80%) and temp (20%)
train_df, temp_df = train_test_split(combined_df, test_size=0.2, random_state=42)

# Split the temp data into validation (50% of temp) and test (50% of temp)
validation_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Print the sizes of the splits
print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(validation_df)}")
print(f"Test size: {len(test_df)}")

Train size: 46185
Validation size: 5773
Test size: 5774


## building the models

Connecting to WatsonX

In [11]:
WX_API_KEY = os.getenv("WX_API_KEY")
WX_PROJECT_ID_RAG = os.getenv("WX_PROJECT_ID_RAG")
WX_URL = os.getenv("WX_URL")



In [12]:
llm = WatsonxLLM(

        model_id="mistralai/mistral-large",
        url=WX_URL,
        apikey=WX_API_KEY,
        project_id=WX_PROJECT_ID_RAG,
        username="makr21ag@student.cbs.dk",

        params={
            GenParams.DECODING_METHOD: "greedy",
            GenParams.TEMPERATURE: 0,
            GenParams.MIN_NEW_TOKENS: 5,
            GenParams.MAX_NEW_TOKENS: 1_000,
            GenParams.REPETITION_PENALTY: 1.2
        }

)

Testing the connection

In [13]:
for chunk in llm.stream(
    "Describe your favorite breed of dog and why it is your favorite."
):
    print(chunk, end="")



I love dogs, but I don’t have a particular favourite breed because they are all so different in their own way! However, if you were to ask me what my dream pet would be (if money was no object), then I think that the answer might surprise some people: an Alaskan Malamute or Siberian Husky puppy – preferably one with blue eyes like those seen on Game Of Thrones 😉 These two types of huskies look very similar at first glance; however there are subtle differences between them which make each unique from another type such as size/weight etcetera… But both share many traits including being friendly towards humans while also having strong pack instincts making these animals great companions for families who want something more than just “man’s best friend”. They require lots exercise though due to high energy levels so owners need plenty space outdoors where possible too keep fit themselves during walks together every day otherwise could become destructive indoors instead!

We're saving our training split as a csv-file

In [14]:
# Save the training split into a CSV file in the data folder
train_file_path = data_dir / "train_split.csv"
train_df.to_csv(train_file_path, index=False)
print(f"Training split saved to {train_file_path}")

Training split saved to data/train_split.csv


In [15]:
from langchain.document_loaders import CSVLoader

# point at your CSV file; it will emit one Document per row,
# with page_content = the concatenated row and metadata = the rest of the columns
loader = CSVLoader(
    file_path=data_dir / "train_split.csv",
    encoding="utf-8",
    csv_args={"delimiter": ","},
)
docs = loader.load()

print(docs[0].page_content)
# each doc.page_content is one row’s text, and doc.metadata contains the 28 emotion flags


text: lol. good one dude. you really showed me. great argument. really showing off your intellectual prowess. keep up the good work!
id: eevpt1e
admiration: 0
amusement: 1
anger: 0
annoyance: 0
approval: 0
caring: 0
confusion: 0
curiosity: 0
desire: 0
disappointment: 0
disapproval: 0
disgust: 0
embarrassment: 0
excitement: 0
fear: 0
gratitude: 0
grief: 0
joy: 0
love: 0
nervousness: 0
optimism: 0
pride: 0
realization: 0
relief: 0
remorse: 0
sadness: 0
surprise: 0
neutral: 0
