## Classification of emotions in the Go emotions dataset


In [1]:
#imports
import pandas as pd
import numpy  as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
import os

from langchain_ibm import WatsonxLLM
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams

In [2]:
from typing import Literal, Any
from copy import deepcopy

from typing_extensions import TypedDict

import matplotlib.pyplot as plt
import numpy as np
from pydantic import BaseModel, Field
from IPython.display import Image, display
from tqdm import tqdm

from langchain.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain.prompts import PromptTemplate
from langchain_ibm import WatsonxEmbeddings
from langchain_ibm import WatsonxLLM
from langgraph.graph import START, StateGraph
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams


In [None]:
import litellm
from litellm import completion

##import instructor
##from instructor import Mode

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 1980: character maps to <undefined>

Downloading the dataset

In [10]:
# Create a directory for the data
data_dir = Path('data')
data_dir.mkdir(parents=True, exist_ok=True)


In [4]:


#URLs for the Go emotions dataset files
urls = {
    'goemotions_1': 'https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv',
    'goemotions_2': 'https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv',
    'goemotions_3': 'https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_3.csv'
}

# Download each file
for name, url in urls.items():
    target_path = data_dir / f"{name}.csv"
    
    if not target_path.exists():
        print(f"Downloading {name} dataset...")
        response = requests.get(url)
        
        if response.status_code == 200:
            with open(target_path, 'wb') as f:
                f.write(response.content)
            print(f"Successfully downloaded {name} dataset to {target_path}")
        else:
            print(f"Failed to download {name} dataset. Status code: {response.status_code}")
    else:
        print(f"{name} dataset already exists at {target_path}")

Downloading goemotions_1 dataset...
Successfully downloaded goemotions_1 dataset to data\full_dataset\goemotions_1.csv
Downloading goemotions_2 dataset...
Successfully downloaded goemotions_2 dataset to data\full_dataset\goemotions_2.csv
Downloading goemotions_3 dataset...
Successfully downloaded goemotions_3 dataset to data\full_dataset\goemotions_3.csv


loading the data and combining the datasets

In [20]:
# Load the goemotions_1.csv file into a pandas DataFrame
goemotions_1_df = pd.read_csv("data/goemotions1.csv")
goemotions_2_df = pd.read_csv("data/goemotions2.csv")
goemotions_3_df = pd.read_csv("data/goemotions3.csv")


# Combine the datasets into a single DataFrame
combined_df = pd.concat([goemotions_1_df, goemotions_2_df, goemotions_3_df], ignore_index=True)



data preprocessing and cleaning

In [25]:
# Print a nicely formatted list of all features in combined_df
print("Features in combined_df:")
for feature in combined_df.columns:
    print(f"- {feature}")

Features in combined_df:
- text
- id
- admiration
- amusement
- anger
- annoyance
- approval
- caring
- confusion
- curiosity
- desire
- disappointment
- disapproval
- disgust
- embarrassment
- excitement
- fear
- gratitude
- grief
- joy
- love
- nervousness
- optimism
- pride
- realization
- relief
- remorse
- sadness
- surprise
- neutral


Removal of features with no predictive value

In [None]:
# Remove the specified features from combined_df
features_to_remove = ['author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear']
combined_df = combined_df.drop(columns=features_to_remove)

#remove duplicates
#full obeservation duplicates  
combined_df = combined_df.drop_duplicates()
#text duplicates
combined_df = combined_df.drop_duplicates(subset='text')


Checking new Dataframe 

In [38]:
#cheking to see if the features are removed or not
print("Features in combined_df:")
for feature in combined_df.columns:
    print(f"- {feature}")

#see shape of combined_df
print("Shape of combined_df:", combined_df.shape)

Features in combined_df:
- text
- id
- admiration
- amusement
- anger
- annoyance
- approval
- caring
- confusion
- curiosity
- desire
- disappointment
- disapproval
- disgust
- embarrassment
- excitement
- fear
- gratitude
- grief
- joy
- love
- nervousness
- optimism
- pride
- realization
- relief
- remorse
- sadness
- surprise
- neutral
Shape of combined_df: (57732, 30)


Splitting the dataset into train test & validation

In [39]:

# Split the data into train (80%) and temp (20%)
train_df, temp_df = train_test_split(combined_df, test_size=0.2, random_state=42)

# Split the temp data into validation (50% of temp) and test (50% of temp)
validation_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Print the sizes of the splits
print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(validation_df)}")
print(f"Test size: {len(test_df)}")

Train size: 46185
Validation size: 5773
Test size: 5774


## building the models

In [59]:
# Load environment variables from the .env file
load_dotenv()

# Print the current working directory
print(f"Current working directory: {os.getcwd()}")

# Check if .env file exists
env_path = os.path.join(os.getcwd(), '.env')
print(f".env file exists: {os.path.exists(env_path)}")

# Try to read the .env file directly to see its contents
try:
    with open('.env', 'r') as f:
        env_contents = f.read()
        print("Contents of .env file:")
        print(env_contents)
        
        # Parse the .env file and set variables
        for line in env_contents.splitlines():
            if '=' in line:
                key, value = line.split('=', 1)
                key = key.strip()
                value = value.strip().strip('"')
                globals()[key] = value  # Dynamically create variables
except FileNotFoundError:
    print(".env file not found. Make sure it's in the correct location.")

Current working directory: c:\Users\Valdemar Schultz\Desktop\AI-ML Eksamen
.env file exists: True
Contents of .env file:
WX_PROJECT_ID_RAG="5b56ca80-258f-4dca-b3b9-413dd0495235"
WX_API_KEY="vKoe2CuO-u0WmIydpGGMtvzgAKb_1jIqq7LeAIWSwcXa"
WX_URL="https://us-south.ml.cloud.ibm.com"




In [61]:
print("WX_API_KEY:", WX_API_KEY)
print("WX_PROJECT_ID_RAG:", WX_PROJECT_ID_RAG)
print("WX_URL:", WX_URL)

WX_API_KEY: vKoe2CuO-u0WmIydpGGMtvzgAKb_1jIqq7LeAIWSwcXa
WX_PROJECT_ID_RAG: 5b56ca80-258f-4dca-b3b9-413dd0495235
WX_URL: https://us-south.ml.cloud.ibm.com
