# Mood Catalog 



In [None]:
import pandas as pd
import numpy as np
import glob
from tqdm.auto import tqdm
import torch
import os
import tempfile  

from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForSequenceClassification, ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

tqdm.pandas()

# Canonical order of the seven output mood dimensions.
FINAL_MOODS = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

# Fine-grained model labels, grouped by the coarse mood each one folds into.
_LABELS_BY_MOOD = {
    'anger': ('anger', 'annoyance', 'disapproval'),
    'disgust': ('disgust',),
    'fear': ('fear', 'nervousness'),
    'joy': (
        'joy', 'amusement', 'approval', 'excitement', 'gratitude', 'love',
        'optimism', 'relief', 'pride', 'caring', 'admiration',
    ),
    'sadness': ('sadness', 'disappointment', 'grief', 'remorse', 'embarrassment'),
    'surprise': ('surprise', 'curiosity', 'confusion'),
    'neutral': ('neutral',),
}

# Flat lookup table: model label -> coarse mood.
EMOTION_MAP = {
    label: mood
    for mood, labels in _LABELS_BY_MOOD.items()
    for label in labels
}

# Holds the loaded classification pipeline; stays None until the model is loaded.
EMOTION_CLASSIFIER = None

def load_optimized_model():
    """
    Load the quantized ONNX emotion classifier into the global ``EMOTION_CLASSIFIER``.

    If ``EMOTION_CLASSIFIER`` is already set, the function returns immediately.

    When the directory ``FINAL_ONNX_PATH`` does not exist yet, a one-time
    optimization is performed using the two-step 'optimum' API:
    1. EXPORT the base Hugging Face model to the ONNX format.
    2. QUANTIZE the exported ONNX model (dynamic INT8, AVX512-VNNI config).

    The quantized model is assembled in a staging directory and only moved to
    ``FINAL_ONNX_PATH`` once quantization has finished. A failed or interrupted
    run therefore never leaves a partial directory behind that later runs would
    mistake for a complete cache.

    Side effects:
        Sets the module-level ``EMOTION_CLASSIFIER``; may create
        ``FINAL_ONNX_PATH`` on disk; prints progress messages.

    Raises:
        Exception: any error raised during export, quantization or loading is
            reported on stdout and then re-raised unchanged.
    """
    global EMOTION_CLASSIFIER

    import shutil  # only needed for the one-time cache build below

    if EMOTION_CLASSIFIER is not None:
        print("Optimized emotion model is already loaded.")
        return

    MODEL_NAME = "joeddav/distilbert-base-uncased-go-emotions-student"
    FINAL_ONNX_PATH = "onnx_quantized_goemotions_final"

    try:
        if not os.path.exists(FINAL_ONNX_PATH):
            print(f"Optimized model not found at '{FINAL_ONNX_PATH}'.")
            print("--- Starting one-time optimization process (will be fast on future runs) ---")

            # Stage next to the final location so the last move is a plain
            # rename on the same filesystem rather than a copy.
            staging_parent = os.path.dirname(os.path.abspath(FINAL_ONNX_PATH))
            with tempfile.TemporaryDirectory(dir=staging_parent) as temp_dir:
                onnx_export_path = os.path.join(temp_dir, "onnx_exported_model")
                quantized_path = os.path.join(temp_dir, "onnx_quantized_model")
                print(f"Step 1: Exporting '{MODEL_NAME}' to ONNX format...")

                model = ORTModelForSequenceClassification.from_pretrained(MODEL_NAME, export=True)
                tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

                model.save_pretrained(onnx_export_path)
                tokenizer.save_pretrained(onnx_export_path)
                print("Export complete.")

                print("\nStep 2: Quantizing the ONNX model to INT8 for max speed...")

                quantizer = ORTQuantizer.from_pretrained(onnx_export_path)

                qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)

                quantizer.quantize(save_dir=quantized_path, quantization_config=qconfig)
                # The tokenizer is loaded from the final directory below, so make
                # sure its files are there regardless of what quantize() copies.
                tokenizer.save_pretrained(quantized_path)

                # Publish the finished cache only now that every step succeeded.
                shutil.move(quantized_path, FINAL_ONNX_PATH)
                print(f"Quantization complete. Final model saved to '{FINAL_ONNX_PATH}'.")

        print(f"\nLoading fast, optimized model from '{FINAL_ONNX_PATH}'...")

        use_cuda = torch.cuda.is_available()
        device = "cuda:0" if use_cuda else "cpu"
        # NOTE(review): the model is quantized with a CPU (AVX512-VNNI) config;
        # confirm that running it through CUDAExecutionProvider is actually
        # faster than CPUExecutionProvider for this graph.
        provider = "CUDAExecutionProvider" if use_cuda else "CPUExecutionProvider"

        optimized_model = ORTModelForSequenceClassification.from_pretrained(FINAL_ONNX_PATH, provider=provider)
        optimized_tokenizer = AutoTokenizer.from_pretrained(FINAL_ONNX_PATH)

        # top_k=None asks the pipeline for the score of every label, not just the best one.
        EMOTION_CLASSIFIER = pipeline(
            "text-classification",
            model=optimized_model,
            tokenizer=optimized_tokenizer,
            top_k=None
        )
        print(f"Optimized emotion model loaded and ready on '{device}'!")

    except Exception as e:
        print(f" An error occurred during the model optimization or loading process: {e}")
        print("Please ensure your environment is set up correctly with all dependencies (`tensorflow`, `tf-keras`, etc.)")
        raise

def get_mood_signature_optimized(texts: list[str]) -> list[np.ndarray]:
    """
    Classify a batch of texts and return one mood vector per text.

    Each vector has one entry per mood in ``FINAL_MOODS`` (same order). For
    every mood the entry is the highest score among the model labels that
    ``EMOTION_MAP`` folds into that mood; labels missing from ``EMOTION_MAP``
    are ignored. The raw vector is then passed through ``softmax``.

    Args:
        texts: the strings to classify. An empty list returns ``[]`` without
            touching the model.

    Returns:
        A list with one array per input text, in input order.

    Raises:
        RuntimeError: if ``EMOTION_CLASSIFIER`` has not been loaded yet.
    """
    if not texts:
        return []

    if EMOTION_CLASSIFIER is None:
        raise RuntimeError("The emotion classifier model has not been loaded. Please run load_optimized_model() first.")

    # Without an explicit batch_size the pipeline forwards the texts one at a
    # time; pass the whole list as a single batch so the caller's batching
    # actually reaches the model.
    model_outputs = EMOTION_CLASSIFIER(
        texts,
        truncation=True,
        max_length=256,
        batch_size=len(texts),
    )

    final_signatures = []
    for output in model_outputs:
        scores = {mood: 0.0 for mood in FINAL_MOODS}

        for label_pred in output:
            target_mood = EMOTION_MAP.get(label_pred['label'])
            if target_mood is not None:
                scores[target_mood] = max(scores[target_mood], label_pred['score'])

        # Build the vector in FINAL_MOODS order explicitly rather than relying
        # on dict iteration order.
        vec = np.array([scores[mood] for mood in FINAL_MOODS])
        # NOTE(review): the inputs here are per-label scores, so softmax keeps
        # the result close to uniform; confirm this flattening is intended.
        final_signatures.append(softmax(vec))

    return final_signatures

def softmax(x: np.ndarray) -> np.ndarray:
    """
    Return the numerically stable softmax of ``x``.

    The maximum is subtracted before exponentiating so large inputs do not
    overflow. The normalization runs over all elements of ``x``.

    Args:
        x: input values. Non-floating inputs (e.g. integer arrays) are
            converted to float64 so the result is not truncated.

    Returns:
        An array with the same shape as ``x`` whose elements sum to 1.
        An empty input yields an empty array.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        x = x.astype(np.float64)
    if x.size == 0:
        return x
    # An all-zero vector needs no special case: exp(0) == 1 for every entry,
    # which normalizes to the uniform distribution.
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()


# Load the classifier up front; get_mood_signature_optimized() raises if it is missing.
print("--- Initializing Model ---")
load_optimized_model()

#movies
# Build the movie slice of the catalog: one row per movie, with its genres and
# all user tags folded into a single text field for the classifier.
MOVIE_PATH = '/home/ujjwal/Desktop/Work/aws_catalyst_openrec_v2/final/ml-25m'
movies_df = pd.read_csv(f"{MOVIE_PATH}/movies.csv")
tags_df = pd.read_csv(f"{MOVIE_PATH}/tags.csv")
tags_df.dropna(subset=['tag'], inplace=True)
tags_df['tag'] = tags_df['tag'].astype(str).str.lower()
# Collapse every tag row of a movie into one comma-separated string.
agg = tags_df.groupby('movieId')['tag'].apply(lambda x: ', '.join(x)).reset_index()
# Left merge keeps movies without any tag; their 'tag' is NaN until filled below.
movies = pd.merge(movies_df, agg, on='movieId', how='left')
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form emits a FutureWarning and stops
# modifying the frame in pandas 3.0.
movies['tag'] = movies['tag'].fillna('')
movies['text_for_nlp'] = "Genres: " + movies['genres'] + ". Tags: " + movies['tag']
movies.rename(columns={'movieId':'item_id','title':'name'}, inplace=True)
movies['creator'] = 'Unknown'
movies['type'] = 'movie'
movies_final_df = movies[['item_id','type','name','creator','text_for_nlp']]
print("Movies done.")


#books
# Build the book slice of the catalog from every 'book*.csv' file in BOOK_PATH.
BOOK_PATH = '/home/ujjwal/Desktop/Work/aws_catalyst_openrec_v2/final/goodreads'
# glob returns matches in arbitrary order; sort so the row order (and thus the
# catalog index) is the same on every run and machine.
files = sorted(glob.glob(f"{BOOK_PATH}/book*.csv"))
if not files:
    # Fail with the real cause instead of pd.concat's generic empty-input error.
    raise FileNotFoundError(f"No 'book*.csv' files found in '{BOOK_PATH}'.")
dfs = [pd.read_csv(f) for f in files]
books = pd.concat(dfs, ignore_index=True)
books.rename(columns={'Id':'item_id','Name':'name', 'Authors':'creator','Description':'description'}, inplace=True)
books['description'] = books['description'].fillna('')
books['creator'] = books['creator'].fillna('Unknown')
# Keep at most the first 300 whitespace-separated words of each description.
books['description'] = books['description'].apply(lambda x: ' '.join(x.split()[:300]))
books['text_for_nlp'] = "Description: " + books['description']
books['type'] = 'book'
books_final_df = books[['item_id','type','name','creator','text_for_nlp']]
print("Books done.")


#music
# Build the music slice of the catalog; the classifier text is artist plus title.
MUSIC_PATH = '/home/ujjwal/Desktop/Work/aws_catalyst_openrec_v2/final/million song'
music = pd.read_csv(f"{MUSIC_PATH}/Music Info.csv").rename(
    columns={'track_id': 'item_id', 'artist': 'creator'}
)
music['creator'] = music['creator'].fillna('Unknown')
artist_part = "Artist: " + music['creator']
title_part = ". Title: " + music['name']
music['text_for_nlp'] = artist_part + title_part
music['type'] = 'music'
catalog_columns = ['item_id', 'type', 'name', 'creator', 'text_for_nlp']
music_final_df = music[catalog_columns]
print("Music done.")

#combine all
# Stack the three catalogs, discard rows lacking an id or a name, and store
# every id as a string. Rows that survive keep their index label from the
# concatenation, so the index may contain gaps.
master_df = (
    pd.concat([movies_final_df, books_final_df, music_final_df], ignore_index=True)
    .dropna(subset=['item_id', 'name'])
    .assign(item_id=lambda frame: frame['item_id'].astype(str))
)
print(f"Loaded {len(master_df)} total items for processing.")


# Larger batches when a GPU is available, smaller ones on CPU.
batch_size = 128 if torch.cuda.is_available() else 32
texts_to_process = master_df['text_for_nlp'].fillna('').tolist()
all_signatures = []

# Walk the texts in consecutive slices of batch_size and collect the mood
# vectors in the same order as the rows of master_df.
batch_starts = range(0, len(texts_to_process), batch_size)
for start in tqdm(batch_starts):
    chunk = texts_to_process[start:start + batch_size]
    all_signatures.extend(get_mood_signature_optimized(chunk))


# One column per mood, aligned to master_df through its index.
mood_df = pd.DataFrame(all_signatures, columns=FINAL_MOODS, index=master_df.index)
# Attach the mood columns and drop the helper text that was only classifier input.
final_df = pd.concat([master_df, mood_df], axis=1).drop(columns=['text_for_nlp'])

output_filename = 'master_catalog_with_moods_OPTIMIZED.csv'
final_df.to_csv(output_filename, index=False)

print(f"\nSuccess! Master catalog with 7-dimensional moods saved to '{output_filename}'")

print("\n--- Sample of Final Data ---")
display(final_df.sample(10))

--- Initializing AI Model ---

Loading fast, optimized model from 'onnx_quantized_goemotions_final'...


[0;93m2025-06-20 03:12:09.301957536 [W:onnxruntime:, transformer_memcpy.cc:74 ApplyImpl] 156 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2025-06-20 03:12:09.313911276 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-06-20 03:12:09.313928388 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m
Device set to use cuda:0


✅🚀 Optimized emotion model loaded and ready on 'cuda:0'!

--- Movie Data Processing ---


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies['tag'].fillna('', inplace=True)


✅ Movies done.

--- Book Data Processing ---
✅ Books done.

--- Music Data Processing ---
✅ Music done.

--- Building Master Catalog ---
Loaded 1963416 total items for processing.

--- Generating Mood Signatures using Optimized AI Model ---


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 15340/15340 [12:34:50<00:00,  2.95s/it]



--- Finalizing and Saving Results ---

🚀 Success! Master catalog with 7-dimensional moods saved to 'master_catalog_with_moods_OPTIMIZED.csv'

--- Sample of Final Data ---


Unnamed: 0,item_id,type,name,creator,anger,disgust,fear,joy,neutral,sadness,surprise
1206572,1840188,book,"It's a Fun Job, But Someone Has to Do It - Ano...",Al Hunter,0.139912,0.136721,0.137114,0.160526,0.140595,0.141763,0.143368
1227750,1890196,book,"Rudolph, the Red-Nosed Reindeer",Elizabeth Encarnacion,0.139148,0.137621,0.138413,0.154816,0.140273,0.144082,0.145648
698890,4351487,book,Process Discipline: How To Maximize Profitabil...,Carole L. Bennett,0.143513,0.138065,0.137369,0.157362,0.140873,0.141208,0.141611
83795,1949467,book,Blood That Cries Out from the Earth: The Psych...,James W. Jones,0.143716,0.143779,0.140023,0.14513,0.137815,0.144684,0.144853
282329,75958,book,Green Tea and Other Strange Tales,J. Sheridan Le Fanu,0.141567,0.141516,0.14262,0.144771,0.143291,0.141839,0.144396
896796,981523,book,"Last Mogul, The: Lew Wasserman, MCA and the Hi...",Dennis McDougal,0.14374,0.140448,0.147219,0.144983,0.138084,0.142667,0.142859
1917462,TRTTBED128E0782827,music,Be My Angel,Mazzy Star,0.140227,0.139505,0.140556,0.155074,0.140876,0.140577,0.143186
1715022,2992279,book,Revised Customer Satisfaction: The Other Half ...,Dru Scott Decker,0.141382,0.13981,0.140057,0.150151,0.141028,0.14214,0.145433
677695,4287012,book,Applications And Theory Of Petri Nets 2003: 24...,Wil van der Aalst,0.141274,0.137726,0.138587,0.155891,0.140628,0.143171,0.142722
1067970,3373463,book,Being Interior: Autobiography and the Contradi...,Nicholas D. Paige,0.141406,0.138015,0.138221,0.14939,0.140886,0.144803,0.147278


In [None]:
# Preview the movie rows that feed the master catalog.
movies_final_df


--- Movie Data Processing ---


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies['tag'].fillna('', inplace=True)


Unnamed: 0,item_id,type,name,creator,text_for_nlp
0,1,movie,Toy Story (1995),Unknown,Genres: Adventure|Animation|Children|Comedy|Fa...
1,2,movie,Jumanji (1995),Unknown,Genres: Adventure|Children|Fantasy. Tags: robi...
2,3,movie,Grumpier Old Men (1995),Unknown,"Genres: Comedy|Romance. Tags: funny, best frie..."
3,4,movie,Waiting to Exhale (1995),Unknown,Genres: Comedy|Drama|Romance. Tags: based on n...
4,5,movie,Father of the Bride Part II (1995),Unknown,"Genres: Comedy. Tags: aging, baby, confidence,..."
...,...,...,...,...,...
62418,209157,movie,We (2018),Unknown,Genres: Drama. Tags:
62419,209159,movie,Window of the Soul (2001),Unknown,Genres: Documentary. Tags:
62420,209163,movie,Bad Poems (2018),Unknown,Genres: Comedy|Drama. Tags:
62421,209169,movie,A Girl Thing (2001),Unknown,Genres: (no genres listed). Tags:


In [None]:
# Preview the music rows that feed the master catalog.
music_final_df



--- Music Data Processing ---


Unnamed: 0,item_id,type,name,creator,text_for_nlp
0,TRIOREW128F424EAF0,music,Mr. Brightside,The Killers,Artist: The Killers. Title: Mr. Brightside
1,TRRIVDJ128F429B0E8,music,Wonderwall,Oasis,Artist: Oasis. Title: Wonderwall
2,TROUVHL128F426C441,music,Come as You Are,Nirvana,Artist: Nirvana. Title: Come as You Are
3,TRUEIND128F93038C4,music,Take Me Out,Franz Ferdinand,Artist: Franz Ferdinand. Title: Take Me Out
4,TRLNZBD128F935E4D8,music,Creep,Radiohead,Artist: Radiohead. Title: Creep
...,...,...,...,...,...
50678,TRQYCFV128F9322F50,music,Ryusei Rocket,アンティック-珈琲店-,Artist: アンティック-珈琲店-. Title: Ryusei Rocket
50679,TRHQCSH128F42724B7,music,Colors Of The Wind,ACIDMAN,Artist: ACIDMAN. Title: Colors Of The Wind
50680,TRZRODK128F92D68D7,music,The Revelation,coldrain,Artist: coldrain. Title: The Revelation
50681,TRGLMEM128F9322F63,music,Koi no Dependence,アンティック-珈琲店-,Artist: アンティック-珈琲店-. Title: Koi no Dependence


In [None]:
# Preview the book rows that feed the master catalog.
books_final_df




--- Book Data Processing ---


Unnamed: 0,item_id,type,name,creator,text_for_nlp
0,1900511,book,Barbarossa,Christopher Ailsby,"Description: On 22 June 1941, Adolf Hitler lau..."
1,1900512,book,Collector's Guide to German World War II: Comb...,Christopher Ailsby,Description:
2,1900514,book,Images of Barbarossa,Christopher Ailsby,"Description: On 22 June 1941, Adolf Hitler lau..."
3,1900520,book,Romania After 2000: Five New Romanian Plays,Daniel Charles Gerould,Description: The first anthology of new Romani...
4,1900521,book,Global Foreigners: An Anthology of Plays,Saviana Stănescu,"Description: In Waxing West, Daniella, newly a..."
...,...,...,...,...,...
1850305,899983,book,"How to Prepare for the GED, Canadian Edition",Murray Rockowitz,Description: Canadian men and women preparing ...
1850306,899984,book,How to Prepare for the GED High School Equival...,Murray Rockowitz,Description: Updated to reflect the latest GED...
1850307,899986,book,Contemporary's New GED: How to Prepare for the...,"Contemporary Books, Inc.",Description:
1850308,899992,book,Hiking Michigan's Upper Peninsula,Eric Hansen,Description: Describes fifty of the best hikes...
