# pipeline for emoji analysis using only emoji embeddings 

In [2]:
import gensim
import numpy as np
import pickle
import emoji

# Emoji2Vec vectors in word2vec binary format
emoji2vec_path = "emoji2vec/pre-trained/emoji2vec.bin"  # Change to your actual path
emoji_model = gensim.models.KeyedVectors.load_word2vec_format(emoji2vec_path, binary=True)

# Pickled regressor used by predict_sentiment in this section.
# NOTE(review): pickle.load can execute arbitrary code - only load model files you trust.
with open("model/emoji_sentiment_RFR_300d.pkl", "rb") as f:
    model = pickle.load(f)


In [3]:
def extract_emojis(text):
    """Return the characters of ``text`` that are keys of ``emoji.EMOJI_DATA``.

    The text is scanned one code point at a time and the matching characters
    are concatenated in their original order. An empty string is returned
    when nothing matches.
    """
    known_emojis = emoji.EMOJI_DATA
    picked = []
    for ch in text:
        if ch in known_emojis:
            picked.append(ch)
    return ''.join(picked)

def get_emoji_embedding(emoji_char):
    """Look up the emoji2vec vector for ``emoji_char``.

    Args:
        emoji_char: Key to look up in the module-level ``emoji_model``.

    Returns:
        ``emoji_model[emoji_char]`` when the key exists; otherwise an all-zero
        NumPy vector whose length is ``emoji_model.vector_size``.
    """
    try:
        return emoji_model[emoji_char]
    except KeyError:
        # Size the fallback from the loaded vectors instead of hard-coding 300,
        # so it always matches the width of the real embeddings.
        return np.zeros(emoji_model.vector_size)


In [4]:
def predict_sentiment(user_input):
    """Predict a sentiment score from the emojis contained in ``user_input``.

    Emojis are located as whole sequences with ``emoji.emoji_list`` so that
    multi-code-point emojis (for example a heart followed by the U+FE0F
    variation selector) are looked up under their full key. Scanning one
    code point at a time would drop the selector and look up a different key.
    When a full sequence is not in ``emoji_model``, its individual code
    points that are keys of ``emoji.EMOJI_DATA`` are used instead.

    Args:
        user_input: Text that may contain emojis.

    Returns:
        The first element of ``model.predict`` for the mean emoji embedding,
        or an explanatory message string when no emoji is found.
    """
    # Collect the keys to look up in emoji2vec, preserving input order.
    lookup_keys = []
    for match in emoji.emoji_list(user_input):
        sequence = match["emoji"]
        if sequence in emoji_model:
            lookup_keys.append(sequence)
        else:
            # Fall back to the individual code points of the sequence.
            lookup_keys.extend(c for c in sequence if c in emoji.EMOJI_DATA)

    # Handle case when no emoji is found
    if not lookup_keys:
        return "No emojis detected! Please enter a sentence with emojis."

    # Average the per-emoji vectors; keys missing from emoji2vec contribute
    # whatever get_emoji_embedding returns for them.
    emoji_embeddings = np.mean([get_emoji_embedding(key) for key in lookup_keys], axis=0)

    # The regressor expects a 2-D array with one row per sample.
    emoji_features = emoji_embeddings.reshape(1, -1)

    return model.predict(emoji_features)[0]


In [5]:
# Sample inputs and the sentiment each one is meant to illustrate
user_text1 = "I love this! ❤️"  # expected: positive
user_text2 = "I'm so sad... 💔"  # expected: negative
user_text3 = "What a weird day 🤖🧐"  # expected: neutral
user_text4 = " 🥹"  # a single emoji with no accompanying words

# Score every sample and print it in a uniform format
for sample_text in (user_text1, user_text2, user_text3, user_text4):
    print(f"Sentiment Score for '{sample_text}': {predict_sentiment(sample_text)}")


Sentiment Score for 'I love this! ❤️': 0.5938494807907613
Sentiment Score for 'I'm so sad... 💔': 0.26083333333333336
Sentiment Score for 'What a weird day 🤖🧐': 0.5313840330116019
Sentiment Score for ' 🥹': 0.5938494807907613


# Pipeline for emoji analysis using emoji embeddings combined with text embeddings

In [6]:
import re
import gensim
import numpy as np
import pickle
import emoji
import torch
from transformers import BertTokenizer, BertModel

# Emoji2Vec vectors in word2vec binary format
emoji2vec_path = "emoji2vec/pre-trained/emoji2vec.bin"  # Change to your actual path
emoji_model = gensim.models.KeyedVectors.load_word2vec_format(emoji2vec_path, binary=True)

# BERT tokenizer and encoder used for the text part of the features
bert_model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = BertModel.from_pretrained(bert_model_name)

# Pickled RandomForestRegressor that expects 1068 input features.
# NOTE(review): pickle.load can execute arbitrary code - only load model files you trust.
with open("model/emoji_sentiment_RFR.pkl", "rb") as f:
    model = pickle.load(f)




In [7]:
def extract_emojis(text):
    """Keep only the characters of ``text`` that appear as keys in ``emoji.EMOJI_DATA``.

    Characters are tested individually and returned as one string in their
    original order; the result is empty when none match.
    """
    is_known_emoji = emoji.EMOJI_DATA.__contains__
    return ''.join(filter(is_known_emoji, text))

def remove_emojis(text):
    """Return ``text`` with every emoji replaced by an empty string.

    Delegates to ``emoji.replace_emoji``; the surrounding characters,
    including whitespace, are left as they are.
    """
    text_without_emojis = emoji.replace_emoji(text, replace='')
    return text_without_emojis

def get_emoji_embedding(emoji_char):
    """Return the emoji2vec vector stored under ``emoji_char``.

    Args:
        emoji_char: Key to look up in the module-level ``emoji_model``.

    Returns:
        ``emoji_model[emoji_char]`` if present; otherwise a zero vector of
        length ``emoji_model.vector_size``.
    """
    try:
        return emoji_model[emoji_char]
    except KeyError:
        # Derive the fallback width from the loaded vectors rather than a
        # hard-coded 300, so both branches always return the same length.
        return np.zeros(emoji_model.vector_size)

def get_text_embedding(text):
    """Encode ``text`` with BERT and mean-pool the token vectors.

    The text is tokenized with truncation to at most 10 tokens, passed through
    ``bert_model`` without gradient tracking, and the last hidden state is
    averaged over dimension 1. The squeezed result is returned as a NumPy array.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=10)
    with torch.no_grad():
        bert_output = bert_model(**encoded)
    hidden_states = bert_output.last_hidden_state
    pooled = hidden_states.mean(dim=1)  # average across the token axis
    return pooled.squeeze().numpy()


In [8]:
def predict_sentiment(user_input):
    """Predict a sentiment score from the text and emojis in ``user_input``.

    Emojis are located as whole sequences with ``emoji.emoji_list``, matching
    the sequence-aware removal done by ``remove_emojis``. This keeps
    multi-code-point emojis (for example a heart followed by U+FE0F) intact
    for the emoji2vec lookup. When a full sequence is not in ``emoji_model``,
    its individual code points that are keys of ``emoji.EMOJI_DATA`` are used
    instead.

    Args:
        user_input: Text that may contain words and emojis.

    Returns:
        The first element of ``model.predict`` for the concatenated
        emoji + text feature vector.
    """
    # Collect the keys to look up in emoji2vec, preserving input order.
    lookup_keys = []
    for match in emoji.emoji_list(user_input):
        sequence = match["emoji"]
        if sequence in emoji_model:
            lookup_keys.append(sequence)
        else:
            # Fall back to the individual code points of the sequence.
            lookup_keys.extend(c for c in sequence if c in emoji.EMOJI_DATA)

    cleaned_text = remove_emojis(user_input)

    if lookup_keys:
        emoji_embeddings = np.mean([get_emoji_embedding(key) for key in lookup_keys], axis=0)
    else:
        # No emoji present: use a zero vector sized from the loaded vectors.
        emoji_embeddings = np.zeros(emoji_model.vector_size)

    text_embedding = get_text_embedding(cleaned_text)

    # Combine features (Emoji2Vec + BERT) to match model input (1068-d)
    combined_features = np.hstack((emoji_embeddings, text_embedding)).reshape(1, -1)

    return model.predict(combined_features)[0]


In [9]:
# Sample inputs and the sentiment each one is meant to illustrate
user_text1 = "I love this! ❤️"  # expected: positive
user_text2 = "I'm so sad... 😢"  # expected: negative
user_text3 = "What a weird day 🤖🧐"  # expected: neutral

# Score every sample and print it in a uniform format
for sample_text in (user_text1, user_text2, user_text3):
    print(f"Sentiment Score for '{sample_text}': {predict_sentiment(sample_text)}")


Sentiment Score for 'I love this! ❤️': 0.5353125
Sentiment Score for 'I'm so sad... 😢': 0.5675
Sentiment Score for 'What a weird day 🤖🧐': 0.51
