In [4]:
import pandas as pd
import numpy as np
# Load the dataset from a CSV file (the source is CSV, not parquet)
df = pd.read_csv("dataset/final_data.csv")

# Display first few rows; the output below shows the columns description, score and emoji
print(df.head())


                    description  score emoji
0                 grinning face   1.00     😀
1  smiling face with open mouth   1.00     😃
2                  winking face   1.00     😉
3                    robot face   0.50     🤖
4              father christmas   0.75    🎅🏻


In [None]:
from transformers import BertTokenizer, BertModel
import torch

# The tokenizer and the encoder are both taken from the same pretrained checkpoint
bert_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_model = BertModel.from_pretrained(bert_checkpoint)

# Function to get text embeddings
def get_bert_embedding(text):
    """Return the BERT embedding of ``text`` as a NumPy array.

    The text is tokenized (truncated to at most 50 tokens) and passed through
    the module-level ``bert_model``; the hidden state at sequence position 0
    (the CLS token) is returned with singleton dimensions squeezed out.

    Args:
        text: The description string to embed.

    Returns:
        numpy.ndarray holding the CLS-position hidden state.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=50)
    # Inference only: without no_grad() every call records an autograd graph
    # that is thrown away immediately, wasting memory and time on each row.
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].detach().numpy().squeeze()  # Take only CLS token


# Apply to dataset: call get_bert_embedding once per description and keep the
# resulting vectors in a new "text_embedding" column
df["text_embedding"] = df["description"].apply(get_bert_embedding)




In [None]:
from gensim.models import KeyedVectors

# Load Emoji2Vec pre-trained embeddings (binary word2vec format); the resulting
# object is indexed by emoji string in get_emoji_embedding
emoji2vec_model = KeyedVectors.load_word2vec_format("emoji2vec/pre-trained/emoji2vec.bin", binary=True)

# Function to get emoji embeddings
def get_emoji_embedding(emoji):
    """Look up the Emoji2Vec vector for ``emoji``.

    Args:
        emoji: The emoji string used as the lookup key in ``emoji2vec_model``.

    Returns:
        The model's vector for ``emoji``; if the key is not in the model's
        vocabulary, an all-zero vector of the same length and dtype.
    """
    try:
        return emoji2vec_model[emoji]
    except KeyError:
        # Take size and dtype from the loaded model rather than hard-coding
        # 300 / NumPy's default float64, so the fallback always stacks cleanly
        # with real vectors and does not upcast the whole feature matrix.
        return np.zeros(emoji2vec_model.vector_size, dtype=emoji2vec_model.vectors.dtype)


# Apply to dataset: look up one vector per emoji (zero vector when the emoji is
# missing from the model) and keep it in a new "emoji_embedding" column
df["emoji_embedding"] = df["emoji"].apply(get_emoji_embedding)


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Convert the per-row embedding columns into 2-D NumPy arrays (one row per sample)
X_text = np.stack(df["text_embedding"].values)
X_emoji = np.stack(df["emoji_embedding"].values)

# Concatenate text & emoji embeddings column-wise into a single feature matrix
X = np.hstack((X_text, X_emoji))
y = df["score"].values

# Train-test split: 20% held out, fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a regression model.
# random_state is pinned so the fitted forest, and therefore the MSE printed
# below, is reproducible from run to run (the other models in this notebook
# already use random_state=42).
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate performance on the held-out rows
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.4f}")


Mean Squared Error: 0.0210


In [5]:
import pickle

# Serialize the fitted model to disk with pickle
model_path = "emoji_sentiment_RFR.pkl"
with open(model_path, "wb") as model_file:
    pickle.dump(model, model_file)

print("Model saved successfully!")


Model saved successfully!


# Model trained on emoji embeddings only

In [6]:
import pickle
import numpy as np
import gensim
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Load the pre-trained Emoji2Vec vectors (binary word2vec format); this is the
# lookup table used by get_emoji_embedding in this cell
emoji_model = gensim.models.KeyedVectors.load_word2vec_format("emoji2vec/pre-trained/emoji2vec.bin", binary=True)

def get_emoji_embedding(emoji):
    """Extract the Emoji2Vec embedding for a given emoji.

    Args:
        emoji: The emoji string used as the lookup key in ``emoji_model``.

    Returns:
        The model's vector for ``emoji``; if the key is not in the model's
        vocabulary, an all-zero vector of the same length and dtype.
    """
    try:
        return emoji_model[emoji]
    except KeyError:
        # Size and dtype come from the loaded model instead of a hard-coded
        # 300 / float64, so the fallback always matches the real vectors.
        return np.zeros(emoji_model.vector_size, dtype=emoji_model.vectors.dtype)

# Read the labelled data; only the 'emoji' and 'score' columns are used in this cell
import pandas as pd
df = pd.read_csv("dataset/final_data.csv")

# Features: one embedding row per dataset entry; target: the score column
emoji_vectors = [get_emoji_embedding(symbol) for symbol in df["emoji"]]
X = np.vstack(emoji_vectors)
y = df["score"].values

# Hold out 20% of the rows, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a seeded random forest on the training portion
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Persist the newly fitted model with pickle
with open("emoji_sentiment_RFR_300d.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

print("✅ Model retrained using only 300D emoji embeddings and saved as emoji_sentiment_RFR_300d.pkl")


✅ Model retrained using only 300D emoji embeddings and saved as emoji_sentiment_RFR_300d.pkl


In [21]:
import pickle

# Load the trained model from the path the emoji-only training cell wrote it to.
# The previous path, "model/emoji_sentiment_RFR_300d.pkl", is not created by
# anything in this notebook, so a fresh Restart & Run All failed with
# FileNotFoundError.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced yourself.
with open("emoji_sentiment_RFR_300d.pkl", "rb") as f:
    model = pickle.load(f)

# Predict sentiment score for an emoji; reshape turns the 1-D vector into a
# single-row 2-D array, which is what predict() expects
emoji_embedding = get_emoji_embedding("😟").reshape(1, -1)
predicted_score = model.predict(emoji_embedding)[0]

print(f"Predicted Sentiment Score: {predicted_score}")


Predicted Sentiment Score: 0.2375


# Emoji sentiment models using XGBoost and LightGBM (an attempt to improve accuracy)

In [46]:
pip install xgboost lightgbm scikit-learn numpy


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [47]:
import gensim
import numpy as np
import pickle
import emoji
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

# Load the pre-trained Emoji2Vec vectors (binary word2vec format)
emoji2vec_path = "emoji2vec/pre-trained/emoji2vec.bin"  # Relative path; change it if the file lives elsewhere
emoji_model = gensim.models.KeyedVectors.load_word2vec_format(emoji2vec_path, binary=True)

# Load the labelled dataset (swap in your own CSV if needed)
import pandas as pd
df = pd.read_csv("dataset/final_data.csv")  # Must provide the 'emoji' and 'score' columns used below


In [48]:
def get_emoji_embedding(emoji_char):
    """Extract the Emoji2Vec embedding for a given emoji.

    Args:
        emoji_char: The emoji string used as the lookup key in ``emoji_model``.

    Returns:
        The model's vector for ``emoji_char``; if the key is not in the
        model's vocabulary, an all-zero vector of the same length and dtype.
    """
    try:
        return emoji_model[emoji_char]
    except KeyError:
        # Size and dtype come from the loaded model instead of a hard-coded
        # 300 / float64, so the fallback always matches the real vectors.
        return np.zeros(emoji_model.vector_size, dtype=emoji_model.vectors.dtype)

# Attach one embedding vector per row, looked up from the emoji column
df["emoji_embedding"] = df["emoji"].apply(get_emoji_embedding)

# Features: the stored vectors stacked row-wise; target: the score column
emoji_vectors = df["emoji_embedding"].values
X = np.vstack(emoji_vectors)
y = df["score"].values

# Hold out 20% of the rows, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [49]:
# Both gradient-boosting models are configured with the same hyper-parameters
shared_params = {"n_estimators": 100, "learning_rate": 0.1, "max_depth": 5}
xgb_model = XGBRegressor(**shared_params)
lgbm_model = LGBMRegressor(**shared_params)

# Fit XGBoost first, then LightGBM, on the training split
xgb_model.fit(X_train, y_train)
lgbm_model.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040374 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76500
[LightGBM] [Info] Number of data points in the train set: 2200, number of used features: 300
[LightGBM] [Info] Start training from score 0.638068


In [50]:
# Score the held-out rows with each booster (XGBoost first, then LightGBM)
y_pred_xgb, y_pred_lgbm = (
    booster.predict(X_test) for booster in (xgb_model, lgbm_model)
)

# Mean squared error of each model's predictions against the true test scores
mse_xgb, mse_lgbm = (
    mean_squared_error(y_test, predictions)
    for predictions in (y_pred_xgb, y_pred_lgbm)
)

print(f"XGBoost MSE: {mse_xgb:.4f}")
print(f"LightGBM MSE: {mse_lgbm:.4f}")


XGBoost MSE: 0.0343
LightGBM MSE: 0.0332


In [51]:
# Keep whichever booster has the strictly lower test MSE; a tie goes to LightGBM
if mse_xgb < mse_lgbm:
    best_model, model_name = xgb_model, "emoji_sentiment_XGB.pkl"
else:
    best_model, model_name = lgbm_model, "emoji_sentiment_LGBM.pkl"

# Pickle the winner under the file name chosen above
with open(model_name, "wb") as model_file:
    pickle.dump(best_model, model_file)

print(f"Best model saved as {model_name}!")


Best model saved as emoji_sentiment_LGBM.pkl!


# Ensemble learning: combining Random Forest, LightGBM, and XGBoost

In [52]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Hyper-parameters shared by the two gradient-boosting members
boosting_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 5,
    "random_state": 42,
}

# Build the three ensemble members, all seeded with 42
rf_model = RandomForestRegressor(random_state=42, max_depth=20, n_estimators=200)
xgb_model = XGBRegressor(**boosting_params)
lgbm_model = LGBMRegressor(**boosting_params)

# Fit each member on the training split: random forest, XGBoost, then LightGBM
rf_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
lgbm_model.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006448 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76500
[LightGBM] [Info] Number of data points in the train set: 2200, number of used features: 300
[LightGBM] [Info] Start training from score 0.638068


In [53]:
# Predict the held-out rows with every ensemble member (RF, XGBoost, LightGBM)
y_pred_rf, y_pred_xgb, y_pred_lgbm = (
    estimator.predict(X_test) for estimator in (rf_model, xgb_model, lgbm_model)
)


In [54]:
# Weighted averaging of the three members' predictions
# (weights: RF 0.5, LightGBM 0.3, XGBoost 0.2 -- adjust based on performance)
weighted_rf = 0.5 * y_pred_rf
weighted_lgbm = 0.3 * y_pred_lgbm
weighted_xgb = 0.2 * y_pred_xgb
y_pred_ensemble = weighted_rf + weighted_lgbm + weighted_xgb

# Mean squared error of the blended prediction on the held-out rows
mse_ensemble = mean_squared_error(y_test, y_pred_ensemble)
print(f"Ensemble Model MSE: {mse_ensemble:.4f}")


Ensemble Model MSE: 0.0340


In [55]:
import pickle

# The ensemble is stored as a tuple of its fitted members: (RF, LightGBM, XGBoost)
ensemble_members = (rf_model, lgbm_model, xgb_model)
with open("emoji_sentiment_ensemble.pkl", "wb") as ensemble_file:
    pickle.dump(ensemble_members, ensemble_file)

print("Ensemble model saved successfully! ✅")


Ensemble model saved successfully! ✅
