In [1]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
# Step 1: Load the emoji dataset
dataset = load_dataset("tweet_eval", "emoji")


In [3]:
# Step 2: Convert to Pandas DataFrame
train_df = pd.DataFrame(dataset['train'])
test_df = pd.DataFrame(dataset['test'])

In [4]:
# Optional: Combine train and test sets (or use train only)
df = pd.concat([train_df, test_df], ignore_index=True)

In [5]:
# Step 3: Prepare features and labels
X = df['text']
y = df['label']

In [6]:
# Step 4: Encode the labels to emojis
label_mapping = dataset['train'].features['label'].names
label_encoder = LabelEncoder()
label_encoder.fit(range(len(label_mapping)))
y_labels = label_encoder.transform(y)

In [7]:
# Step 5: Vectorize text using TF-IDF
# vectorizer = TfidfVectorizer(max_features=1000)
# X_vectorized = vectorizer.fit_transform(X)
# Increase number of features from 1000 to 5000
vectorizer = TfidfVectorizer(max_features=5000)
X_vectorized = vectorizer.fit_transform(df['text'])

In [8]:
# Step 6: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y_labels, test_size=0.2, random_state=42)


In [9]:
# Step 7: Train classifier
# clf = LogisticRegression(max_iter=1000)
# clf.fit(X_train, y_train)
# Add class_weight='balanced' to help with imbalance
clf = LogisticRegression(max_iter=1000, class_weight='balanced')
clf.fit(X_train, y_train)

In [10]:
# Step 8: Evaluate performance
y_pred = clf.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_mapping))


Classification Report:
              precision    recall  f1-score   support

           ❤       0.51      0.09      0.16      4104
           😍       0.29      0.16      0.21      1966
           😂       0.44      0.31      0.36      1838
           💕       0.14      0.15      0.14       942
           🔥       0.42      0.39      0.40      1154
           😊       0.08      0.08      0.08       740
           😎       0.14      0.13      0.14       780
           ✨       0.23      0.23      0.23      1037
           💙       0.09      0.17      0.12       525
           😘       0.10      0.23      0.14       488
           📷       0.15      0.21      0.18       714
          🇺🇸       0.36      0.55      0.44       554
           ☀       0.23      0.51      0.32       520
           💜       0.08      0.16      0.11       442
           😉       0.08      0.14      0.10       518
           💯       0.12      0.29      0.17       397
           😁       0.07      0.12      0.09       481
    

In [11]:
# Step 9: Emoji prediction function
def predict_emoji(text):
    vec = vectorizer.transform([text])
    pred_label = clf.predict(vec)[0]
    return label_mapping[pred_label]

In [15]:
# Step 10: Interactive prediction
while True:
    sentence = input("Enter a sentence (or 'exit' to stop): ")
    if sentence.lower() == 'exit':
        break
    emoji = predict_emoji(sentence)
    print("Predicted Emoji:", emoji)

Enter a sentence (or 'exit' to stop):  exit
