In [None]:
# STEP 1: Install Required Libraries
!pip install emoji tensorflow nltk

# STEP 2: Import Libraries
import pandas as pd
import numpy as np
import nltk
import emoji
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# STEP 3: Download NLTK Data (once only)
# NOTE(review): nothing in this cell visibly uses the punkt data; the download
# is kept so the cell's side effects stay the same.
nltk.download('punkt')

# STEP 4: Sample Dataset (You can replace this with your own dataset)
# Simulated dataset with text and emojis: six sentences, one emoji each.
data = pd.DataFrame({
    'text': ['I love pizza', 'Good night', 'I am so happy', 'Feeling sad today', 'Let’s go party', 'I am angry'],
    'emoji': ['🍕', '🌙', '😊', '😢', '🎉', '😡']
})

# STEP 5: Preprocess Text
data['text'] = data['text'].str.lower()

# STEP 6: Tokenize Text
# Fit the vocabulary on the sentences, turn each sentence into a list of word
# indices, then pad every list to a fixed length of 10.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['text'])
sequences = tokenizer.texts_to_sequences(data['text'])
padded = pad_sequences(sequences, maxlen=10)

# STEP 7: Encode Emoji Labels
# emoji -> integer class id, plus the inverse table used when predicting.
emoji_mapping = {e: i for i, e in enumerate(data['emoji'].unique())}
reverse_mapping = {i: e for e, i in emoji_mapping.items()}
data['emoji_label'] = data['emoji'].map(emoji_mapping)
y = np.array(data['emoji_label'])

# STEP 8: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(padded, y, test_size=0.2, random_state=42)

# STEP 9: Define Model
# The vocabulary size is len(word_index) + 1 so that index 0, which
# pad_sequences uses as filler, has its own embedding row.
# `input_length` is intentionally not passed: Keras 3 deprecates it and only
# emits a warning; the sequence length is inferred from the data.
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64),
    LSTM(64),
    Dense(len(emoji_mapping), activation='softmax')
])
# Labels are integer class ids, hence the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# STEP 10: Train Model
model.fit(X_train, y_train, epochs=10, validation_split=0.1)

# Score the model on the held-out split, which was otherwise never used.
# With one sentence per emoji the number is not meaningful for this toy data,
# but it becomes useful once a real dataset is plugged in.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test loss: {test_loss:.4f} - Test accuracy: {test_accuracy:.4f}")

# STEP 11: Test Prediction
def predict_emoji(text):
    """Return the emoji that the trained model scores highest for ``text``.

    The text is lower-cased, converted to word indices with the fitted
    ``tokenizer``, padded to length 10 and passed to ``model``. The position
    of the largest score is then looked up in ``reverse_mapping``.
    """
    lowered = text.lower()
    model_input = pad_sequences(tokenizer.texts_to_sequences([lowered]), maxlen=10)
    scores = model.predict(model_input)
    # Only one sample is predicted, so the flattened argmax is its class id.
    return reverse_mapping[np.argmax(scores)]

# Example: show a sample sentence and the emoji predicted for it.
sample_text = "I am very happy"
print(f"Text: {sample_text}")
print("Predicted Emoji:", predict_emoji(sample_text))




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Epoch 1/10




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.3333 - loss: 1.7820 - val_accuracy: 1.0000 - val_loss: 1.7717
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 180ms/step - accuracy: 0.3333 - loss: 1.7672 - val_accuracy: 0.0000e+00 - val_loss: 1.7800
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 299ms/step - accuracy: 0.6667 - loss: 1.7519 - val_accuracy: 0.0000e+00 - val_loss: 1.7890
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step - accuracy: 0.6667 - loss: 1.7358 - val_accuracy: 0.0000e+00 - val_loss: 1.7990
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step - accuracy: 0.6667 - loss: 1.7185 - val_accuracy: 0.0000e+00 - val_loss: 1.8104
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 274ms/step - accuracy: 0.6667 - loss: 1.6998 - val_accuracy: 0.0000e+00 - val_loss: 1.8235
Epoch 7/10
[1m1/1[0m [32m━━━━━━

In [4]:
# STEP 1: Install Required Libraries
!pip install emoji tensorflow nltk

# STEP 2: Import Libraries
import pandas as pd
import numpy as np
import nltk
import re
import emoji
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# STEP 3: Download NLTK Data
nltk.download('punkt')

# STEP 4: Load Dataset (from CSV)
# Point this at your own file if the CSV lives somewhere else.
dataset_path = "/content/large_emoji_dataset.csv"
data = pd.read_csv(dataset_path)

# STEP 5: Text Preprocessing
def clean_text(text):
    """Normalize a raw message before it is tokenized.

    Lower-cases the text, then removes URLs, ``@mentions``, ``#`` signs and
    every character that is neither a word character nor whitespace.
    Whitespace is left untouched, so removed tokens can leave repeated spaces.

    Args:
        text: The message to clean. ``None`` and NaN (what pandas produces for
            an empty CSV cell) are treated as an empty message; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # "http\S+" already covers "https...", and the pattern has no anchors, so
    # neither a separate https alternative nor re.MULTILINE is needed.
    text = re.sub(r"http\S+|www\S+", '', text)
    # Drop whole @mentions but only the '#' of a hashtag, keeping its word.
    text = re.sub(r'@\w+|#', '', text)
    # Strip remaining punctuation and symbols.
    text = re.sub(r'[^\w\s]', '', text)
    return text

data['text'] = data['text'].apply(clean_text)

# STEP 6: Tokenization
# Words not seen while fitting are mapped to the "<OOV>" token instead of
# being dropped when new text is encoded.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(data['text'])
sequences = tokenizer.texts_to_sequences(data['text'])
# Pad every sequence to the length of the longest message in the dataset.
max_len = max(len(seq) for seq in sequences)
padded = pad_sequences(sequences, maxlen=max_len)

# STEP 7: Emoji Encoding
# emoji -> integer class id, plus the inverse table used when predicting.
emoji_mapping = {e: i for i, e in enumerate(data['emoji'].unique())}
reverse_mapping = {i: e for e, i in emoji_mapping.items()}
data['emoji_label'] = data['emoji'].map(emoji_mapping)
y = np.array(data['emoji_label'])

# STEP 8: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(padded, y, test_size=0.2, random_state=42)

# STEP 9: Define the LSTM Model
# The vocabulary size is len(word_index) + 1 so that index 0, which
# pad_sequences uses as filler, has its own embedding row.
# `input_length` is intentionally not passed: Keras 3 deprecates it and only
# emits a warning; the sequence length is inferred from the data.
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128),
    LSTM(64),
    Dropout(0.5),
    Dense(len(emoji_mapping), activation='softmax')
])
# Labels are integer class ids, hence the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# STEP 10: Train the Model
model.fit(X_train, y_train, epochs=10, validation_split=0.1)

# Score the model on the held-out split, which was otherwise never used.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test loss: {test_loss:.4f} - Test accuracy: {test_accuracy:.4f}")

# STEP 11: Emoji Prediction Function for User Input
def predict_emoji(text):
    """Predict an emoji for ``text`` and describe the result as a string.

    The message goes through ``clean_text``, is encoded with the fitted
    ``tokenizer``, padded to ``max_len`` and scored by ``model``. The returned
    string names the highest-scoring emoji and that score, formatted to two
    decimal places.
    """
    cleaned = clean_text(text)
    encoded = tokenizer.texts_to_sequences([cleaned])
    model_input = pad_sequences(encoded, maxlen=max_len)
    # A single message is predicted, so take the first (only) row of scores.
    scores = model.predict(model_input)[0]
    best = np.argmax(scores)
    return f"Predicted Emoji: {reverse_mapping[best]} (Confidence: {scores[best]:.2f})"

# STEP 12: Try it with User Input
# Read one message from the user and print the formatted prediction for it.
user_input = input("Enter a message: ")
prediction = predict_emoji(user_input)
print(prediction)


Epoch 1/10


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 98ms/step - accuracy: 0.2289 - loss: 2.9806 - val_accuracy: 0.6250 - val_loss: 2.9380
Epoch 2/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.6058 - loss: 2.9257 - val_accuracy: 0.7917 - val_loss: 2.8705
Epoch 3/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.6059 - loss: 2.8647 - val_accuracy: 0.7917 - val_loss: 2.7788
Epoch 4/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.6520 - loss: 2.7702 - val_accuracy: 0.7917 - val_loss: 2.6415
Epoch 5/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - accuracy: 0.6533 - loss: 2.6396 - val_accuracy: 0.7917 - val_loss: 2.4228
Epoch 6/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.5719 - loss: 2.4158 - val_accuracy: 0.7083 - val_loss: 2.0753
Epoch 7/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m