In [1]:
import pandas as pd
# Read the dataset (assuming train.txt has text + label separated by tab)
df = pd.read_csv("C:/Users/spars/Downloads/train_emojify.txt", sep=";", names=["text", "label"])

print(df.head())
print(df['label'].value_counts())

                                                text    label
0                            i didnt feel humiliated  sadness
1  i can go from feeling so hopeless to so damned...  sadness
2   im grabbing a minute to post i feel greedy wrong    anger
3  i am ever feeling nostalgic about the fireplac...     love
4                               i am feeling grouchy    anger
label
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64


In [3]:
! pip install nltk
import re
import nltk
from nltk.corpus import stopwords

Collecting nltk
  Using cached nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting click (from nltk)
  Downloading click-8.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.9.18-cp312-cp312-win_amd64.whl.metadata (41 kB)
Using cached nltk-3.9.2-py3-none-any.whl (1.5 MB)
Downloading regex-2025.9.18-cp312-cp312-win_amd64.whl (275 kB)
Downloading click-8.3.0-py3-none-any.whl (107 kB)
Installing collected packages: regex, click, nltk
Successfully installed click-8.3.0 nltk-3.9.2 regex-2025.9.18


In [4]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\spars\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def clean_text(text):
    # lowercase
    text = text.lower()
    
    # remove anything that's not a letter (keep spaces)
    text = re.sub(r'[^a-z\s]', '', text)
    
    # split into words
    words = text.split()
    
    # remove stopwords
    words = [w for w in words if w not in stop_words]
    
    # join back into a sentence
    return " ".join(words)

In [6]:
df['clean_text'] = df['text'].apply(clean_text)

print(df[['text', 'clean_text']].head(10))

                                                text  \
0                            i didnt feel humiliated   
1  i can go from feeling so hopeless to so damned...   
2   im grabbing a minute to post i feel greedy wrong   
3  i am ever feeling nostalgic about the fireplac...   
4                               i am feeling grouchy   
5  ive been feeling a little burdened lately wasn...   
6  ive been taking or milligrams or times recomme...   
7  i feel as confused about life as a teenager or...   
8  i have been with petronas for years i feel tha...   
9                                i feel romantic too   

                                          clean_text  
0                              didnt feel humiliated  
1  go feeling hopeless damned hopeful around some...  
2          im grabbing minute post feel greedy wrong  
3  ever feeling nostalgic fireplace know still pr...  
4                                    feeling grouchy  
5      ive feeling little burdened lately wasnt sure 

In [7]:


from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF object
vectorizer = TfidfVectorizer(max_features=5000)  # limit to 5000 features

# Fit on training data and transform
X = vectorizer.fit_transform(df['clean_text'])

# Labels (y values)
y = df['label']

print("Shape of X:", X.shape)
print("Example vector for 1st sentence:\n", X[0])


Shape of X: (16000, 5000)
Example vector for 1st sentence:
 <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3 stored elements and shape (1, 5000)>
  Coords	Values
  (0, 1171)	0.5951235084078971
  (0, 1630)	0.16379156905484632
  (0, 2142)	0.7867657412767967


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create and train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8621875

Classification Report:
               precision    recall  f1-score   support

       anger       0.89      0.81      0.85       427
        fear       0.85      0.77      0.81       397
         joy       0.82      0.96      0.88      1021
        love       0.89      0.62      0.74       296
     sadness       0.90      0.94      0.92       946
    surprise       0.88      0.47      0.61       113

    accuracy                           0.86      3200
   macro avg       0.87      0.76      0.80      3200
weighted avg       0.87      0.86      0.86      3200



In [9]:
# Define emotion to emoji mapping
emoji_map = {
    "joy": "😄",
    "sadness": "😢",
    "anger": "😡",
    "fear": "😨",
    "love": "❤️",
    "surprise": "😲"
    "cry" "😭",
}


In [10]:
# Example custom input
sample_text = ["owais have a gf so he is happy  "]

# Transform text using the same vectorizer
sample_vec = vectorizer.transform(sample_text)

# Predict emotion
pred_label = model.predict(sample_vec)[0]

# Map to emoji
print("Text:", sample_text[0])
print("Predicted Emotion:", pred_label)
print("Emoji:", emoji_map[pred_label])


Text: owais have a gf so he is happy  
Predicted Emotion: joy
Emoji: 😄
