In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
# Load the emotion dataset from the Kaggle input directory.
file_path = "/kaggle/input/twitter-emotion-dataset/text.csv"
df = pd.read_csv(file_path)

# Column/dtype summary. DataFrame.info() prints its report itself, so it is
# called as a plain statement instead of being packed into a tuple.
df.info()
# Preview the first rows. A bare `df.head()` in the middle of a cell is
# evaluated and then discarded, so the preview has to be emitted explicitly.
print(df.head())

# Drop the 'Unnamed: 0' column (presumably the row index written when the CSV
# was saved -- confirm against the dataset); it is not used as a feature.
df = df.drop(columns=['Unnamed: 0'])

# Integer class id -> emotion name. The position of each name in the tuple
# is its class id (0 = sadness ... 5 = surprise).
emotion_mapping = dict(enumerate((
    "sadness",
    "joy",
    "love",
    "anger",
    "fear",
    "surprise",
)))

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Text classifier: TF-IDF features (vocabulary capped at 10,000 terms) feeding
# a logistic-regression model that may run up to 1,000 solver iterations.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=10000)),
    ('clf', LogisticRegression(max_iter=1000)),
])

# Fit on the training split, then label the held-out texts.
# (fit() returns the fitted pipeline itself, so the two calls can be chained.)
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Evaluation artefacts: the confusion matrix and the per-class
# precision/recall/F1 report as a nested dict (output_dict=True).
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)

# Cell output: both artefacts as a tuple.
report, conf_matrix


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416809 entries, 0 to 416808
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  416809 non-null  int64 
 1   text        416809 non-null  object
 2   label       416809 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 9.5+ MB


({'0': {'precision': 0.9354363569412053,
   'recall': 0.9447130283872567,
   'f1-score': 0.940051807080301,
   'support': 24201},
  '1': {'precision': 0.9141539429090528,
   'recall': 0.9289873597500355,
   'f1-score': 0.9215109624020429,
   'support': 28164},
  '2': {'precision': 0.8007575757575758,
   'recall': 0.762736325588108,
   'f1-score': 0.7812846477936285,
   'support': 6929},
  '3': {'precision': 0.9016436439937052,
   'recall': 0.9014072196486321,
   'f1-score': 0.9015254163206434,
   'support': 11441},
  '4': {'precision': 0.8494680290740546,
   'recall': 0.8405253283302064,
   'f1-score': 0.8449730182846964,
   'support': 9594},
  '5': {'precision': 0.7638136511375948,
   'recall': 0.6973293768545994,
   'f1-score': 0.7290589451913133,
   'support': 3033},
  'accuracy': 0.8973393152755452,
  'macro avg': {'precision': 0.8608788666355313,
   'recall': 0.8459497730931398,
   'f1-score': 0.8530674661787709,
   'support': 83362},
  'weighted avg': {'precision': 0.896275571250

In [2]:
# Classify one hand-written sentence with the fitted pipeline.
custom_text = [
    "I would think that whomever would be lucky enough to stay in this suite must feel like it is the most romantic place on earth"
]

# predict() yields one label per input text; there is only one here.
predicted_label = pipeline.predict(custom_text)[0]
# Translate the integer class id into its emotion name.
predicted_emotion = emotion_mapping[predicted_label]

# Cell output: the predicted emotion alongside the evaluation report.
(predicted_emotion, report)

('love',
 {'0': {'precision': 0.9354363569412053,
   'recall': 0.9447130283872567,
   'f1-score': 0.940051807080301,
   'support': 24201},
  '1': {'precision': 0.9141539429090528,
   'recall': 0.9289873597500355,
   'f1-score': 0.9215109624020429,
   'support': 28164},
  '2': {'precision': 0.8007575757575758,
   'recall': 0.762736325588108,
   'f1-score': 0.7812846477936285,
   'support': 6929},
  '3': {'precision': 0.9016436439937052,
   'recall': 0.9014072196486321,
   'f1-score': 0.9015254163206434,
   'support': 11441},
  '4': {'precision': 0.8494680290740546,
   'recall': 0.8405253283302064,
   'f1-score': 0.8449730182846964,
   'support': 9594},
  '5': {'precision': 0.7638136511375948,
   'recall': 0.6973293768545994,
   'f1-score': 0.7290589451913133,
   'support': 3033},
  'accuracy': 0.8973393152755452,
  'macro avg': {'precision': 0.8608788666355313,
   'recall': 0.8459497730931398,
   'f1-score': 0.8530674661787709,
   'support': 83362},
  'weighted avg': {'precision': 0.896