In [1]:
# Core packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler
from xgboost import XGBClassifier


In [2]:
df = pd.read_csv("emotion_dataset_raw.csv")

In [3]:
# Encode labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Encoded_Emotion'] = le.fit_transform(df['Emotion'])


In [4]:
# Features and Labels
X = df['Text']
y = df['Encoded_Emotion']


In [5]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)

In [6]:
# Handle Imbalance with Oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_tfidf, y)


In [7]:
# Split into training and test data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)


In [8]:
# Train XGBoost Classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [9]:
# Evaluate
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.7752753885619436
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.71      0.74      3355
           1       0.80      0.97      0.88      3261
           2       0.86      0.69      0.77      3365
           3       0.59      0.62      0.61      3301
           4       0.75      0.95      0.84      3292
           5       0.70      0.58      0.64      3341
           6       1.00      1.00      1.00      3298
           7       0.73      0.68      0.71      3295

    accuracy                           0.78     26508
   macro avg       0.78      0.78      0.77     26508
weighted avg       0.78      0.78      0.77     26508



In [10]:
# Check the type and content of X_train
print(type(X_train))  # Should be a list or pandas Series of strings
print(X_train[:5])    # Print the first 5 entries to inspect



<class 'scipy.sparse._csr.csr_matrix'>
  (0, 33150)	0.08388529729376831
  (0, 32607)	0.15816025092914965
  (0, 35786)	0.16213310073637524
  (0, 23415)	0.09862935352133702
  (0, 2663)	0.13147734949382983
  (0, 36685)	0.24085935653277896
  (0, 35508)	0.12084526454479673
  (0, 26788)	0.19927753636892923
  (0, 13562)	0.1803888824608133
  (0, 19645)	0.2672684328994135
  (0, 33172)	0.16542554891228606
  (0, 902)	0.273490876601205
  (0, 8985)	0.1936346432547138
  (0, 12909)	0.23412303097862916
  (0, 31003)	0.2643146567919258
  (0, 9243)	0.3290939907851372
  (0, 22469)	0.3852329893650172
  (0, 22486)	0.42107731991765124
  (1, 14571)	0.3364235320953441
  (1, 9532)	0.42338980735792026
  (1, 27679)	0.8411660228979273
  (2, 1953)	0.10836614604545461
  (2, 32722)	0.20658478267310532
  (2, 29299)	0.19388130193118203
  (2, 18726)	0.3643831479508764
  :	:
  (3, 34534)	0.395683109494895
  (3, 29546)	0.7649671775145319
  (4, 33150)	0.07161128429041762
  (4, 12283)	0.09742149238842564
  (4, 32607)	0.0675

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Initialize the vectorizer
vectorizer = TfidfVectorizer()

# Step 2: Clean the data
X_train_clean = [str(text) if text is not None else "" for text in X_train]

# Step 3: Apply vectorization
X_tfidf = vectorizer.fit_transform(X_train_clean)


In [14]:
# Step 2: Apply RandomOverSampler on the sparse matrix (no conversion to dense)
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_tfidf, y_train)


In [15]:
# Step 3: Train the XGBoost model on the resampled sparse matrix
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_resampled, y_resampled)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [18]:
# Save the trained model, vectorizer, and oversampler together in a pipeline format
import joblib
from sklearn.pipeline import Pipeline


# Create a pipeline containing the steps
pipe_updated = Pipeline([
    ('tfidf', vectorizer),           # Use the same vectorizer
    ('model', model)                  # Use the trained model
])

# Save the updated pipeline
joblib.dump(pipe_updated, "emotion_classifier_xgb_updated.pkl")


['emotion_classifier_xgb_updated.pkl']

In [19]:
import pickle

# Save model
with open("emotion_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

# Save vectorizer
with open("vectorizer.pkl", "wb") as vec_file:
    pickle.dump(tfidf, vec_file)
