In [None]:
# 📦 Install dependencies
!pip install -q imbalanced-learn lightgbm xgboost scikit-learn

# 📚 Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
import warnings
warnings.filterwarnings("ignore")

# 📁 Upload dataset
from google.colab import files

# Opens the Colab file picker; the result is iterated below to get the
# name of the uploaded file.
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty/cancelled upload surfaces as a bare
    # StopIteration from next(iter(...)), which hides the real cause.
    raise ValueError("No file was uploaded; upload a .csv or Excel dataset.")

# 📄 Read dataset
# Only the first uploaded file is used.
file_name = next(iter(uploaded))
# Compare the extension case-insensitively so e.g. "DATA.CSV" is parsed as
# CSV instead of being handed to read_excel.
if file_name.lower().endswith(".csv"):
    df = pd.read_csv(file_name)
else:
    df = pd.read_excel(file_name)

# 🎯 Preprocessing
# Drop the "file_key" column when it exists; errors="ignore" makes this a
# no-op when the column is absent.
df = df.drop("file_key", axis=1, errors="ignore")

# Separate the target column ("emotion") from the remaining columns.
y = df["emotion"]
X = df.drop("emotion", axis=1)

# Encode the emotion labels as integers; `le` is kept so the original class
# names can be recovered from le.classes_ when reporting results.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split text and other features
text_data = X["text"].fillna("")
X_non_text = X.drop(columns="text").fillna(0)

# Train-Test Split
# The split happens BEFORE any fitting or resampling. Fitting the scaler,
# the (label-aware) feature selector, or TF-IDF on all rows, or running SMOTE
# before splitting, leaks test information into training and places synthetic
# points interpolated from training rows into the test set, which inflates
# every reported metric.
(
    X_num_train,
    X_num_test,
    text_train,
    text_test,
    y_train_raw,
    y_test,
) = train_test_split(
    X_non_text,
    text_data,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

# Scale non-text features (statistics learned from the training rows only)
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_num_train)
X_test_scaled = scaler.transform(X_num_test)

# Feature selection (scores computed from training rows and labels only)
selector = SelectKBest(f_classif, k=min(300, X_train_scaled.shape[1]))
X_train_selected = selector.fit_transform(X_train_scaled, y_train_raw)
X_test_selected = selector.transform(X_test_scaled)

# Text vectorization (vocabulary and IDF weights learned from training text only)
vectorizer = TfidfVectorizer(max_features=500)
X_train_text = vectorizer.fit_transform(text_train)
X_test_text = vectorizer.transform(text_test)

# Combine
X_train_combined = hstack([csr_matrix(X_train_selected), X_train_text]).tocsr()
X_test = hstack([csr_matrix(X_test_selected), X_test_text]).tocsr()

# SMOTE
# Oversample the training set only; the test set keeps the real class
# distribution and contains no synthetic samples.
# NOTE(review): SMOTE now sees only the training rows, so very small classes
# may have too few samples for its default neighbour count; lower k_neighbors
# if fit_resample complains.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train_combined, y_train_raw)

# Classifier
# Hyperparameters shared by all three tree ensembles.
tree_params = dict(n_estimators=200, max_depth=10, random_state=42)

lgbm = LGBMClassifier(learning_rate=0.1, class_weight="balanced", **tree_params)
xgb = XGBClassifier(
    learning_rate=0.1,
    eval_metric="mlogloss",
    use_label_encoder=False,
    **tree_params,
)
rf = RandomForestClassifier(class_weight="balanced", **tree_params)

# Soft-voting ensemble over the three models, fitted on the training split.
base_models = [("lgbm", lgbm), ("xgb", xgb), ("rf", rf)]
voting_clf = VotingClassifier(estimators=base_models, voting="soft", n_jobs=-1)
voting_clf.fit(X_train, y_train)

# Predict Probabilities
probs = voting_clf.predict_proba(X_test)
class_labels = le.classes_
n_classes = len(class_labels)

# Find Best Thresholds
# Thresholds are tuned on out-of-fold predictions over the TRAINING data.
# Tuning them on y_test/probs and then scoring on the same test set would
# report optimistically biased metrics. cross_val_predict fits clones, so the
# already-fitted voting_clf is left untouched.
from sklearn.model_selection import cross_val_predict

tuning_probs = cross_val_predict(
    voting_clf, X_train, y_train, cv=3, method="predict_proba"
)
y_train_arr = np.asarray(y_train)

best_thresholds = []
for i in range(n_classes):
    # One-vs-rest precision/recall curve for class i.
    precision, recall, thresholds = precision_recall_curve(
        (y_train_arr == i).astype(int), tuning_probs[:, i]
    )
    # The small epsilon avoids 0/0 where precision and recall are both zero.
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-6)
    # precision/recall have one more entry than thresholds (a final point with
    # no associated threshold); exclude it so the argmax is always a valid
    # index into thresholds.
    best_thresh = thresholds[np.argmax(f1_scores[:-1])]
    best_thresholds.append(best_thresh)
    print(f"🔧 Best threshold for class '{class_labels[i]}' = {best_thresh:.3f}")

# Correct Threshold Logic
# A class is eligible for a sample only if its probability reaches that
# class's threshold; among eligible classes the most probable one wins.
threshold_row = np.asarray(best_thresholds)
eligible_probs = np.where(probs >= threshold_row, probs, -np.inf)
# When no class clears its threshold, fall back to the plain argmax.
none_eligible = np.isneginf(eligible_probs).all(axis=1)
y_pred_custom = np.where(
    none_eligible, probs.argmax(axis=1), eligible_probs.argmax(axis=1)
)

# Evaluation
# Pin the label set to every encoded class. Without it, the report and the
# confusion matrix only cover classes that occur in y_test or the predictions,
# so a missing class makes target_names/display_labels mismatch in length.
label_ids = np.arange(len(class_labels))

print("\n📊 Classification Report:\n")
print(
    classification_report(
        y_test, y_pred_custom, labels=label_ids, target_names=class_labels
    )
)
print(f"✅ Accuracy: {accuracy_score(y_test, y_pred_custom):.4f}")

# Rows are true classes, columns are predicted classes, both in label_ids order.
cm = confusion_matrix(y_test, y_pred_custom, labels=label_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap="Blues", values_format="d")
plt.title("Confusion Matrix - Text + Non-Text Fusion")
plt.show()