In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.pipeline import make_pipeline

In [2]:
train_df = pd.read_csv("../datasets/cleaned_train.csv")
texts = train_df['text_final'].fillna("")
labels = train_df['target']

tokenized = texts.apply(word_tokenize).tolist()

In [3]:
from gensim.models import Word2Vec
w2v_model = Word2Vec(sentences=tokenized, vector_size=100, window=5, min_count=2, workers=4, sg=1, seed=42)

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(texts)
idf_weights = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

In [4]:
def get_weighted_w2v(text, model, idf_dict):
    tokens = word_tokenize(text)
    word_vecs = []
    weight_sum = 0
    for word in tokens:
        if word in model.wv and word in idf_dict:
            vec = model.wv[word] * idf_dict[word]
            word_vecs.append(vec)
            weight_sum += idf_dict[word]
    if word_vecs:
        return np.sum(word_vecs, axis=0) / weight_sum
    else:
        return np.zeros(model.vector_size)
    
embedding_features = np.array([get_weighted_w2v(text, w2v_model, idf_weights) for text in texts])

In [5]:
X_tfidf = tfidf_matrix
X_embed = embedding_features
y = labels
X_train_tfidf, X_val_tfidf, y_train, y_val = train_test_split(X_tfidf, y, test_size=0.2, stratify=y, random_state=42)
X_train_embed, X_val_embed = train_test_split(X_embed, test_size=0.2, stratify=y, random_state=42)

model_lr = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
model_rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42).fit(X_train_embed, y_train)
model_xgb = XGBClassifier(n_estimators=300, learning_rate=0.05, max_depth=6, subsample=0.8,
                          colsample_bytree=0.8, use_label_encoder=False, eval_metric='logloss', random_state=42).fit(X_train_embed, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [6]:
pred_lr = model_lr.predict_proba(X_val_tfidf)[:, 1]
pred_rf = model_rf.predict_proba(X_val_embed)[:, 1]
pred_xgb = model_xgb.predict_proba(X_val_embed)[:, 1]

X_stack = np.vstack((pred_lr, pred_rf, pred_xgb)).T

meta_clf = LogisticRegression()
meta_clf.fit(X_stack, y_val)  # Yes, use y_val here — we're stacking on validation data


In [7]:
final_preds = meta_clf.predict(X_stack)
print("Ensemble Accuracy:", accuracy_score(y_val, final_preds))
print("\nClassification Report:\n", classification_report(y_val, final_preds))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, final_preds))

Ensemble Accuracy: 0.8102429415627052

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.89      0.84       869
           1       0.83      0.70      0.76       654

    accuracy                           0.81      1523
   macro avg       0.81      0.80      0.80      1523
weighted avg       0.81      0.81      0.81      1523


Confusion Matrix:
 [[776  93]
 [196 458]]


In [8]:
# Save evaluations for comparison
model_name = "Ensemble"

metrics = {
    "model": model_name,
    "accuracy": accuracy_score(y_val, final_preds),
    "precision": precision_score(y_val, final_preds, average="binary"),
    "recall": recall_score(y_val, final_preds, average="binary"),
    "f1score": f1_score(y_val, final_preds, average="binary")
}

scores = pd.read_csv("../evaluation/scores.csv")

match = scores["model"] == metrics["model"]
if match.any():
    # Update existing row
    scores.loc[match, ["accuracy", "precision", "recall", "f1score"]] = metrics["accuracy"], metrics["precision"], metrics["recall"], metrics["f1score"]
else:
    # Insert new row
    scores = pd.concat([scores, pd.DataFrame([metrics])], ignore_index=True)

scores.to_csv("../evaluation/scores.csv", index=False)

print("Model scores saved to evaluation/scores.csv")

Model scores saved to evaluation/scores.csv
