In [1]:
import pandas as pd
import numpy as np
import nltk
import joblib
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from xgboost import XGBClassifier

# --- NLTK resources ---
# Fetch the 'punkt' package (the downloader reports when it is already up to date).
nltk.download("punkt")

# Register a project-local nltk_data directory as an additional search location.
# Change the path to your nltk_data directory as needed.
nltk.data.path.append("../nltk_data")

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Read the precomputed feature matrices and the cleaned CSV frames from disk.
# Change the paths to your datasets as needed.

# Training split: sparse TF-IDF matrix, dense W2V/TF-IDF array, cleaned frame
tfidf_train = sparse.load_npz("../datasets/tfidf_train_matrix.npz")
w2v_train = np.load("../datasets/w2v_tfidf_train_features.npy")
train_df = pd.read_csv("../datasets/cleaned_train.csv")

# Test split: same three artifacts
tfidf_test = sparse.load_npz("../datasets/tfidf_test_matrix.npz")
w2v_test = np.load("../datasets/w2v_tfidf_test_features.npy")
test_df = pd.read_csv("../datasets/cleaned_test.csv")

print("Loaded train/test features and labels.")

Loaded train/test features and labels.


In [3]:
# Densify the sparse TF-IDF matrices, then append the W2V columns to their right
# so each row carries both feature sets in a single 2-D array.
tfidf_train_dense = tfidf_train.toarray()
tfidf_test_dense = tfidf_test.toarray()

full_train_features = np.concatenate((tfidf_train_dense, w2v_train), axis=1)
full_test_features = np.concatenate((tfidf_test_dense, w2v_test), axis=1)

# Training labels come from the 'target' column of the cleaned training frame
labels = train_df["target"].values

In [4]:
# Alias the combined training matrix and its labels as X / y
X = full_train_features
y = labels

# Hold out a stratified 20% validation split (fixed seed so the split is reproducible)
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Configure the XGBoost classifier.
# NOTE: `use_label_encoder` is intentionally not passed. The installed XGBoost
# ignores it and emits 'Parameters: { "use_label_encoder" } are not used.' on fit.
xgb_model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=42
)

# Fit on the training split only; X_val / y_val are kept aside for evaluation
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [5]:
# Predict on the held-out validation split and summarise the results
y_pred = xgb_model.predict(X_val)

val_accuracy = accuracy_score(y_val, y_pred)
val_f1 = f1_score(y_val, y_pred)
val_report = classification_report(y_val, y_pred)
val_confusion = confusion_matrix(y_val, y_pred)

print("Accuracy:", val_accuracy)
print("F1 Score:", val_f1)
print("\nClassification Report:\n", val_report)
print("\nConfusion Matrix:\n", val_confusion)

Accuracy: 0.7879185817465528
F1 Score: 0.7297071129707113

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.88      0.83       869
           1       0.81      0.67      0.73       654

    accuracy                           0.79      1523
   macro avg       0.79      0.77      0.78      1523
weighted avg       0.79      0.79      0.78      1523


Confusion Matrix:
 [[764 105]
 [218 436]]


In [6]:
# Save evaluations for comparison.
# The score table holds one row per model name; re-running this cell refreshes
# this model's row instead of appending a duplicate.
model_name = "XG Boost"
scores_path = "../evaluation/scores.csv"
metric_columns = ["accuracy", "precision", "recall", "f1score"]

metrics = {
    "model": model_name,
    "accuracy": accuracy_score(y_val, y_pred),
    "precision": precision_score(y_val, y_pred, average="binary"),
    "recall": recall_score(y_val, y_pred, average="binary"),
    "f1score": f1_score(y_val, y_pred, average="binary")
}

# Load the existing table. On a first run the file may be missing or empty;
# start from an empty table in that case rather than crashing.
try:
    scores = pd.read_csv(scores_path)
except (FileNotFoundError, pd.errors.EmptyDataError):
    scores = pd.DataFrame(columns=["model", *metric_columns])

match = scores["model"] == metrics["model"]
if match.any():
    # Update existing row(s) for this model
    scores.loc[match, metric_columns] = tuple(metrics[name] for name in metric_columns)
else:
    # Insert new row; skip the concat when there is nothing to append to,
    # which avoids concatenating with an empty frame.
    new_row = pd.DataFrame([metrics])
    scores = new_row if scores.empty else pd.concat([scores, new_row], ignore_index=True)

scores.to_csv(scores_path, index=False)

print("Model scores saved to evaluation/scores.csv")

Model scores saved to evaluation/scores.csv


In [7]:
# Serialize the fitted classifier with joblib so it can be reloaded elsewhere
model_path = "gradient_boosted_model.pkl"
joblib.dump(xgb_model, model_path)
print("Model saved as gradient_boosted_model.pkl")

Model saved as gradient_boosted_model.pkl


In [8]:
# Run the fitted model over the combined test features (for submission or inspection)
test_preds = xgb_model.predict(full_test_features)

# Use the test frame's own 'id' column when it exists; otherwise fall back to
# positional row numbers so the submission still has an id per prediction.
if 'id' in test_df.columns:
    submission_ids = test_df['id']
else:
    submission_ids = range(len(test_preds))

# Assemble and write the submission file
submission = pd.DataFrame({'id': submission_ids, 'target': test_preds})
submission.to_csv("gradient_boost_submission.csv", index=False)
print("Test predictions saved to gradient_boost_submission.csv")

Test predictions saved to gradient_boost_submission.csv
