# Logistic Regression

In [None]:
# !pip install --upgrade gensim
# !pip install tensorflow_hub
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import tensorflow as tf
import tensorflow_hub as hub


Defaulting to user installation because normal site-packages is not writeable
Collecting tensorflow_hub
  Downloading tensorflow_hub-0.16.1-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading tensorflow_hub-0.16.1-py2.py3-none-any.whl (30 kB)
Installing collected packages: tensorflow_hub
Successfully installed tensorflow_hub-0.16.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m


In [9]:
df = pd.read_csv("en_hf_112024.csv")

def basic_preprocessing(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r"@\w+", '', text)
    text = re.sub(r"[^\w\s]", '', text)
    text = text.strip()
    return text

df['clean_text'] = df['text'].astype(str).apply(basic_preprocessing)

X = df['clean_text'].values
y = df['labels'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [17]:
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from imblearn.under_sampling import RandomUnderSampler

tfidf_vectorizer = TfidfVectorizer(
    min_df=3, 
    max_df=0.9, 
    ngram_range=(1,2),  
    max_features=10000,  # Reduced vocabulary size for better precision
    analyzer='word',  
    stop_words='english'  
)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Apply undersampling instead of SMOTE
rus = RandomUnderSampler(sampling_strategy=0.6, random_state=42)
X_train_tfidf_resampled, y_train_resampled = rus.fit_resample(X_train_tfidf, y_train)

##########################
# 2) Logistic Regression (Optimized for Balanced Recall & Precision)
##########################
lr_tfidf = LogisticRegression(
    max_iter=1000, 
    C=0.2,  # More regularization to improve generalization
    class_weight="balanced",
    solver="liblinear",  
    n_jobs=-1  
)

lr_tfidf.fit(X_train_tfidf_resampled, y_train_resampled)

# Get probabilities for threshold adjustment
y_proba_tfidf = lr_tfidf.predict_proba(X_test_tfidf)[:, 1]

# Adjust threshold to improve precision
threshold = 0.5  # Increased threshold to reduce false positives
y_pred_tfidf = (y_proba_tfidf >= threshold).astype(int)

print("=== TF-IDF (Optimized for Precision & Recall) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_tfidf))
print("Recall:", precision_recall_fscore_support(y_test, y_pred_tfidf, average='binary')[1])
print("Precision:", precision_recall_fscore_support(y_test, y_pred_tfidf, average='binary')[0])
print(classification_report(y_test, y_pred_tfidf))



=== TF-IDF (Optimized for Precision & Recall) ===
Accuracy: 0.7763075770815129
Recall: 0.703169881001231
Precision: 0.57002079002079
              precision    recall  f1-score   support

         0.0       0.88      0.80      0.84     52603
         1.0       0.57      0.70      0.63     19496

    accuracy                           0.78     72099
   macro avg       0.72      0.75      0.73     72099
weighted avg       0.80      0.78      0.78     72099



# Word2Vec

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_recall_fscore_support
)
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.filterwarnings('ignore')

# 1. Load and Preprocess Dataset
df = pd.read_csv("en_hf_112024.csv")

def basic_preprocessing(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r"@\w+", '', text)
    text = re.sub(r"[^\w\s]", '', text)
    text = text.strip()
    return text

df['clean_text'] = df['text'].astype(str).apply(basic_preprocessing)

X = df['clean_text'].values
y = df['labels'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 2. Train Word2Vec Model on Training Data
train_tokens = [simple_preprocess(doc) for doc in X_train]

w2v_model = Word2Vec(
    sentences=train_tokens,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    sg=1,  # skip-gram
    epochs=10
)

# 3. Convert Sentences to Average Word2Vec Vectors
def get_average_vector(tokens, model, vector_size=100):
    valid_tokens = [t for t in tokens if t in model.wv.key_to_index]
    if not valid_tokens:
        return np.zeros(vector_size)
    return np.mean([model.wv[t] for t in valid_tokens], axis=0)

X_train_w2v = np.array([
    get_average_vector(simple_preprocess(doc), w2v_model)
    for doc in X_train
])

X_test_w2v = np.array([
    get_average_vector(simple_preprocess(doc), w2v_model)
    for doc in X_test
])

# 4. Apply Random UnderSampling to handle class imbalance
rus = RandomUnderSampler(sampling_strategy=0.6, random_state=42)
X_train_w2v_resampled, y_train_resampled = rus.fit_resample(X_train_w2v, y_train)

# 5. Train Logistic Regression
lr_w2v = LogisticRegression(
    max_iter=1000,
    C=0.2,
    class_weight='balanced',
    solver='liblinear',
    n_jobs=-1
)

lr_w2v.fit(X_train_w2v_resampled, y_train_resampled)

# 6. Get Probabilities for Threshold Tuning
y_proba_w2v = lr_w2v.predict_proba(X_test_w2v)[:, 1]

# 7. Threshold Tuning Loop
print("=== Threshold Tuning (Word2Vec + Logistic Regression) ===")
best_f1 = 0
best_threshold = 0.5

for threshold in [0.4, 0.45, 0.48, 0.5, 0.52, 0.55]:
    y_pred_thresh = (y_proba_w2v >= threshold).astype(int)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred_thresh, average='binary', zero_division=0
    )
    print(f"Threshold: {threshold:.2f} | Precision: {precision:.3f} | Recall: {recall:.3f} | F1: {f1:.3f}")
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

# 8. Final Evaluation Using Best Threshold
print("\n=== Final Evaluation at Best Threshold ===")
y_pred_final = (y_proba_w2v >= best_threshold).astype(int)
print(f"Best Threshold: {best_threshold:.2f}")
print("Accuracy:", accuracy_score(y_test, y_pred_final))
print("Recall:", precision_recall_fscore_support(y_test, y_pred_final, average='binary')[1])
print("Precision:", precision_recall_fscore_support(y_test, y_pred_final, average='binary')[0])
print(classification_report(y_test, y_pred_final))


=== Threshold Tuning (Word2Vec + Logistic Regression) ===
Threshold: 0.40 | Precision: 0.401 | Recall: 0.851 | F1: 0.545
Threshold: 0.45 | Precision: 0.434 | Recall: 0.774 | F1: 0.556
Threshold: 0.48 | Precision: 0.458 | Recall: 0.719 | F1: 0.560
Threshold: 0.50 | Precision: 0.474 | Recall: 0.684 | F1: 0.560
Threshold: 0.52 | Precision: 0.490 | Recall: 0.648 | F1: 0.558
Threshold: 0.55 | Precision: 0.514 | Recall: 0.591 | F1: 0.550

=== Final Evaluation at Best Threshold ===
Best Threshold: 0.50
Accuracy: 0.7091083094078975
Recall: 0.6844480919162905
Precision: 0.47377951358068526
              precision    recall  f1-score   support

         0.0       0.86      0.72      0.78     52603
         1.0       0.47      0.68      0.56     19496

    accuracy                           0.71     72099
   macro avg       0.67      0.70      0.67     72099
weighted avg       0.76      0.71      0.72     72099



# XGBoost

In [19]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, make_scorer, precision_recall_fscore_support
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings('ignore', category=UserWarning)

# 1.1 Load dataset
df = pd.read_csv("en_hf_112024.csv")

# 1.2 Basic text cleaning
def basic_preprocessing(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r"@\w+", '', text)
    text = re.sub(r"[^\w\s]", '', text)
    text = text.strip()
    return text

df["clean_text"] = df["text"].astype(str).apply(basic_preprocessing)

# 1.3 Separate features & labels
X = df["clean_text"].values
y = df["labels"].values

# 1.4 Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 1.5 Vectorize with TF-IDF
tfidf = TfidfVectorizer(
    min_df=2,
    max_df=0.95,
    ngram_range=(1,2),
    max_features=30000,     # Adjust based on memory/performance
    stop_words="english"    # Common practice to remove English stopwords
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf  = tfidf.transform(X_test)

print("TF-IDF shape:", X_train_tfidf.shape)


TF-IDF shape: (288394, 30000)


In [None]:
# from sklearn.metrics import recall_score, make_scorer

# # We want recall for the positive class = 1
# recall_scorer = make_scorer(recall_score, pos_label=1)

# xgb_clf = XGBClassifier(
#     objective='binary:logistic',
#     eval_metric='logloss',
#     random_state=42
# )

# # Wide param distributions for random search
# param_distributions = {
#     "n_estimators":      [100, 300, 500, 700],
#     "max_depth":         [3, 5, 7, 9],
#     "learning_rate":     [0.01, 0.05, 0.1, 0.2],
#     "subsample":         [0.6, 0.8, 1.0],
#     "colsample_bytree":  [0.6, 0.8, 1.0],
#     # If class 1 is ~27% of data: scale_pos_weight ~ (majority/minority). 
#     # For exact ratio: scale_pos_weight = (count(0) / count(1)).
#     # This helps XGBoost handle imbalance better. Let's guess around 2..5.
#     "scale_pos_weight":  [1, 2, 3, 4, 5]  
# }
# random_search = RandomizedSearchCV(
#     estimator=xgb_clf,
#     param_distributions=param_distributions,
#     n_iter=20,                    # number of random samples
#     scoring=recall_scorer,        # focusing on recall for class 1
#     cv=3,                         # 3-fold cross-validation
#     verbose=2,
#     random_state=42,
#     n_jobs=-1                     # use all CPU cores
# )

# random_search.fit(X_train_tfidf, y_train)

# print("\n=== Random Search Results ===")
# print("Best Params:", random_search.best_params_)
# print("Best Recall Score:", random_search.best_score_)

# best_params = random_search.best_params_


# param_grid = {
#     "n_estimators":     [best_params['n_estimators'] - 100, best_params['n_estimators'], best_params['n_estimators'] + 100],
#     "max_depth":        [best_params['max_depth'] - 1, best_params['max_depth'], best_params['max_depth'] + 1],
#     "learning_rate":    [best_params['learning_rate'] * 0.5, best_params['learning_rate'], best_params['learning_rate'] * 1.5],
#     "scale_pos_weight": [max(best_params['scale_pos_weight']-1,1), best_params['scale_pos_weight'], best_params['scale_pos_weight']+1]
# }
# # We can keep sub-sample, colsample_bytree the same or also do a small +/- 0.1.

# grid_search = GridSearchCV(
#     estimator=XGBClassifier(
#         objective='binary:logistic',
#         eval_metric='logloss',
#         use_label_encoder=False,
#         random_state=42
#     ),
#     param_grid=param_grid,
#     scoring=recall_scorer,
#     cv=3,
#     verbose=2,
#     n_jobs=-1
# )

# grid_search.fit(X_train_tfidf, y_train)

# print("\n=== Grid Search Results ===")
# print("Best Params:", grid_search.best_params_)
# print("Best Recall Score:", grid_search.best_score_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=5, n_estimators=500, scale_pos_weight=2, subsample=0.8; total time=10.6min
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=5, n_estimators=500, scale_pos_weight=2, subsample=0.8; total time=10.9min
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=5, n_estimators=500, scale_pos_weight=2, subsample=0.8; total time=10.9min
[CV] END colsample_bytree=0.6, learning_rate=0.2, max_depth=7, n_estimators=300, scale_pos_weight=2, subsample=1.0; total time=12.9min
[CV] END colsample_bytree=0.6, learning_rate=0.2, max_depth=7, n_estimators=300, scale_pos_weight=2, subsample=1.0; total time=13.2min
[CV] END colsample_bytree=0.6, learning_rate=0.2, max_depth=7, n_estimators=300, scale_pos_weight=2, subsample=1.0; total time=13.4min
[CV] END colsample_bytree=1.0, learning_rate=0.05, max_depth=3, n_estimators=100, scale_pos_weight=4, subsample=0.6; total tim

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=6, n_estimators=0, scale_pos_weight=3; total time=   1.1s
[CV] END learning_rate=0.005, max_depth=6, n_estimators=0, scale_pos_weight=4; total time=   1.1s
[CV] END learning_rate=0.005, max_depth=6, n_estimators=0, scale_pos_weight=4; total time=   1.1s
[CV] END learning_rate=0.005, max_depth=6, n_estimators=0, scale_pos_weight=3; total time=   1.1s
[CV] END learning_rate=0.005, max_depth=6, n_estimators=0, scale_pos_weight=4; total time=   1.1s
[CV] END learning_rate=0.005, max_depth=6, n_estimators=0, scale_pos_weight=5; total time=   1.1s
[CV] END learning_rate=0.005, max_depth=6, n_estimators=0, scale_pos_weight=3; total time=   1.1s
[CV] END learning_rate=0.005, max_depth=6, n_estimators=0, scale_pos_weight=5; total time=   1.1s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=6, n_estimators=0, scale_pos_weight=5; total time=   1.0s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=6, n_estimators=100, scale_pos_weight=3; total time= 6.3min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=6, n_estimators=100, scale_pos_weight=4; total time= 6.6min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=6, n_estimators=100, scale_pos_weight=3; total time= 6.7min
[CV] END learning_rate=0.005, max_depth=6, n_estimators=100, scale_pos_weight=4; total time= 6.7min


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=6, n_estimators=100, scale_pos_weight=5; total time= 6.8min
[CV] END learning_rate=0.005, max_depth=6, n_estimators=100, scale_pos_weight=3; total time= 6.8min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=6, n_estimators=100, scale_pos_weight=5; total time= 6.7min


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=6, n_estimators=100, scale_pos_weight=4; total time= 6.9min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=0, scale_pos_weight=3; total time=   1.2s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=0, scale_pos_weight=3; total time=   1.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=0, scale_pos_weight=3; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=0, scale_pos_weight=4; total time=   1.4s
[CV] END learning_rate=0.005, max_depth=6, n_estimators=100, scale_pos_weight=5; total time= 7.0min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=0, scale_pos_weight=4; total time=   1.3s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=0, scale_pos_weight=4; total time=   1.2s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=0, scale_pos_weight=5; total time=   1.3s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=0, scale_pos_weight=5; total time=   1.3s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=0, scale_pos_weight=5; total time=   1.5s


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=6, n_estimators=200, scale_pos_weight=3; total time=13.5min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=6, n_estimators=200, scale_pos_weight=3; total time=13.9min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=100, scale_pos_weight=3; total time=11.1min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=100, scale_pos_weight=3; total time=11.2min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=6, n_estimators=200, scale_pos_weight=3; total time=14.4min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=6, n_estimators=200, scale_pos_weight=4; total time=14.3min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=6, n_estimators=200, scale_pos_weight=5; total time=14.2min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=6, n_estimators=200, scale_pos_weight=4; total time=14.5min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=6, n_estimators=200, scale_pos_weight=5; total time=14.5min
[CV] END learning_rate=0.005, max_depth=6, n_estimators=200, scale_pos_weight=4; total time=14.5min


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=6, n_estimators=200, scale_pos_weight=5; total time=14.6min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=100, scale_pos_weight=3; total time=11.1min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=100, scale_pos_weight=4; total time=12.2min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=100, scale_pos_weight=4; total time=12.6min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=100, scale_pos_weight=4; total time=13.1min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=100, scale_pos_weight=5; total time=12.8min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=100, scale_pos_weight=5; total time=12.8min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=0, scale_pos_weight=3; total time=   1.7s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=0, scale_pos_weight=3; total time=   1.8s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=0, scale_pos_weight=3; total time=   1.9s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=0, scale_pos_weight=4; total time=   2.1s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=0, scale_pos_weight=4; total time=   1.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=0, scale_pos_weight=4; total time=   1.9s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=0, scale_pos_weight=5; total time=   2.1s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=0, scale_pos_weight=5; total time=   1.7s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=0, scale_pos_weight=5; total time=   2.0s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=100, scale_pos_weight=5; total time=14.1min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=200, scale_pos_weight=3; total time=25.7min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=200, scale_pos_weight=3; total time=26.3min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=200, scale_pos_weight=3; total time=26.8min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=200, scale_pos_weight=4; total time=27.2min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=200, scale_pos_weight=4; total time=27.1min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=200, scale_pos_weight=4; total time=25.9min
[CV] END learning_rate=0.005, max_depth=8, n_estimators=100, scale_pos_weight=3; total time=18.0min


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=100, scale_pos_weight=3; total time=17.7min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=200, scale_pos_weight=5; total time=25.9min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=200, scale_pos_weight=5; total time=26.4min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=7, n_estimators=200, scale_pos_weight=5; total time=25.4min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=100, scale_pos_weight=3; total time=17.1min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=100, scale_pos_weight=4; total time=18.1min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=100, scale_pos_weight=4; total time=18.1min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=100, scale_pos_weight=4; total time=18.8min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=100, scale_pos_weight=5; total time=17.9min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=100, scale_pos_weight=5; total time=17.8min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=0, scale_pos_weight=3; total time=   1.3s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=0, scale_pos_weight=3; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=0, scale_pos_weight=3; total time=   1.1s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=0, scale_pos_weight=4; total time=   1.2s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=0, scale_pos_weight=4; total time=   1.3s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=0, scale_pos_weight=4; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=0, scale_pos_weight=5; total time=   1.3s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=0, scale_pos_weight=5; total time=   1.3s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=0, scale_pos_weight=5; total time=   1.2s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=100, scale_pos_weight=5; total time=19.8min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=100, scale_pos_weight=3; total time= 6.9min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=100, scale_pos_weight=3; total time= 7.2min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=100, scale_pos_weight=3; total time= 7.3min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=100, scale_pos_weight=4; total time= 7.2min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=200, scale_pos_weight=3; total time=38.6min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=100, scale_pos_weight=4; total time= 7.4min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=100, scale_pos_weight=4; total time= 7.4min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=200, scale_pos_weight=3; total time=36.2min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=200, scale_pos_weight=3; total time=38.3min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=200, scale_pos_weight=4; total time=38.5min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=100, scale_pos_weight=5; total time= 7.0min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=100, scale_pos_weight=5; total time= 7.0min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=100, scale_pos_weight=5; total time= 7.2min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=200, scale_pos_weight=4; total time=37.0min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=200, scale_pos_weight=4; total time=39.1min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=200, scale_pos_weight=5; total time=37.6min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=200, scale_pos_weight=5; total time=38.8min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=0, scale_pos_weight=3; total time=   1.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=0, scale_pos_weight=3; total time=   1.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=0, scale_pos_weight=3; total time=   1.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=0, scale_pos_weight=4; total time=   1.3s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=0, scale_pos_weight=4; total time=   1.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=0, scale_pos_weight=4; total time=   1.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=0, scale_pos_weight=5; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=0, scale_pos_weight=5; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=0, scale_pos_weight=5; total time=   1.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=200, scale_pos_weight=3; total time=12.1min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.005, max_depth=8, n_estimators=200, scale_pos_weight=5; total time=36.4min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=200, scale_pos_weight=3; total time=12.9min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=200, scale_pos_weight=3; total time=12.7min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=200, scale_pos_weight=4; total time=12.3min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=200, scale_pos_weight=4; total time=12.6min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=200, scale_pos_weight=4; total time=12.6min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=200, scale_pos_weight=5; total time=12.8min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=200, scale_pos_weight=5; total time=12.5min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=6, n_estimators=200, scale_pos_weight=5; total time=12.9min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=100, scale_pos_weight=3; total time=11.6min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=100, scale_pos_weight=3; total time=12.2min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=100, scale_pos_weight=3; total time=12.1min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=100, scale_pos_weight=4; total time=12.6min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=100, scale_pos_weight=4; total time=12.6min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=100, scale_pos_weight=4; total time=12.1min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=100, scale_pos_weight=5; total time=12.4min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=100, scale_pos_weight=5; total time=12.6min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=0, scale_pos_weight=3; total time=   1.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=0, scale_pos_weight=3; total time=   1.7s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=0, scale_pos_weight=3; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=0, scale_pos_weight=4; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=0, scale_pos_weight=4; total time=   1.7s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=0, scale_pos_weight=4; total time=   1.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=0, scale_pos_weight=5; total time=   1.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=0, scale_pos_weight=5; total time=   1.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=0, scale_pos_weight=5; total time=   1.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=100, scale_pos_weight=5; total time=12.2min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=200, scale_pos_weight=3; total time=22.2min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=200, scale_pos_weight=3; total time=21.9min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=200, scale_pos_weight=3; total time=22.9min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=200, scale_pos_weight=4; total time=22.7min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=200, scale_pos_weight=4; total time=23.3min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=200, scale_pos_weight=4; total time=22.1min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=200, scale_pos_weight=5; total time=22.9min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=100, scale_pos_weight=3; total time=19.3min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=200, scale_pos_weight=5; total time=23.5min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=7, n_estimators=200, scale_pos_weight=5; total time=22.7min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=100, scale_pos_weight=3; total time=19.3min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=100, scale_pos_weight=3; total time=18.4min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=100, scale_pos_weight=4; total time=19.7min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=100, scale_pos_weight=4; total time=19.1min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=100, scale_pos_weight=4; total time=20.2min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=100, scale_pos_weight=5; total time=19.8min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=100, scale_pos_weight=5; total time=20.4min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=0, scale_pos_weight=3; total time=   1.7s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=0, scale_pos_weight=3; total time=   1.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=0, scale_pos_weight=3; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=0, scale_pos_weight=4; total time=   1.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=0, scale_pos_weight=4; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=0, scale_pos_weight=4; total time=   1.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=0, scale_pos_weight=5; total time=   1.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=0, scale_pos_weight=5; total time=   1.8s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=0, scale_pos_weight=5; total time=   2.0s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=100, scale_pos_weight=5; total time=19.1min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=100, scale_pos_weight=3; total time= 6.9min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=100, scale_pos_weight=3; total time= 7.3min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=100, scale_pos_weight=3; total time= 7.2min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=100, scale_pos_weight=4; total time= 7.3min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=200, scale_pos_weight=3; total time=35.1min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=200, scale_pos_weight=3; total time=36.3min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=200, scale_pos_weight=4; total time=36.3min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=200, scale_pos_weight=3; total time=37.5min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=100, scale_pos_weight=4; total time= 7.2min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=100, scale_pos_weight=4; total time= 6.9min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=100, scale_pos_weight=5; total time= 6.8min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=100, scale_pos_weight=5; total time= 6.9min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=100, scale_pos_weight=5; total time= 6.8min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=200, scale_pos_weight=4; total time=37.5min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=200, scale_pos_weight=4; total time=34.8min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=200, scale_pos_weight=3; total time=11.1min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=200, scale_pos_weight=5; total time=35.9min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=0, scale_pos_weight=3; total time=   1.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=0, scale_pos_weight=3; total time=   1.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=0, scale_pos_weight=3; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=0, scale_pos_weight=4; total time=   1.1s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=0, scale_pos_weight=4; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=0, scale_pos_weight=4; total time=   1.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=0, scale_pos_weight=5; total time=   1.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=0, scale_pos_weight=5; total time=   1.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=0, scale_pos_weight=5; total time=   1.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=200, scale_pos_weight=5; total time=35.0min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=200, scale_pos_weight=3; total time=11.9min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.01, max_depth=8, n_estimators=200, scale_pos_weight=5; total time=37.2min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=200, scale_pos_weight=3; total time=12.0min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=200, scale_pos_weight=4; total time=11.8min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=200, scale_pos_weight=4; total time=11.6min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=200, scale_pos_weight=4; total time=11.8min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=200, scale_pos_weight=5; total time=11.7min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=200, scale_pos_weight=5; total time=11.7min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=6, n_estimators=200, scale_pos_weight=5; total time=12.1min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=100, scale_pos_weight=3; total time=11.7min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=100, scale_pos_weight=3; total time=11.8min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=100, scale_pos_weight=3; total time=11.4min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=100, scale_pos_weight=4; total time=11.9min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=100, scale_pos_weight=4; total time=12.3min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=100, scale_pos_weight=4; total time=11.7min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=100, scale_pos_weight=5; total time=11.9min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=100, scale_pos_weight=5; total time=12.3min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=8, n_estimators=0, scale_pos_weight=3; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=8, n_estimators=0, scale_pos_weight=3; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=8, n_estimators=0, scale_pos_weight=3; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=8, n_estimators=0, scale_pos_weight=4; total time=   1.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=8, n_estimators=0, scale_pos_weight=4; total time=   1.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=8, n_estimators=0, scale_pos_weight=4; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=8, n_estimators=0, scale_pos_weight=5; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=8, n_estimators=0, scale_pos_weight=5; total time=   1.6s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=8, n_estimators=0, scale_pos_weight=5; total time=   1.5s


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=100, scale_pos_weight=5; total time=11.5min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=200, scale_pos_weight=3; total time=20.4min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=200, scale_pos_weight=3; total time=20.8min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=200, scale_pos_weight=3; total time=20.2min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=200, scale_pos_weight=4; total time=20.9min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=200, scale_pos_weight=4; total time=21.3min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=200, scale_pos_weight=4; total time=20.7min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=200, scale_pos_weight=5; total time=20.7min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=200, scale_pos_weight=5; total time=20.6min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=7, n_estimators=200, scale_pos_weight=5; total time=21.8min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=8, n_estimators=100, scale_pos_weight=3; total time=19.3min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=8, n_estimators=100, scale_pos_weight=3; total time=19.7min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=8, n_estimators=100, scale_pos_weight=3; total time=18.5min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=8, n_estimators=100, scale_pos_weight=4; total time=19.6min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=8, n_estimators=100, scale_pos_weight=4; total time=18.9min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=8, n_estimators=100, scale_pos_weight=4; total time=20.3min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=8, n_estimators=100, scale_pos_weight=5; total time=19.4min


Parameters: { "use_label_encoder" } are not used.



[CV] END learning_rate=0.015, max_depth=8, n_estimators=100, scale_pos_weight=5; total time=18.8min
[CV] END learning_rate=0.015, max_depth=8, n_estimators=100, scale_pos_weight=5; total time=20.3min
[CV] END learning_rate=0.015, max_depth=8, n_estimators=200, scale_pos_weight=3; total time=28.2min
[CV] END learning_rate=0.015, max_depth=8, n_estimators=200, scale_pos_weight=3; total time=29.1min
[CV] END learning_rate=0.015, max_depth=8, n_estimators=200, scale_pos_weight=3; total time=28.9min
[CV] END learning_rate=0.015, max_depth=8, n_estimators=200, scale_pos_weight=4; total time=26.5min
[CV] END learning_rate=0.015, max_depth=8, n_estimators=200, scale_pos_weight=4; total time=20.6min
[CV] END learning_rate=0.015, max_depth=8, n_estimators=200, scale_pos_weight=4; total time=18.1min
[CV] END learning_rate=0.015, max_depth=8, n_estimators=200, scale_pos_weight=5; total time=17.9min
[CV] END learning_rate=0.015, max_depth=8, n_estimators=200, scale_pos_weight=5; total time=17.0min


In [24]:
from sklearn.metrics import precision_score, recall_score, f1_score, make_scorer, classification_report
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Define a custom scoring function for the positive class (label = 1)
def custom_score(y_true, y_pred):
    prec = precision_score(y_true, y_pred, pos_label=1)
    rec = recall_score(y_true, y_pred, pos_label=1)
    f1 = f1_score(y_true, y_pred, pos_label=1)
    return 0.3 * prec + 0.3 * f1 + 0.4 * rec

custom_scorer = make_scorer(custom_score)

# Initialize XGBoost classifier with base parameters
xgb_clf = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42
)

# Wide parameter distributions for randomized search
param_distributions = {
    "n_estimators":      [100, 300, 500, 700],
    "max_depth":         [3, 5, 7, 9],
    "learning_rate":     [0.01, 0.05, 0.1, 0.2],
    "subsample":         [0.6, 0.8, 1.0],
    "colsample_bytree":  [0.6, 0.8, 1.0],
    "scale_pos_weight":  [1, 2, 3, 4, 5]
}

random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_distributions,
    n_iter=20,                    # Number of random parameter combinations to try
    scoring=custom_scorer,        # Custom scorer that balances precision, recall, and F1
    cv=3,                         # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1                     # Use all available CPU cores
)

# Fit RandomizedSearchCV on training data (assumes X_train_tfidf, y_train are defined)
random_search.fit(X_train_tfidf, y_train)

print("\n=== Random Search Results ===")
print("Best Params:", random_search.best_params_)
print("Best Custom Score:", random_search.best_score_)

# Get the best estimator
best_xgb = random_search.best_estimator_

# Predict on test data (assumes X_test_tfidf, y_test are defined)
y_pred = best_xgb.predict(X_test_tfidf)

# Evaluate performance using standard metrics
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print("\n=== Test Set Evaluation ===")
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=5, n_estimators=500, scale_pos_weight=2, subsample=0.8; total time=10.4min
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=5, n_estimators=500, scale_pos_weight=2, subsample=0.8; total time=10.5min
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=5, n_estimators=500, scale_pos_weight=2, subsample=0.8; total time=10.5min
[CV] END colsample_bytree=0.6, learning_rate=0.2, max_depth=7, n_estimators=300, scale_pos_weight=2, subsample=1.0; total time=12.3min
[CV] END colsample_bytree=0.6, learning_rate=0.2, max_depth=7, n_estimators=300, scale_pos_weight=2, subsample=1.0; total time=12.3min
[CV] END colsample_bytree=0.6, learning_rate=0.2, max_depth=7, n_estimators=300, scale_pos_weight=2, subsample=1.0; total time=12.6min
[CV] END colsample_bytree=1.0, learning_rate=0.05, max_depth=3, n_estimators=100, scale_pos_weight=4, subsample=0.6; total tim

In [27]:
# 5.1 Get the best model from the final grid search 
# best_xgb = grid_search.best_estimator_

# 5.2 Predict on test data
y_pred = best_xgb.predict(X_test_tfidf)

# 5.3 Evaluate performance
print("\n=== Test Set Evaluation ===")
# We can see precision, recall, f1
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")

# We can also see classification report
print(classification_report(y_test, y_pred))



=== Test Set Evaluation ===
Precision: 0.544, Recall: 0.770, F1: 0.638
              precision    recall  f1-score   support

         0.0       0.90      0.76      0.82     52603
         1.0       0.54      0.77      0.64     19496

    accuracy                           0.76     72099
   macro avg       0.72      0.77      0.73     72099
weighted avg       0.80      0.76      0.77     72099



In [29]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Get prediction probabilities
y_proba = best_xgb.predict_proba(X_test_tfidf)[:, 1]

# Try different thresholds
thresholds = np.arange(0.3, 0.7, 0.05)
for thresh in thresholds:
    y_thresh = (y_proba >= thresh).astype(int)
    prec = precision_score(y_test, y_thresh)
    rec = recall_score(y_test, y_thresh)
    f1 = f1_score(y_test, y_thresh)
    print(f"Threshold = {thresh:.2f} --> Precision: {prec:.3f}, Recall: {rec:.3f}, F1: {f1:.3f}")


Threshold = 0.30 --> Precision: 0.364, Recall: 0.947, F1: 0.526
Threshold = 0.35 --> Precision: 0.397, Recall: 0.915, F1: 0.553
Threshold = 0.40 --> Precision: 0.435, Recall: 0.878, F1: 0.582
Threshold = 0.45 --> Precision: 0.502, Recall: 0.817, F1: 0.622
Threshold = 0.50 --> Precision: 0.544, Recall: 0.770, F1: 0.638
Threshold = 0.55 --> Precision: 0.585, Recall: 0.721, F1: 0.646
Threshold = 0.60 --> Precision: 0.625, Recall: 0.670, F1: 0.647
Threshold = 0.65 --> Precision: 0.663, Recall: 0.621, F1: 0.641


In [2]:
import re
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from xgboost import XGBClassifier

# Download NLTK tokenizer data (if not already present)
nltk.download('punkt')

# --- Provided code ---
df = pd.read_csv("en_hf_112024.csv")

# 1.2 Basic text cleaning
def basic_preprocessing(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r"@\w+", '', text)
    text = re.sub(r"[^\w\s]", '', text)
    text = text.strip()
    return text

df["clean_text"] = df["text"].astype(str).apply(basic_preprocessing)

# 1.3 Separate features & labels
X = df["clean_text"].values
y = df["labels"].values

# 1.4 Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,
    random_state=42,
    stratify=y
)

# --- Start Word2Vec and XGBoost pipeline ---

# Step 2: Tokenize the documents using NLTK's word_tokenize
def tokenize_text(text):
    return word_tokenize(text)

# Tokenize training and test texts
tokenized_train = [tokenize_text(doc) for doc in X_train]
tokenized_test = [tokenize_text(doc) for doc in X_test]

# Step 3: Train a Word2Vec model on the tokenized training data
w2v_model = Word2Vec(
    sentences=tokenized_train,
    vector_size=300,   # Dimensionality of the embeddings
    window=5,          # Context window size
    min_count=5,       # Ignores words with total frequency lower than this
    workers=4,         # Number of threads
    sg=1               # Use skip-gram; set to 0 for CBOW
)

# Step 4: Create document embeddings by averaging the word embeddings
def document_embedding(doc, model):
    # Filter out words not in the model's vocabulary
    valid_words = [word for word in doc if word in model.wv.index_to_key]
    if not valid_words:
        # If no valid words, return a zero vector
        return np.zeros(model.vector_size)
    return np.mean(model.wv[valid_words], axis=0)

# Compute embeddings for training and test sets
X_train_emb = np.array([document_embedding(doc, w2v_model) for doc in tokenized_train])
X_test_emb = np.array([document_embedding(doc, w2v_model) for doc in tokenized_test])

print("Train Embedding shape:", X_train_emb.shape)
print("Test Embedding shape:", X_test_emb.shape)

# Step 5: Define XGBoost classifier and a modest hyperparameter grid for RandomizedSearchCV
xgb_clf = XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42)

param_distributions = {
    "n_estimators": [100, 200],
    "max_depth": [5, 7, 9],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.6, 0.8],
    "scale_pos_weight": [1, 2, 4]
}

# Set up RandomizedSearchCV (n_iter set low to ensure search completes quickly)
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_distributions,
    n_iter=10,           # Number of parameter settings that are sampled
    scoring='f1',        # F1 score is used for scoring
    cv=3,                # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Step 6: Fit the model using the training embeddings
random_search.fit(X_train_emb, y_train)

print("\n=== Random Search Results ===")
print("Best Params:", random_search.best_params_)
print("Best F1 Score:", random_search.best_score_)

# Step 7: Evaluate on the test set using the best estimator
best_xgb = random_search.best_estimator_
y_pred = best_xgb.predict(X_test_emb)

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\n=== Test Set Evaluation ===")
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/stygianphantom/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/stygianphantom/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/stygianphantom/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Train Embedding shape: (288394, 300)
Test Embedding shape: (72099, 300)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=7, n_estimators=200, scale_pos_weight=2, subsample=1.0; total time= 2.8min
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=7, n_estimators=200, scale_pos_weight=2, subsample=1.0; total time= 2.9min
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=7, n_estimators=200, scale_pos_weight=2, subsample=1.0; total time= 3.0min
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=7, n_estimators=100, scale_pos_weight=2, subsample=1.0; total time= 1.5min
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=7, n_estimators=100, scale_pos_weight=2, subsample=1.0; total time= 1.5min
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=100, scale_pos_weight=2, subsample=1.0; total time= 1.4min
[CV] END colsample_bytree=0.6, learning_rate=0.01, max

In [None]:
print("\n=== Test Set Evaluation ===")
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")
print(classification_report(y_test, y_pred))


import numpy as np
thresholds = np.arange(0.3, 0.7, 0.05)
for thresh in thresholds:
    y_thresh = (best_xgb.predict_proba(X_test_emb)[:, 1] >= thresh).astype(int)
    prec = precision_score(y_test, y_thresh)
    rec = recall_score(y_test, y_thresh)
    f1 = f1_score(y_test, y_thresh)
    print(f"Threshold = {thresh:.2f} --> Precision: {prec:.3f}, Recall: {rec:.3f}, F1: {f1:.3f}")



=== Test Set Evaluation ===
Precision: 0.590, Recall: 0.630, F1: 0.609
              precision    recall  f1-score   support

         0.0       0.86      0.84      0.85     52603
         1.0       0.59      0.63      0.61     19496

    accuracy                           0.78     72099
   macro avg       0.72      0.73      0.73     72099
weighted avg       0.79      0.78      0.78     72099

Threshold = 0.30 --> Precision: 0.443, Recall: 0.856, F1: 0.584
Threshold = 0.35 --> Precision: 0.477, Recall: 0.808, F1: 0.600
Threshold = 0.40 --> Precision: 0.514, Recall: 0.754, F1: 0.611
Threshold = 0.45 --> Precision: 0.551, Recall: 0.694, F1: 0.614
Threshold = 0.50 --> Precision: 0.590, Recall: 0.630, F1: 0.609
Threshold = 0.55 --> Precision: 0.631, Recall: 0.567, F1: 0.597
Threshold = 0.60 --> Precision: 0.670, Recall: 0.499, F1: 0.572
Threshold = 0.65 --> Precision: 0.709, Recall: 0.426, F1: 0.532


In [3]:
import re
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from xgboost import XGBClassifier
import gensim.downloader as api

# Download necessary NLTK resources
nltk.download('punkt')

# --- Load data ---
df = pd.read_csv("en_hf_112024.csv")

# --- Basic Text Preprocessing ---
def basic_preprocessing(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r"@\w+", '', text)
    text = re.sub(r"[^\w\s]", '', text)
    return text.strip()

df["clean_text"] = df["text"].astype(str).apply(basic_preprocessing)

# --- Split features and labels ---
X = df["clean_text"].values
y = df["labels"].values

# --- Train-test split ---
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# --- TF-IDF + TruncatedSVD ---
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train_raw)
X_test_tfidf = tfidf.transform(X_test_raw)

svd = TruncatedSVD(n_components=150, random_state=42)
X_train_tfidf_reduced = svd.fit_transform(X_train_tfidf)
X_test_tfidf_reduced = svd.transform(X_test_tfidf)

# --- Tokenization ---
def tokenize_text(text):
    return word_tokenize(text)

tokenized_train = [tokenize_text(doc) for doc in X_train_raw]
tokenized_test = [tokenize_text(doc) for doc in X_test_raw]

# --- Pretrained GloVe Embeddings ---
glove_model = api.load("glove-wiki-gigaword-300")

def document_embedding_glove(doc, model):
    valid_words = [word for word in doc if word in model]
    if not valid_words:
        return np.zeros(300)
    return np.mean([model[word] for word in valid_words], axis=0)

X_train_w2v = np.array([document_embedding_glove(doc, glove_model) for doc in tokenized_train])
X_test_w2v = np.array([document_embedding_glove(doc, glove_model) for doc in tokenized_test])

# --- Combine Features ---
X_train_combined = np.hstack([X_train_tfidf_reduced, X_train_w2v])
X_test_combined = np.hstack([X_test_tfidf_reduced, X_test_w2v])

print("Combined Train Feature shape:", X_train_combined.shape)
print("Combined Test Feature shape:", X_test_combined.shape)

# --- Train XGBoost with Default Parameters ---
xgb_clf = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=100,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=2.7,
    random_state=42,
    n_jobs=-1
)

xgb_clf.fit(X_train_combined, y_train)

# --- Evaluation ---
y_pred = xgb_clf.predict(X_test_combined)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = xgb_clf.score(X_test_combined, y_test)

print("\n=== Test Set Evaluation ===")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/stygianphantom/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Combined Train Feature shape: (288394, 450)
Combined Test Feature shape: (72099, 450)

=== Test Set Evaluation ===
Accuracy: 0.735
Precision: 0.507, Recall: 0.689, F1: 0.584
              precision    recall  f1-score   support

         0.0       0.87      0.75      0.81     52603
         1.0       0.51      0.69      0.58     19496

    accuracy                           0.73     72099
   macro avg       0.69      0.72      0.69     72099
weighted avg       0.77      0.73      0.75     72099



In [4]:
from sklearn.model_selection import GridSearchCV

xgb_clf = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=2.7
)

param_grid = {
    'max_depth': [5, 6, 7],
    'learning_rate': [0.05, 0.1],
    'n_estimators': [100, 150],
}

grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring='f1',
    cv=2,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train_combined, y_train)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_combined)

print(classification_report(y_test, y_pred))


Fitting 2 folds for each of 12 candidates, totalling 24 fits
[CV] END ..learning_rate=0.05, max_depth=5, n_estimators=100; total time=  54.4s
[CV] END ..learning_rate=0.05, max_depth=5, n_estimators=100; total time=  54.6s
[CV] END ..learning_rate=0.05, max_depth=6, n_estimators=100; total time= 1.0min
[CV] END ..learning_rate=0.05, max_depth=6, n_estimators=100; total time= 1.0min
[CV] END ..learning_rate=0.05, max_depth=5, n_estimators=150; total time= 1.2min
[CV] END ..learning_rate=0.05, max_depth=5, n_estimators=150; total time= 1.2min
[CV] END ..learning_rate=0.05, max_depth=7, n_estimators=100; total time= 1.3min
[CV] END ..learning_rate=0.05, max_depth=7, n_estimators=100; total time= 1.3min
[CV] END ..learning_rate=0.05, max_depth=6, n_estimators=150; total time= 1.4min
[CV] END ..learning_rate=0.05, max_depth=6, n_estimators=150; total time= 1.4min
[CV] END ..learning_rate=0.05, max_depth=7, n_estimators=150; total time= 1.6min
[CV] END ...learning_rate=0.1, max_depth=5, n_es

In [None]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.87      0.78      0.83     52603
         1.0       0.54      0.70      0.61     19496

    accuracy                           0.76     72099
   macro avg       0.71      0.74      0.72     72099
weighted avg       0.78      0.76      0.77     72099



In [6]:
# Use only GloVe Embeddings
X_train_combined = X_train_w2v
X_test_combined = X_test_w2v

xgb_clf = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=100,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=2.7,  # Maintain same balance fix
    random_state=42,
    n_jobs=-1
)

xgb_clf.fit(X_train_combined, y_train)

y_pred = xgb_clf.predict(X_test_combined)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = xgb_clf.score(X_test_combined, y_test)

print("\n=== GloVe Only Test Set Evaluation ===")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")
print(classification_report(y_test, y_pred))



=== GloVe Only Test Set Evaluation ===
Accuracy: 0.690
Precision: 0.453, Recall: 0.713, F1: 0.554
              precision    recall  f1-score   support

         0.0       0.86      0.68      0.76     52603
         1.0       0.45      0.71      0.55     19496

    accuracy                           0.69     72099
   macro avg       0.66      0.70      0.66     72099
weighted avg       0.75      0.69      0.71     72099

