In [1]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.exceptions import ConvergenceWarning

# --- Baseline 1: XGBoost evaluated on a single 80/20 hold-out split ---------
warnings.filterwarnings("ignore", category=ConvergenceWarning)   # Suppress sklearn convergence warnings from here on

df = pd.read_csv('dataset.csv')   # Load the dataset
print(df.head())                  # Quick look at the first rows

# Recode the target from {-1, 1} to {0, 1}
# (presumably because XGBClassifier wants class labels starting at 0 -- confirm).
df['Result'] = df['Result'].map({-1: 0, 1: 1})
if df['Result'].isna().any():
    # .map() turns every label outside {-1, 1} into NaN; fail loudly here
    # instead of handing NaN targets to the classifiers below.
    raise ValueError("Column 'Result' contains labels other than -1 and 1")

X = df.drop(columns=['Result'])   # Features: every column except the target
y = df['Result']                  # Target: recoded 0/1 label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)   # Seeded hold-out split

xgb_classifier = XGBClassifier()          # Default hyper-parameters
xgb_classifier.fit(X_train, y_train)

# Accuracy on the data the model was fitted on (optimistic by construction)
y_pred_train = xgb_classifier.predict(X_train)
accuracy_train = accuracy_score(y_train, y_pred_train)
print("Accuracy with XGBoost on Training Set:", accuracy_train)

# Accuracy and per-class report on the held-out data
y_pred_test = xgb_classifier.predict(X_test)
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Accuracy with XGBoost on Testing Set:", accuracy_test)
print("Classification Report:")
print(classification_report(y_test, y_pred_test))

# ROC AUC measures ranking quality, so it is scored with the predicted
# probability of the positive class; feeding it hard 0/1 labels collapses the
# curve to a single operating point.
y_proba_test = xgb_classifier.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_proba_test)
print("Area under ROC Curve:", auc)


In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score
# --- Baseline 2: bagged decision trees --------------------------------------
# X / y are re-derived from df exactly as in the XGBoost cell; the model below
# is cross-validated and fitted on the existing X_train / y_train split.
X = df.drop(columns=['Result'])
y = df['Result']

bagging_classifier = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42)

# 10-fold cross-validated accuracy, computed on the training portion only
cv_bagging_decision_tree_scores = cross_val_score(bagging_classifier, X_train, y_train, cv=10, scoring='accuracy')
print("Cross-Validation Scores with Bagging Decision Trees:", cv_bagging_decision_tree_scores)
print("Mean Accuracy of Bagging Decision Tree Classifier:", cv_bagging_decision_tree_scores.mean())

# Fit on the full training portion and evaluate on the hold-out set
bagging_classifier.fit(X_train, y_train)
y_pred_bagging = bagging_classifier.predict(X_test)
accuracy_bagging = accuracy_score(y_test, y_pred_bagging)
print("Accuracy with Bagging Decision Trees on Testing Set:", accuracy_bagging)
print("Classification Report:")
print(classification_report(y_test, y_pred_bagging))

# ROC AUC is scored with the positive-class probability rather than with the
# thresholded labels, which would reduce the curve to one operating point.
y_proba_bagging = bagging_classifier.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_proba_bagging)
print("Area under ROC Curve:", auc)


In [None]:
import numpy as np
from sklearn.model_selection import KFold
# --- Weight search for the interpolative (weighted-vote) fusion -------------
# Picks the pair of normalized weights that maximizes mean 10-fold accuracy of
#     fused = w_xgb * xgb_label + w_bagging * bagging_label,   predict 1 if fused > 0.5
#
# The search runs on the training portion only (X_train / y_train) and uses
# fold-local variable names, so the hold-out split created earlier is neither
# used for model selection nor overwritten by the fold loop.
weights_grid = {'weight_xgb': np.arange(0, 1.1, 0.1), 'weight_bagging': np.arange(0, 1.1, 0.1)}  # Raw weights 0.0 .. 1.0 in steps of 0.1
kf = KFold(n_splits=10, shuffle=True, random_state=42)   # Seeded: identical folds on every run

# The base-model predictions do not depend on the fusion weights, so each model
# is fitted once per fold and its validation predictions are cached. Every
# weight pair is then scored on exactly the same folds.
fold_predictions = []
for fold_train_idx, fold_val_idx in kf.split(X_train):
    # kf.split yields positional indices -> use .iloc for both frames and labels
    X_fold_train, X_fold_val = X_train.iloc[fold_train_idx], X_train.iloc[fold_val_idx]
    y_fold_train, y_fold_val = y_train.iloc[fold_train_idx], y_train.iloc[fold_val_idx]

    xgb_classifier.fit(X_fold_train, y_fold_train)
    bagging_classifier.fit(X_fold_train, y_fold_train)

    fold_predictions.append((
        xgb_classifier.predict(X_fold_val),
        bagging_classifier.predict(X_fold_val),
        y_fold_val,
    ))

fusion_accuracies = []   # (mean accuracy, normalized weights) per candidate pair
for weight_xgb in weights_grid['weight_xgb']:
    for weight_bagging in weights_grid['weight_bagging']:
        total_weight = weight_xgb + weight_bagging
        if total_weight == 0:
            # Both weights zero: the fused score is always 0, i.e. a constant
            # "class 0" predictor. Not a meaningful fusion, so skip it.
            continue
        weight_xgb_normalized = weight_xgb / total_weight          # Normalize so the two weights sum to 1
        weight_bagging_normalized = weight_bagging / total_weight

        fold_accuracies = []
        for xgb_pred, bagging_pred, y_fold_val in fold_predictions:
            fusion_pred = weight_xgb_normalized * xgb_pred + weight_bagging_normalized * bagging_pred   # Weighted vote of the two label vectors
            fusion_pred_binary = np.where(fusion_pred > 0.5, 1, 0)
            fold_accuracies.append(accuracy_score(y_fold_val, fusion_pred_binary))

        mean_accuracy = np.mean(fold_accuracies)
        fusion_accuracies.append((mean_accuracy, {'weight_xgb': weight_xgb_normalized, 'weight_bagging': weight_bagging_normalized}))

# NOTE(review): the inputs are hard labels, so the fused decision can only be
# "follow the heavier model" or, at equal weights, "1 only if both say 1".
# Fusing predict_proba outputs would make the weights more expressive -- this
# would also require the evaluation cell to fuse probabilities.
best_weights = max(fusion_accuracies, key=lambda x: x[0])[1]   # First pair with the highest mean accuracy
print("Best Weights:", best_weights)


In [1]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
 
# --- Evaluate the weighted-vote fusion with the selected weights ------------
# Refit both base models on X_train / y_train. The weights play no part in
# fitting; they are only applied when the two prediction vectors are combined.
xgb_classifier.fit(X_train, y_train)
bagging_classifier.fit(X_train, y_train)

# Training set: fused score = weighted sum of the two predicted label vectors
xgb_pred_train = xgb_classifier.predict(X_train)
bagging_pred_train = bagging_classifier.predict(X_train)
fusion_pred_train = best_weights['weight_xgb'] * xgb_pred_train + best_weights['weight_bagging'] * bagging_pred_train
fusion_pred_binary_train = np.where(fusion_pred_train > 0.5, 1, 0)   # Class decision: fused score above 0.5 -> 1

precision_train = precision_score(y_train, fusion_pred_binary_train)
recall_train = recall_score(y_train, fusion_pred_binary_train)
f1_score_train = f1_score(y_train, fusion_pred_binary_train)
# ROC AUC is a ranking metric, so it is given the continuous fused score rather
# than the thresholded labels.
roc_auc_train = roc_auc_score(y_train, fusion_pred_train)
print("Interpolative Fusion Metrics on Training Set with Best Weights:")
print("Precision:", precision_train)
print("Recall:", recall_train)
print("F1-score:", f1_score_train)
print("Area under ROC curve:", roc_auc_train)

# Testing set: same fusion applied to the test-set predictions
xgb_pred_test = xgb_classifier.predict(X_test)
bagging_pred_test = bagging_classifier.predict(X_test)
fusion_pred_test = best_weights['weight_xgb'] * xgb_pred_test + best_weights['weight_bagging'] * bagging_pred_test
fusion_pred_binary_test = np.where(fusion_pred_test > 0.5, 1, 0)

precision_test = precision_score(y_test, fusion_pred_binary_test)
recall_test = recall_score(y_test, fusion_pred_binary_test)
f1_score_test = f1_score(y_test, fusion_pred_binary_test)
roc_auc_test = roc_auc_score(y_test, fusion_pred_test)   # Scored on the continuous fused score, as for the training set
print("\nInterpolative Fusion Metrics on Testing Set with Best Weights:")
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1-score:", f1_score_test)
print("Area under ROC curve:", roc_auc_test)


NameError: name 'xgb_classifier' is not defined

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# --- Linear Regression on [original features + fusion score] ----------------
# Append the fused score as one extra column to the original feature matrices.
# NOTE(review): fusion_pred_train comes from base models that were fitted on
# these same training rows, so this column is an in-sample prediction and is
# likely over-optimistic; out-of-fold predictions would avoid that -- confirm.
X_train_fusion = np.column_stack((X_train, fusion_pred_train))
X_test_fusion = np.column_stack((X_test, fusion_pred_test))

# The linear model is trained and evaluated on an inner 80/20 split of the
# augmented training set; X_test_fusion is not used in this section.
X_lr_train, X_lr_test, y_lr_train, y_lr_test = train_test_split(X_train_fusion, y_train, test_size=0.2, random_state=42)

linear_reg_fusion = LinearRegression()
linear_reg_fusion.fit(X_lr_train, y_lr_train)

# Inner training split: the continuous output is thresholded at 0.5 to get labels
y_pred_linear_regression_train = linear_reg_fusion.predict(X_lr_train)
y_pred_linear_regression_train_binary = np.where(y_pred_linear_regression_train > 0.5, 1, 0)
linear_regression_accuracy_train = accuracy_score(y_lr_train, y_pred_linear_regression_train_binary)
print("Accuracy of Linear Regression on Training Set with Interpolative Fusion Output:", linear_regression_accuracy_train)
precision_lr_train = precision_score(y_lr_train, y_pred_linear_regression_train_binary)
recall_lr_train = recall_score(y_lr_train, y_pred_linear_regression_train_binary)
f1_score_lr_train = f1_score(y_lr_train, y_pred_linear_regression_train_binary)
roc_auc_lr_train = roc_auc_score(y_lr_train, y_pred_linear_regression_train)   # AUC from the continuous output
print("Linear Regression Metrics on Training Set with Interpolative Fusion Output:")
print("Precision:", precision_lr_train)
print("Recall:", recall_lr_train)
print("F1-score:", f1_score_lr_train)
print("Area under ROC curve:", roc_auc_lr_train)

# Inner test split: same procedure
y_pred_linear_regression_test = linear_reg_fusion.predict(X_lr_test)
y_pred_linear_regression_test_binary = np.where(y_pred_linear_regression_test > 0.5, 1, 0)
linear_regression_accuracy_test = accuracy_score(y_lr_test, y_pred_linear_regression_test_binary)
print("Accuracy of Linear Regression on Testing Set with Interpolative Fusion Output:", linear_regression_accuracy_test)
precision_lr_test = precision_score(y_lr_test, y_pred_linear_regression_test_binary)
recall_lr_test = recall_score(y_lr_test, y_pred_linear_regression_test_binary)
f1_score_lr_test = f1_score(y_lr_test, y_pred_linear_regression_test_binary)
# Use the continuous output here too, so the training and testing AUC values
# are computed the same way and can be compared.
roc_auc_lr_test = roc_auc_score(y_lr_test, y_pred_linear_regression_test)
print("\nLinear Regression Metrics on Testing Set with Interpolative Fusion Output:")
print("Precision:", precision_lr_test)
print("Recall:", recall_lr_test)
print("F1-score:", f1_score_lr_test)
print("Area under ROC curve:", roc_auc_lr_test)
# --- Random Forest on [original features + fusion score] --------------------
# Hyper-parameter grid: 3 * 3 * 3 * 3 = 81 candidates, each scored with 5-fold CV.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_classifier_fusion = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf_classifier_fusion, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_fusion, y_train)

# With the default refit=True, best_estimator_ is already fitted on the whole
# of X_train_fusion, so no separate fit call is needed before predicting.
best_rf_model = grid_search.best_estimator_

# cross_val_score works on clones, so best_rf_model itself stays fitted.
# NOTE(review): these scores use the same data the grid search selected on and
# are therefore optimistic.
cv_scores = cross_val_score(best_rf_model, X_train_fusion, y_train, cv=5)
print("\nCross-Validation Scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))

# Training-set metrics
y_pred_rf_train_fusion = best_rf_model.predict(X_train_fusion)
rf_accuracy_train_fusion = accuracy_score(y_train, y_pred_rf_train_fusion)
print("Accuracy of Random Forest classifier on Training Set with Interpolative Fusion Output:", rf_accuracy_train_fusion)
precision_rf_train = precision_score(y_train, y_pred_rf_train_fusion)
recall_rf_train = recall_score(y_train, y_pred_rf_train_fusion)
f1_score_rf_train = f1_score(y_train, y_pred_rf_train_fusion)
# ROC AUC is scored with the positive-class probability, not the hard labels.
y_proba_rf_train_fusion = best_rf_model.predict_proba(X_train_fusion)[:, 1]
roc_auc_rf_train = roc_auc_score(y_train, y_proba_rf_train_fusion)
print("Random Forest Metrics on Training Set with Interpolative Fusion Output:")
print("Precision:", precision_rf_train)
print("Recall:", recall_rf_train)
print("F1-score:", f1_score_rf_train)
print("Area under ROC curve:", roc_auc_rf_train)

# Testing-set metrics
y_pred_rf_test_fusion = best_rf_model.predict(X_test_fusion)
rf_accuracy_test_fusion = accuracy_score(y_test, y_pred_rf_test_fusion)
print("Accuracy of Random Forest classifier on Testing Set with Interpolative Fusion Output:", rf_accuracy_test_fusion)
precision_rf_test = precision_score(y_test, y_pred_rf_test_fusion)
recall_rf_test = recall_score(y_test, y_pred_rf_test_fusion)
f1_score_rf_test = f1_score(y_test, y_pred_rf_test_fusion)
y_proba_rf_test_fusion = best_rf_model.predict_proba(X_test_fusion)[:, 1]
roc_auc_rf_test = roc_auc_score(y_test, y_proba_rf_test_fusion)
print("\nRandom Forest Metrics on Testing Set with Interpolative Fusion Output:")
print("Precision:", precision_rf_test)
print("Recall:", recall_rf_test)
print("F1-score:", f1_score_rf_test)
print("Area under ROC curve:", roc_auc_rf_test)


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score  # Used below; imported here so this cell does not rely on another cell's imports

# --- Stacking: Logistic Regression on [features + LR label + RF label] ------
# The fused score is the only input of the two first-stage models; shape it
# once as a single-column feature matrix.
fusion_feature_train = np.array(fusion_pred_train).reshape(-1, 1)
fusion_feature_test = np.array(fusion_pred_test).reshape(-1, 1)

# First-stage model 1: Linear Regression on the fused score
linear_reg = LinearRegression()
linear_reg.fit(fusion_feature_train, y_train)
continuous_output_train = linear_reg.predict(fusion_feature_train)
continuous_output_test = linear_reg.predict(fusion_feature_test)

# First-stage model 2: Random Forest on the fused score
rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
rf_classifier.fit(fusion_feature_train, y_train)

# Turn both first-stage outputs into 0/1 labels (the regression output is thresholded at 0.5)
y_pred_linear_regression_train = np.where(continuous_output_train > 0.5, 1, 0)
y_pred_linear_regression_test = np.where(continuous_output_test > 0.5, 1, 0)
y_pred_rf_train = rf_classifier.predict(fusion_feature_train)
y_pred_rf_test = rf_classifier.predict(fusion_feature_test)

# Second stage: original features plus the two first-stage label columns.
# NOTE(review): the training-side columns are in-sample predictions of models
# fitted on the same rows, which tends to flatter the stacked model -- confirm
# whether out-of-fold predictions are wanted here.
X_train_stacking = np.column_stack((X_train, y_pred_linear_regression_train, y_pred_rf_train))
X_test_stacking = np.column_stack((X_test, y_pred_linear_regression_test, y_pred_rf_test))

# ConvergenceWarning is silenced earlier in the notebook, so a solver that
# stops at the default iteration cap would go unnoticed; allow more iterations.
log_reg_stacking = LogisticRegression(max_iter=1000)
log_reg_stacking.fit(X_train_stacking, y_train)

y_pred_stacking_test = log_reg_stacking.predict(X_test_stacking)
stacking_accuracy_test = accuracy_score(y_test, y_pred_stacking_test)
print("Accuracy of Logistic Regression stacking model on Testing Set:", stacking_accuracy_test)
precision_stacking_test = precision_score(y_test, y_pred_stacking_test)
recall_stacking_test = recall_score(y_test, y_pred_stacking_test)
f1_score_stacking_test = f1_score(y_test, y_pred_stacking_test)
# ROC AUC is scored with the positive-class probability, not the hard labels.
y_proba_stacking_test = log_reg_stacking.predict_proba(X_test_stacking)[:, 1]
roc_auc_stacking_test = roc_auc_score(y_test, y_proba_stacking_test)
print("\nLogistic Regression Stacking Metrics on Testing Set:")
print("Precision:", precision_stacking_test)
print("Recall:", recall_stacking_test)
print("F1-score:", f1_score_stacking_test)
print("Area under ROC curve:", roc_auc_stacking_test)
