In [29]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report


In [34]:
df = pd.read_csv("breast-cancer.csv")
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [35]:
# Data Preprocessing
df['diagnosis'] = df['diagnosis'].apply(lambda x: 1 if x == 'M' else 0)
df = df.drop_duplicates()
df.drop(['id'], axis=1).head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [36]:
# Data splitting
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=42)

In [37]:
# Training models
RF = RandomForestClassifier(random_state=42)
RF.fit(X_train,y_train)
RF_P = RF.predict(X_test)

ADA = AdaBoostClassifier(random_state=42)
ADA.fit(X_train,y_train)
ADA_P = ADA.predict(X_test)

XGB = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
XGB.fit(X_train,y_train)
XGB_P = XGB.predict(X_test)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [43]:
print("Random Forest Accuracy:", accuracy_score(y_test, RF_P))
print("AdaBoost Accuracy:", accuracy_score(y_test, ADA_P))
print("XGBoost Accuracy:", accuracy_score(y_test, XGB_P))

print("\nRandom Forest:\n", classification_report(y_test, RF_P))
print("\nAdaBoost:\n", classification_report(y_test, ADA_P))
print("\nXGBoost:\n", classification_report(y_test, XGB_P))

Random Forest Accuracy: 0.9766081871345029
AdaBoost Accuracy: 0.9824561403508771
XGBoost Accuracy: 0.9707602339181286

Random Forest:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98       108
           1       0.98      0.95      0.97        63

    accuracy                           0.98       171
   macro avg       0.98      0.97      0.97       171
weighted avg       0.98      0.98      0.98       171


AdaBoost:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       108
           1       0.98      0.97      0.98        63

    accuracy                           0.98       171
   macro avg       0.98      0.98      0.98       171
weighted avg       0.98      0.98      0.98       171


XGBoost:
               precision    recall  f1-score   support

           0       0.97      0.98      0.98       108
           1       0.97      0.95      0.96        63

    accuracy            

In [None]:
#1. Load the datset
#2. Data Preprocessing, id column is removed because of it's no use and replaced M with 1 and B with 0 in diagnosis column
#3. Data Splitting, using TrainTestSplit
#4. Train the models(RandomForestClassifier, AdaBoostClassifier, XGBClassifier)
#5. AdaBosst has highest Accuracy score
#6. AdaBoost is dominant with high Macro average and weighted avg 
# AdaBoost is the best performer among the three for this classification task

In [44]:
# ENSEMBLE LEARNING: Ensemble learning is a technique where multiple models (learners) are combined to solve a problem and improve performance compared to a single model.
####   BAGGING: Train multiple models independently in parallel on random subsets of the data (with replacement).
# Each model gives a prediction, and the final output is:
# Majority vote for classification
# Average for regression
# Random Forest is a popular example of a bagging-based algorithm.
####   BOOSTING: Train models sequentially — each new model learns from the errors of the previous one.
# The final prediction is made by combining the outputs of all weak learners, often using weighted voting or a sum of their predictions.
# Popular boosting algorithms include AdaBoost, Gradient Boosting, and XGBoost.