In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt


data = pd.read_csv('data.csv')

data = data.drop('id', axis=1)


le = LabelEncoder()
data['diagnosis'] = le.fit_transform(data['diagnosis'])

X = data.drop('diagnosis', axis=1)
y = data['diagnosis']


X = X.fillna(X.mean())

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Define the base model (DecisionTreeClassifier)
base_model = DecisionTreeClassifier(max_depth=1, random_state=42)
base_model.fit(X_train, y_train)

# Define AdaBoost classifier using the base model
adaboost = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=50,
    random_state=42)

adaboost.fit(X_train, y_train)

base_pred = base_model.predict(X_test)
base_prob = base_model.predict_proba(X_test)[:, 1]


ada_pred = adaboost.predict(X_test)
ada_prob = adaboost.predict_proba(X_test)[:, 1]


base_accuracy = accuracy_score(y_test, base_pred)
base_roc_auc = roc_auc_score(y_test, base_prob)

ada_accuracy = accuracy_score(y_test, ada_pred)
ada_roc_auc = roc_auc_score(y_test, ada_prob)

metrics = {
    'Model': ['Base Decision Stump', 'AdaBoost (50 stumps)'],
    'Accuracy': [base_accuracy, ada_accuracy],
    'ROC-AUC': [base_roc_auc, ada_roc_auc]
}

metrics_df = pd.DataFrame(metrics)
print("Performance Comparison:")
print(metrics_df)


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

ax1.bar(metrics_df['Model'], metrics_df['Accuracy'], color=['blue', 'orange'])
ax1.set_title('Model Accuracy Comparison')
ax1.set_ylabel('Accuracy')
ax1.set_ylim(0, 1)


ax2.bar(metrics_df['Model'], metrics_df['ROC-AUC'], color=['blue', 'orange'])
ax2.set_title('Model ROC-AUC Comparison')
ax2.set_ylabel('ROC-AUC Score')
ax2.set_ylim(0, 1)

plt.tight_layout()
plt.show()

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


ValueError: Input X contains NaN.
AdaBoostClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [18]:
!pip install --upgrade scikit-learn




In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt

# Load data
data = pd.read_csv('data.csv')

# Drop 'id' column as it is not useful
data = data.drop('id', axis=1)

# Check for unique values in the diagnosis column
print("Unique values in 'diagnosis':", data['diagnosis'].unique())

# Label encode the diagnosis column
le = LabelEncoder()
data['diagnosis'] = le.fit_transform(data['diagnosis'])

# Separate features and target
X = data.drop('diagnosis', axis=1)
y = data['diagnosis']

# Handle missing values (if any) by filling with mean
X = X.apply(pd.to_numeric, errors='coerce')  # Ensure numeric values
X = X.fillna(X.mean())  # Fill missing values with column mean

# Check if there are any missing values after filling
print("Missing values after filling:", X.isnull().sum().sum())

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Initialize the base model (DecisionTreeClassifier)
base_model = DecisionTreeClassifier(max_depth=1, random_state=42)
base_model.fit(X_train, y_train)

# Initialize AdaBoost with the base model
adaboost = AdaBoostClassifier(base_estimator=base_model, n_estimators=50, random_state=42)
adaboost.fit(X_train, y_train)

# Make predictions with both models
base_pred = base_model.predict(X_test)
base_prob = base_model.predict_proba(X_test)[:, 1]

ada_pred = adaboost.predict(X_test)
ada_prob = adaboost.predict_proba(X_test)[:, 1]

# Calculate accuracy and ROC-AUC scores for both models
base_accuracy = accuracy_score(y_test, base_pred)
base_roc_auc = roc_auc_score(y_test, base_prob)

ada_accuracy = accuracy_score(y_test, ada_pred)
ada_roc_auc = roc_auc_score(y_test, ada_prob)

# Create a comparison dataframe
metrics = {
    'Model': ['Base Decision Stump', 'AdaBoost (50 stumps)'],
    'Accuracy': [base_accuracy, ada_accuracy],
    'ROC-AUC': [base_roc_auc, ada_roc_auc]
}

metrics_df = pd.DataFrame(metrics)
print("Performance Comparison:")
print(metrics_df)

# Plotting the comparison
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Accuracy comparison
ax1.bar(metrics_df['Model'], metrics_df['Accuracy'], color=['blue', 'orange'])
ax1.set_title('Model Accuracy Comparison')
ax1.set_ylabel('Accuracy')
ax1.set_ylim(0, 1)

# ROC-AUC comparison
ax2.bar(metrics_df['Model'], metrics_df['ROC-AUC'], color=['blue', 'orange'])
ax2.set_title('Model ROC-AUC Comparison')
ax2.set_ylabel('ROC-AUC Score')
ax2.set_ylim(0, 1)

# Display the plots
plt.tight_layout()
plt.show()


Unique values in 'diagnosis': ['M' 'B']
Missing values after filling: 569


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


TypeError: AdaBoostClassifier.__init__() got an unexpected keyword argument 'base_estimator'

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt

# Load dataset
data = pd.read_csv('data.csv')

# Drop the 'id' column
data = data.drop('id', axis=1)

# Encode the 'diagnosis' column ('M' = malignant, 'B' = benign)
le = LabelEncoder()
data['diagnosis'] = le.fit_transform(data['diagnosis'])

# Handle missing values: Fill with the median value for numeric columns
data = data.fillna(data.median())

# Check for remaining missing values
print(f"Missing values after filling: {data.isnull().sum().sum()}")  # Should be 0

# Check for any infinite values
if np.any(np.isinf(data.values)):
    print("Warning: Dataset contains infinite values.")
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    data.fillna(data.median(), inplace=True)

# Split the dataset into features (X) and target (y)
X = data.drop('diagnosis', axis=1)
y = data['diagnosis']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into training and test sets (80% training, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Define the base model (DecisionTreeClassifier)
base_model = DecisionTreeClassifier(max_depth=1, random_state=42)

# Initialize AdaBoost with the base model (DecisionTreeClassifier)
adaboost = AdaBoostClassifier(base_estimator=base_model, n_estimators=50, random_state=42)

# Train the base model
base_model.fit(X_train, y_train)

# Train the AdaBoost model
adaboost.fit(X_train, y_train)

# Make predictions with both models
base_pred = base_model.predict(X_test)
base_prob = base_model.predict_proba(X_test)[:, 1]

ada_pred = adaboost.predict(X_test)
ada_prob = adaboost.predict_proba(X_test)[:, 1]

# Evaluate performance
base_accuracy = accuracy_score(y_test, base_pred)
base_roc_auc = roc_auc_score(y_test, base_prob)

ada_accuracy = accuracy_score(y_test, ada_pred)
ada_roc_auc = roc_auc_score(y_test, ada_prob)

# Display performance metrics
metrics = {
    'Model': ['Base Decision Stump', 'AdaBoost (50 stumps)'],
    'Accuracy': [base_accuracy, ada_accuracy],
    'ROC-AUC': [base_roc_auc, ada_roc_auc]
}

metrics_df = pd.DataFrame(metrics)
print("Performance Comparison:")
print(metrics_df)

# Plot comparison of model performance
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

ax1.bar(metrics_df['Model'], metrics_df['Accuracy'], color=['blue', 'orange'])
ax1.set_title('Model Accuracy Comparison')
ax1.set_ylabel('Accuracy')
ax1.set_ylim(0, 1)

ax2.bar(metrics_df['Model'], metrics_df['ROC-AUC'], color=['blue', 'orange'])
ax2.set_title('Model ROC-AUC Comparison')
ax2.set_ylabel('ROC-AUC Score')
ax2.set_ylim(0, 1)

plt.tight_layout()
plt.show()


Missing values after filling: 569


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


TypeError: AdaBoostClassifier.__init__() got an unexpected keyword argument 'base_estimator'