In [1]:
import kagglehub

# Download latest version
# dataset_download returns the local location of the fetched files, which is
# printed below.
# NOTE(review): `path` is not referenced by any later cell visible here — the
# analysis loads its data through scikit-learn instead. Confirm this download
# is still needed.
path = kagglehub.dataset_download("uciml/breast-cancer-wisconsin-data")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'breast-cancer-wisconsin-data' dataset.
Path to dataset files: /kaggle/input/breast-cancer-wisconsin-data


In [5]:
from sklearn.datasets import load_breast_cancer
import pandas as pd

# Load the breast-cancer dataset object through scikit-learn.
cancer = load_breast_cancer()

# Assemble a single table: one column per feature, with the class label
# appended as the final 'target' column.
df = pd.DataFrame(cancer.data, columns=cancer.feature_names).assign(target=cancer.target)

# Preview the first rows.
print(df.head())

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0             

In [6]:
# Check for missing values: every "Non-Null Count" should equal the row count.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly — wrapping it in print() emits a stray "None" afterwards.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [7]:
# Separate the label column from the predictors.
y = df['target']
X = df.drop(columns=['target'])

In [9]:
# Class balance: number of rows per label in the target column.
y.value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,357
0,212


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; random_state pins the shuffle so the
# split is repeatable.
# NOTE(review): no `stratify=` is passed, so the class proportions of the two
# parts may differ slightly from the full data — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
# Sanity-check the split: (rows, columns) of the train and test feature tables.
X_train.shape, X_test.shape

((398, 30), (171, 30))

In [11]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to both splits so no test information leaks in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Fit three baseline classifiers on the same scaled training data.
# fit() returns the estimator itself, so construction and training are chained.

# 1. Logistic regression with library defaults.
log_reg = LogisticRegression().fit(X_train_scaled, y_train)

# 2. Decision tree with a fixed seed (the unscaled X_train could be used here too).
tree_clf = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)

# 3. Support vector machine, starting from a linear kernel.
svm_clf = SVC(kernel='linear', random_state=42).fit(X_train_scaled, y_train)

In [13]:
# Predict test-set labels with each fitted model, in the order
# logistic regression, decision tree, SVM.
y_pred_log_reg, y_pred_tree, y_pred_svm = (
    fitted.predict(X_test_scaled) for fitted in (log_reg, tree_clf, svm_clf)
)

In [14]:
from sklearn.metrics import classification_report, accuracy_score

# Print one per-class precision/recall/F1 report per model, each under its own heading.
labelled_predictions = (
    ("--- Logistic Regression ---", y_pred_log_reg),
    ("--- Decision Tree ---", y_pred_tree),
    ("--- SVM ---", y_pred_svm),
)

for heading, predicted in labelled_predictions:
    print(heading)
    print(classification_report(y_test, predicted, target_names=cancer.target_names))

--- Logistic Regression ---
              precision    recall  f1-score   support

   malignant       0.97      0.98      0.98        63
      benign       0.99      0.98      0.99       108

    accuracy                           0.98       171
   macro avg       0.98      0.98      0.98       171
weighted avg       0.98      0.98      0.98       171

--- Decision Tree ---
              precision    recall  f1-score   support

   malignant       0.90      0.95      0.92        63
      benign       0.97      0.94      0.95       108

    accuracy                           0.94       171
   macro avg       0.93      0.94      0.94       171
weighted avg       0.94      0.94      0.94       171

--- SVM ---
              precision    recall  f1-score   support

   malignant       0.97      0.97      0.97        63
      benign       0.98      0.98      0.98       108

    accuracy                           0.98       171
   macro avg       0.97      0.97      0.97       171
weighted avg

In [17]:
import joblib

# Persist every fitted artifact: the three classifiers first, then the single
# scaler they all share (saved once).
# NOTE(review): 'svm_model_tuned.pkl' stores the linear-kernel SVC fitted earlier;
# no tuning step is visible in this notebook, so the file name may be misleading.
artifacts = {
    'logistic_regression_model.pkl': log_reg,
    'decision_tree_model.pkl': tree_clf,
    'svm_model_tuned.pkl': svm_clf,
    'scaler.pkl': scaler,
}

for artifact_file, artifact in artifacts.items():
    joblib.dump(artifact, artifact_file)

print("All three models and the scaler have been saved.")

All three models and the scaler have been saved.


In [18]:
import joblib

# Read the three classifiers and the scaler back from disk, in the same order
# they were written.
# NOTE: joblib files are pickle-based — only load files you produced yourself.
loaded_log_reg, loaded_tree_clf, loaded_svm, loaded_scaler = (
    joblib.load(artifact_file)
    for artifact_file in (
        'logistic_regression_model.pkl',
        'decision_tree_model.pkl',
        'svm_model_tuned.pkl',
        'scaler.pkl',
    )
)

print("All models and scaler are loaded and ready.")

All models and scaler are loaded and ready.


In [19]:
# Stand-in for new, raw input: the first five rows of the unscaled test set.
new_data = X_test.head(5)

# Scale once with the reloaded scaler; every model consumes the same scaled rows.
new_data_scaled = loaded_scaler.transform(new_data)

# Query each reloaded model in turn.
pred_log_reg, pred_tree, pred_svm = (
    reloaded.predict(new_data_scaled)
    for reloaded in (loaded_log_reg, loaded_tree_clf, loaded_svm)
)

print(f"Logistic Regression Predictions: {pred_log_reg}")
print(f"Decision Tree Predictions:       {pred_tree}")
print(f"SVM Predictions:                 {pred_svm}")

Logistic Regression Predictions: [1 0 0 1 1]
Decision Tree Predictions:       [1 0 0 1 1]
SVM Predictions:                 [1 0 0 1 1]


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# 1. Base estimators for the voting ensemble (fresh, unfitted instances).
clf1 = LogisticRegression(random_state=42)
clf2 = DecisionTreeClassifier(random_state=42)

# The SVC must expose class probabilities (probability=True) to take part in
# 'soft' voting.
# NOTE(review): kernel='rbf' and C=10 are example values; no tuning step is
# visible in this notebook — replace them with your search's best_params_
# if you have them.
clf3 = SVC(kernel='rbf', C=10, probability=True, random_state=42)

In [21]:
# 2. Pair each base estimator with a short name, as VotingClassifier expects.
estimators = [('lr', clf1), ('dt', clf2), ('svm', clf3)]

# 3-4. Build the ensemble and train it on the scaled training data in one step
# (fit() returns the fitted ensemble). voting='soft' averages the members'
# predicted probabilities; voting='hard' would take a majority vote instead.
ensemble_model = VotingClassifier(estimators=estimators, voting='soft').fit(
    X_train_scaled, y_train
)

print("Ensemble model trained successfully.")

Ensemble model trained successfully.


In [22]:
from sklearn.metrics import classification_report, accuracy_score

# Score the ensemble on the held-out test set.
y_pred_ensemble = ensemble_model.predict(X_test_scaled)

# Compute both summaries first, then print them.
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
ensemble_report = classification_report(
    y_test, y_pred_ensemble, target_names=cancer.target_names
)

print(f"Ensemble Model Accuracy: {ensemble_accuracy:.4f}")
print("\n--- Ensemble Model Classification Report ---")
print(ensemble_report)

Ensemble Model Accuracy: 0.9942

--- Ensemble Model Classification Report ---
              precision    recall  f1-score   support

   malignant       1.00      0.98      0.99        63
      benign       0.99      1.00      1.00       108

    accuracy                           0.99       171
   macro avg       1.00      0.99      0.99       171
weighted avg       0.99      0.99      0.99       171



In [23]:
import joblib

# Persist the fitted voting ensemble (the object named 'ensemble_model') to disk.
# The ensemble was fitted on scaled features, so the separately saved scaler is
# still needed to prepare inputs at prediction time.
joblib.dump(ensemble_model, 'ensemble_model.pkl')

print("Ensemble model saved successfully as 'ensemble_model.pkl'")

Ensemble model saved successfully as 'ensemble_model.pkl'


In [24]:
import joblib

# Read the ensemble and its companion scaler back from disk.
# NOTE: joblib files are pickle-based — only load files you produced yourself.
loaded_ensemble, loaded_scaler = (
    joblib.load(artifact_file) for artifact_file in ('ensemble_model.pkl', 'scaler.pkl')
)

print("Ensemble model and scaler loaded.")

# --- Example prediction ---
# Reuses the raw `new_data` rows defined in an earlier cell: scale them with the
# reloaded scaler, then ask the reloaded ensemble for labels.
new_data_scaled = loaded_scaler.transform(new_data)
predictions = loaded_ensemble.predict(new_data_scaled)

print(f"Ensemble Model Predictions: {predictions}")

Ensemble model and scaler loaded.
Ensemble Model Predictions: [1 0 0 1 1]
