In [3]:
import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Load breast cancer dataset
cancer = datasets.load_breast_cancer()
data = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
target = cancer.target  # This contains the labels (0 or 1 for malignant or benign)

# Check for missing values (if any)
print(data.isnull().sum())

# If there are any missing values, impute them (we'll use mean imputation here)
imputer = SimpleImputer(strategy='mean')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Feature scaling (important for algorithms like SVM and k-NN)
scaler = StandardScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(data_imputed), columns=data.columns)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data_scaled, target, test_size=0.3, random_state=42)

print("Data preprocessing completed.")


mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
dtype: int64
Data preprocessing completed.


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Initialize classifiers
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'k-NN': KNeighborsClassifier()
}

# Fit the models and evaluate
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Make predictions
    accuracy = accuracy_score(y_test, y_pred)  # Evaluate accuracy
    results[name] = accuracy

# Print accuracy results
for name, accuracy in results.items():
    print(f"{name}: {accuracy:.4f}")


Logistic Regression: 0.9825
Decision Tree: 0.9240
Random Forest: 0.9649
SVM: 0.9708
k-NN: 0.9591


In [7]:
# Display model performance results
sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)

print("\nModel Comparison (Sorted by Accuracy):")
for name, accuracy in sorted_results:
    print(f"{name}: {accuracy:.4f}")

# Identify the best and worst performing models
best_model = sorted_results[0]
worst_model = sorted_results[-1]

print(f"\nBest Performing Model: {best_model[0]} with Accuracy: {best_model[1]:.4f}")
print(f"Worst Performing Model: {worst_model[0]} with Accuracy: {worst_model[1]:.4f}")



Model Comparison (Sorted by Accuracy):
Logistic Regression: 0.9825
SVM: 0.9708
Random Forest: 0.9649
k-NN: 0.9591
Decision Tree: 0.9240

Best Performing Model: Logistic Regression with Accuracy: 0.9825
Worst Performing Model: Decision Tree with Accuracy: 0.9240
