In [1]:
#  Import necessary libraries

from sklearn.datasets import load_breast_cancer  # Load the dataset
from sklearn.model_selection import train_test_split  # For splitting the data
import pandas as pd  # For creating and manipulating dataframes
import numpy as np  # For numerical operations

# Set a seed for reproducibility

SEED = 42  # Ensures that the splits are the same every time you run the code

# Load the breast cancer dataset

cancer_data = load_breast_cancer()

# Create a DataFrame with feature names

df = pd.DataFrame(cancer_data.data, columns=cancer_data.feature_names)

# Add the target variable to the DataFrame

df['target'] = cancer_data.target

# Split the data into training (60%), validation (20%), and test (20%) sets

train_df, temp_df = train_test_split(df, test_size=20, random_state=SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SEED)

# Display the sizes of the resulting dataframes

print("Training set size:", train_df.shape)
print("Validation set size:", val_df.shape)
print("Test set size:", test_df.shape)

Training set size: (549, 31)
Validation set size: (10, 31)
Test set size: (10, 31)


In [3]:
test_df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
204,12.47,18.6,81.09,481.9,0.09965,0.1058,0.08005,0.03821,0.1925,0.06373,...,24.64,96.05,677.9,0.1426,0.2378,0.2671,0.1015,0.3014,0.0875,1
556,10.16,19.59,64.73,311.7,0.1003,0.07504,0.005025,0.01116,0.1791,0.06331,...,22.88,67.88,347.3,0.1265,0.12,0.01005,0.02232,0.2262,0.06742,1
83,19.1,26.29,129.1,1132.0,0.1215,0.1791,0.1937,0.1469,0.1634,0.07224,...,32.72,141.3,1298.0,0.1392,0.2817,0.2432,0.1841,0.2311,0.09203,0
70,18.94,21.31,123.6,1130.0,0.09009,0.1029,0.108,0.07951,0.1582,0.05461,...,26.58,165.9,1866.0,0.1193,0.2336,0.2687,0.1789,0.2551,0.06589,0
81,13.34,15.86,86.49,520.0,0.1078,0.1535,0.1169,0.06987,0.1942,0.06902,...,23.19,96.66,614.9,0.1536,0.4791,0.4858,0.1708,0.3527,0.1016,1
567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,...,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124,0
167,16.78,18.8,109.3,886.3,0.08865,0.09182,0.08422,0.06576,0.1893,0.05534,...,26.3,130.7,1260.0,0.1168,0.2119,0.2318,0.1474,0.281,0.07228,0
431,12.4,17.68,81.47,467.8,0.1054,0.1316,0.07741,0.02799,0.1811,0.07102,...,22.91,89.61,515.8,0.145,0.2629,0.2403,0.0737,0.2556,0.09359,1
101,6.981,13.43,43.79,143.5,0.117,0.07568,0.0,0.0,0.193,0.07818,...,19.54,50.41,185.2,0.1584,0.1202,0.0,0.0,0.2932,0.09382,1
511,14.81,14.7,94.66,680.7,0.08472,0.05016,0.03416,0.02541,0.1659,0.05348,...,17.58,101.7,760.2,0.1139,0.1011,0.1101,0.07955,0.2334,0.06142,1


In [23]:
test_df['classification'] = np.where(test_df['mean radius'] > 5, "Malignant", "Benign")


print(test_df[['mean radius', 'classification']].head(10)) 


     mean radius classification
204       12.470      Malignant
556       10.160      Malignant
83        19.100      Malignant
70        18.940      Malignant
81        13.340      Malignant
567       20.600      Malignant
167       16.780      Malignant
431       12.400      Malignant
101        6.981      Malignant
511       14.810      Malignant
Classification Accuracy: 0.6274


In [31]:
test_df['predicted'] = np.where(test_df['mean radius'] > 5, 1, 0)  # 1 = Malignant, 0 = Benign


accuracy = (test_df['predicted'] == test_df['target']).mean()


print(f"Test Set Accuracy: {accuracy:.4f}")

Test Set Accuracy: 0.6000


In [37]:

test_df['classification'] = np.where((test_df['mean radius'] > 2) & (test_df['mean texture'] < 8), "Benign", "Malignant")


print(test_df[['mean radius', 'mean texture', 'classification']].head(10))

     mean radius  mean texture classification
204       12.470         18.60      Malignant
556       10.160         19.59      Malignant
83        19.100         26.29      Malignant
70        18.940         21.31      Malignant
81        13.340         15.86      Malignant
567       20.600         29.33      Malignant
167       16.780         18.80      Malignant
431       12.400         17.68      Malignant
101        6.981         13.43      Malignant
511       14.810         14.70      Malignant


In [39]:

test_df['predicted'] = np.where((test_df['mean radius'] > 3) & (test_df['mean texture'] < 4), 0, 1)  # 0 = Benign, 1 = Malignant

accuracy = (test_df['predicted'] == test_df['target']).mean()

print(f"Test Set Accuracy: {accuracy:.4f}")


Test Set Accuracy: 0.6000


In [41]:
test_df['predicted'] = np.where((test_df['mean radius'] > 3) & (test_df['mean texture'] < 4), 0, 1)  # 0 = Benign, 1 = Malignant

# Calculate accuracy
accuracy = (test_df['predicted'] == test_df['target']).mean()

# Display accuracy
print(f"Test Set Accuracy: {accuracy:.4f}")

Test Set Accuracy: 0.6000


In [43]:
test_df['predicted'] = np.where((test_df['mean area'] < 100) & (test_df['mean perimeter'] < 100), 1, 0)  # 1 = Malignant, 0 = Benign


accuracy = (test_df['predicted'] == test_df['target']).mean()


print(f"Test Set Accuracy: {accuracy:.4f}")

Test Set Accuracy: 0.4000
