## Model Selection for Data

## Step 1: Load the Data

In [2]:
import pandas as pd

# Read the CSV into a DataFrame and preview the first rows to sanity-check
# the column layout before any processing.
csv_path = '../Datasets/cancer_b.csv'
datasets = pd.read_csv(csv_path)
datasets.head()

Unnamed: 0,Id,Diagnosis,Radius (mean),Texture (mean),Perimeter (mean),Area (mean),Smoothness (mean),Compactness (mean),Concavity (mean),Concave points (mean),...,Radius (worst),Texture (worst),Perimeter (worst),Area (worst),Smoothness (worst),Compactness (worst),Concavity (worst),Concave points (worst),Symmetry (worst),Fractal dimension (worst)
0,8510426,B,13.54,14.36,87.46,566.3,0.09779,0.08129,0.06664,0.04781,...,15.11,19.26,99.7,711.2,0.144,0.1773,0.239,0.1288,0.2977,0.07259
1,8510653,B,13.08,15.71,85.63,520.0,0.1075,0.127,0.04568,0.0311,...,14.5,20.49,96.09,630.5,0.1312,0.2776,0.189,0.07283,0.3184,0.08183
2,8510824,B,9.504,12.44,60.34,273.9,0.1024,0.06492,0.02956,0.02076,...,10.23,15.66,65.13,314.9,0.1324,0.1148,0.08867,0.06227,0.245,0.07773
3,854941,B,13.03,18.42,82.61,523.8,0.08983,0.03766,0.02562,0.02923,...,13.3,22.81,84.46,545.9,0.09701,0.04619,0.04833,0.05013,0.1987,0.06169
4,85713702,B,8.196,16.84,51.71,201.9,0.086,0.05943,0.01588,0.005917,...,8.964,21.96,57.26,242.2,0.1297,0.1357,0.0688,0.02564,0.3105,0.07409


## Step 2: Explore the Data

In [5]:
# Per-column tally of missing entries; all zeros means nothing needs imputing.
missing_per_column = datasets.isna().sum()
missing_per_column

Id                           0
Diagnosis                    0
Radius (mean)                0
Texture (mean)               0
Perimeter (mean)             0
Area (mean)                  0
Smoothness (mean)            0
Compactness (mean)           0
Concavity (mean)             0
Concave points (mean)        0
Symmetry (mean)              0
Fractal dimension (mean)     0
Radius (se)                  0
Texture (se)                 0
Perimeter (se)               0
Area (se)                    0
Smoothness (se)              0
Compactness (se)             0
Concavity (se)               0
Concave points (se)          0
Symmetry (se)                0
Fractal dimension (se)       0
Radius (worst)               0
Texture (worst)              0
Perimeter (worst)            0
Area (worst)                 0
Smoothness (worst)           0
Compactness (worst)          0
Concavity (worst)            0
Concave points (worst)       0
Symmetry (worst)             0
Fractal dimension (worst)    0
dtype: int64

In [6]:
# Summary statistics for the numeric columns, kept in a named variable so the
# table can be inspected again later without recomputing it.
numeric_summary = datasets.describe()
numeric_summary

Unnamed: 0,Id,Radius (mean),Texture (mean),Perimeter (mean),Area (mean),Smoothness (mean),Compactness (mean),Concavity (mean),Concave points (mean),Symmetry (mean),...,Radius (worst),Texture (worst),Perimeter (worst),Area (worst),Smoothness (worst),Compactness (worst),Concavity (worst),Concave points (worst),Symmetry (worst),Fractal dimension (worst)
count,357.0,357.0,357.0,357.0,357.0,357.0,357.0,357.0,357.0,357.0,...,357.0,357.0,357.0,357.0,357.0,357.0,357.0,357.0,357.0,357.0
mean,26543820.0,12.146524,17.914762,78.075406,462.790196,0.092478,0.080085,0.046058,0.025717,0.174186,...,13.379801,23.51507,87.005938,558.89944,0.124959,0.182673,0.166238,0.074444,0.270246,0.079442
std,116739700.0,1.780512,3.995125,11.807438,134.287118,0.013446,0.03375,0.043442,0.015909,0.024807,...,1.981368,5.493955,13.527091,163.601424,0.020013,0.09218,0.140368,0.035797,0.041745,0.013804
min,8913.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1566,0.05521
25%,874662.0,11.08,15.15,70.87,378.2,0.08306,0.05562,0.02031,0.01502,0.158,...,12.08,19.58,78.27,447.1,0.1104,0.112,0.07708,0.05104,0.2406,0.07009
50%,908916.0,12.2,17.39,78.18,458.4,0.09076,0.07529,0.03709,0.02344,0.1714,...,13.35,22.82,86.92,547.4,0.1254,0.1698,0.1412,0.07431,0.2687,0.07712
75%,8812816.0,13.37,19.76,86.1,551.1,0.1007,0.09755,0.05999,0.03251,0.189,...,14.8,26.51,96.59,670.0,0.1376,0.2302,0.2216,0.09749,0.2983,0.08541
max,911320500.0,17.85,33.81,114.6,992.1,0.1634,0.2239,0.4108,0.08534,0.2743,...,19.82,41.78,127.1,1210.0,0.2006,0.5849,1.252,0.175,0.4228,0.1486


In [7]:
# Check class distribution.
# A classifier needs at least two classes in the target. If only one label is
# present, every downstream accuracy / cross-validation score is trivially
# perfect, so flag that here instead of letting it pass unnoticed.
import warnings

class_counts = datasets['Diagnosis'].value_counts()
if class_counts.size < 2:
    warnings.warn(
        "Target column 'Diagnosis' contains a single class "
        f"({list(class_counts.index)}); classification metrics computed on "
        "this data are not meaningful."
    )
class_counts

Diagnosis
B    357
Name: count, dtype: int64

## Step 3: Preprocess Data

In [9]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [10]:
# Encode target

In [11]:
# Encoder used below to turn the string labels in 'Diagnosis' into integer codes.
le = LabelEncoder()

In [12]:
# LabelEncoder numbers classes in sorted label order, so 'B' -> 0 and 'M' -> 1
# when both labels are present in the data.
# Only encode while the column still holds the raw (non-numeric) labels:
# re-running this cell would otherwise refit the encoder on the already-encoded
# integers and silently replace the original label names in le.classes_.
if not pd.api.types.is_numeric_dtype(datasets['Diagnosis']):
    datasets['Diagnosis'] = le.fit_transform(datasets['Diagnosis'])

In [52]:
# # Features and target
# X = datasets.drop(['Id', 'Diagnosis'], axis=1)
# y = datasets['Diagnosis']

In [53]:
# Feature matrix: every column other than the record identifier and the label.
non_feature_cols = ['Id', 'Diagnosis']
X = datasets.loc[:, ~datasets.columns.isin(non_feature_cols)]
# Target vector: the diagnosis column.
y = datasets['Diagnosis']


In [54]:
# Scale features: fit the scaler on X first, then apply it as a separate step.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

## Step 4: Split Data

In [70]:
from sklearn.model_selection import train_test_split

# Split the *unscaled* features first, then fit the scaler on the training
# portion only, so statistics of the held-out rows never leak into the
# preprocessing applied to the training data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# `scaler` is rebound here to a scaler fitted on the training rows only; this is
# the one to reuse when transforming new data for the trained model.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Check test set class distribution
print(y_test.value_counts())


Diagnosis
0    72
Name: count, dtype: int64


In [74]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Build the estimator in this cell instead of relying on a `model` variable that
# is only created further down the notebook; otherwise a fresh
# Restart & Run All fails here with a NameError.
# The hyper-parameters mirror the baseline model configured in Step 5.
cv_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

# 5-fold cross-validated accuracy over the full scaled feature matrix.
scores = cross_val_score(cv_model, X_scaled, y, cv=5, scoring='accuracy')
print("CV Accuracy:", scores.mean())


CV Accuracy: 1.0


In [56]:
## Step 5: Choose a Model

In [75]:
from sklearn.ensemble import RandomForestClassifier

# Baseline classifier: a 100-tree random forest with a fixed seed for
# repeatable results and class weighting set to 'balanced'.
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)


## Step 6: Train the Model

In [76]:
# Fit the forest on the training split; the fitted estimator is the cell output.
model.fit(X=X_train, y=y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## Step 7: Evaluate the Model

In [77]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict on the held-out split, compute each metric once, then report them.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Confusion Matrix:\n", test_confusion)
print("Classification Report:\n", test_report)


Accuracy: 1.0
Confusion Matrix:
 [[72]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        72

    accuracy                           1.00        72
   macro avg       1.00      1.00      1.00        72
weighted avg       1.00      1.00      1.00        72





## Step 8: Tune the Model

In [78]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored with 5-fold cross-validation on the training split.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Use the same class_weight setting as the baseline `model`, so the tuned
# candidates are directly comparable with it.
grid = GridSearchCV(
    RandomForestClassifier(random_state=42, class_weight='balanced'),
    param_grid,
    cv=5,
)
grid.fit(X_train, y_train)

# NOTE(review): the search result is only reported here; the baseline `model`
# is what gets saved later. Confirm whether the tuned estimator should be
# used instead.
print("Best Params:", grid.best_params_)
print("Best Score:", grid.best_score_)


Best Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best Score: 1.0


## Step 9: Save the Model

In [79]:
import joblib

# Persist the trained estimator to disk so it can be reloaded without retraining.
model_path = "breast_cancer_model.pkl"
joblib.dump(model, model_path)


['breast_cancer_model.pkl']

## Step 10: Make Predictions

In [82]:
import joblib

# The model file is already written in Step 9, so this cell only loads it back
# and checks that the reloaded estimator can produce predictions.
# NOTE: joblib.load unpickles arbitrary objects; only load files you created
# yourself or otherwise trust.
loaded_model = joblib.load("breast_cancer_model.pkl")
new_predictions = loaded_model.predict(X_test)
print("New Predictions:", new_predictions)

New Predictions: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
