In [1]:
# Import the libraries

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import sklearn.metrics as skmet
import pickle


In [2]:
df = pd.read_csv('cancerdata.csv')
df.head()

Unnamed: 0,id,diagnosis,Sex,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
0,87139402,B,F,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,...,13.5,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771
1,8910251,B,M,10.6,18.95,69.28,346.4,0.09688,0.1147,0.06387,...,11.88,22.94,78.28,424.8,0.1213,0.2515,0.1916,0.07926,0.294,0.07587
2,905520,B,M,11.04,16.83,70.92,373.2,0.1077,0.07804,0.03046,...,12.41,26.44,79.93,471.4,0.1369,0.1482,0.1067,0.07431,0.2998,0.07881
3,868871,B,F,11.28,13.39,73.0,384.8,0.1164,0.1136,0.04635,...,11.92,15.77,76.53,434.0,0.1367,0.1822,0.08669,0.08611,0.2102,0.06784
4,9012568,B,F,15.19,13.21,97.65,711.8,0.07963,0.06934,0.03393,...,16.2,15.73,104.5,819.1,0.1126,0.1737,0.1362,0.08178,0.2487,0.06766


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 569 non-null    int64  
 1   diagnosis          569 non-null    object 
 2   Sex                569 non-null    object 
 3   radius_mean        569 non-null    float64
 4   texture_mean       569 non-null    float64
 5   perimeter_mean     569 non-null    float64
 6   area_mean          569 non-null    float64
 7   smoothness_mean    569 non-null    float64
 8   compactness_mean   569 non-null    float64
 9   concavity_mean     567 non-null    float64
 10  points_mean        568 non-null    float64
 11  symmetry_mean      568 non-null    float64
 12  dimension_mean     569 non-null    float64
 13  radius_se          569 non-null    float64
 14  texture_se         569 non-null    float64
 15  perimeter_se       569 non-null    float64
 16  area_se            569 non

In [4]:
# Data Preprocessing & EDA
# converting B to Benign and M to Malignant 
df['diagnosis'] = np.where(df['diagnosis'] == 'B', 'Benign', df['diagnosis'])
df['diagnosis'] = np.where(df['diagnosis'] == 'M', 'Malignant', df['diagnosis'])

In [5]:
df.drop(['id'], axis = 1, inplace = True) # Excluding id column

In [6]:
df.describe()


Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,symmetry_mean,dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,567.0,568.0,568.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088911,0.048846,0.181161,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.079837,0.038797,0.027438,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02952,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.033455,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1313,0.07373,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [7]:
# Seperating input and output variables 
df_X = pd.DataFrame(df.iloc[:, 1:])
df_y = pd.DataFrame(df.iloc[:, 0])


In [8]:
# All numeric features
numeric_features = df_X.select_dtypes(exclude = ['object']).columns

numeric_features


Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean', 'points_mean',
       'symmetry_mean', 'dimension_mean', 'radius_se', 'texture_se',
       'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se',
       'concavity_se', 'points_se', 'symmetry_se', 'dimension_se',
       'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
       'smoothness_worst', 'compactness_worst', 'concavity_worst',
       'points_worst', 'symmetry_worst', 'dimension_worst'],
      dtype='object')

In [9]:
# Imputation strategy for numeric columns
num_pipeline = Pipeline([('impute', SimpleImputer(strategy = 'mean'))])



In [10]:
# All categorical features
categorical_features = df_X.select_dtypes(include = ['object']).columns


In [11]:
from sklearn_pandas import DataFrameMapper

# DataFrameMapper is used to map the given Attribute
# Encoding categorical to numeric variable
categ_pipeline = Pipeline([('label', DataFrameMapper([(categorical_features,
                                                       OneHotEncoder(drop = 'first'))]))])


In [12]:
# Using ColumnTransfer to transform the columns of an array or pandas DataFrame. This estimator allows different columns or column subsets of the input to be transformed separately and the features generated by each transformer will be concatenated to form a single feature space.
preprocess_pipeline = ColumnTransformer([('categorical', categ_pipeline, categorical_features), 
                                       ('numerical', num_pipeline, numeric_features)])


In [13]:
processed = preprocess_pipeline.fit(df_X)  # Pass the raw data through pipeline

processed


In [14]:
# Save the defined pipeline
import joblib
joblib.dump(processed, 'processed1')


['processed1']

In [15]:
# Transform the original data using the pipeline defined above
clean = pd.DataFrame(processed.transform(df_X), columns = df_X.columns)  # Clean and processed data for Clustering

clean.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Sex                569 non-null    float64
 1   radius_mean        569 non-null    float64
 2   texture_mean       569 non-null    float64
 3   perimeter_mean     569 non-null    float64
 4   area_mean          569 non-null    float64
 5   smoothness_mean    569 non-null    float64
 6   compactness_mean   569 non-null    float64
 7   concavity_mean     569 non-null    float64
 8   points_mean        569 non-null    float64
 9   symmetry_mean      569 non-null    float64
 10  dimension_mean     569 non-null    float64
 11  radius_se          569 non-null    float64
 12  texture_se         569 non-null    float64
 13  perimeter_se       569 non-null    float64
 14  area_se            569 non-null    float64
 15  smoothness_se      569 non-null    float64
 16  compactness_se     569 non

In [16]:
# new_features = cancerclean.select_dtypes(exclude = ['object']).columns 
# new_features


In [17]:
# Define scaling pipeline
scale_pipeline = Pipeline([('scale', MinMaxScaler())])



In [18]:
preprocess_pipeline2 = ColumnTransformer([('scale', scale_pipeline, clean.columns)]) 

processed2 = preprocess_pipeline2.fit(clean)
processed2


In [19]:
# Save the Scaling pipeline
joblib.dump(processed2, 'processed2')


['processed2']

In [20]:
# Normalized data frame (considering the numerical part of data)

clean_n = pd.DataFrame(processed2.transform(clean), columns = clean.columns)


In [21]:
clean_n.describe()

Unnamed: 0,Sex,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,0.467487,0.338222,0.323965,0.332935,0.21692,0.394785,0.260601,0.20832,0.242772,0.379601,...,0.296663,0.363998,0.283138,0.170906,0.404138,0.220212,0.217403,0.393836,0.263307,0.189596
std,0.499381,0.166787,0.145453,0.167915,0.149274,0.126967,0.161992,0.18673,0.19266,0.138456,...,0.17194,0.163813,0.167352,0.139932,0.150779,0.152649,0.166633,0.225884,0.121954,0.118466
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.223342,0.218465,0.216847,0.117413,0.304595,0.139685,0.06926,0.100944,0.282323,...,0.180719,0.241471,0.167837,0.08113,0.300007,0.116337,0.091454,0.223127,0.185098,0.1077
50%,0.0,0.302381,0.308759,0.293345,0.172895,0.390358,0.224679,0.144213,0.166501,0.369697,...,0.250445,0.356876,0.23532,0.123206,0.397081,0.17911,0.18107,0.343402,0.247782,0.163977
75%,1.0,0.416442,0.40886,0.416765,0.271135,0.47549,0.340531,0.306232,0.366004,0.45303,...,0.386339,0.471748,0.373475,0.220901,0.494156,0.30252,0.305831,0.554639,0.318155,0.242949
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
# Separating the input and output from the dataset
# X = np.array(clean_n.iloc[:, :]) # Predictors 
Y = np.array(df_y['diagnosis']) # Target


In [23]:
X_train, X_test, y_train, y_test = train_test_split(clean_n, Y,
                                                    test_size = 0.2, random_state = 0)

X_train.shape
X_test.shape


(114, 31)

In [24]:
#Model Building
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

rf_default = RandomForestClassifier(random_state=42)
rf_default.fit(X_train, y_train)

In [25]:
y_pred_default = rf_default.predict(X_test)

In [26]:
# Evaluate the default model
from sklearn.metrics import accuracy_score, classification_report
print("Performance of RandomForest without Hyperparameter Tuning:")
print("Accuracy:", accuracy_score(y_test, y_pred_default))
print("Classification Report:\n", classification_report(y_test, y_pred_default))

Performance of RandomForest without Hyperparameter Tuning:
Accuracy: 0.956140350877193
Classification Report:
               precision    recall  f1-score   support

      Benign       0.95      0.99      0.97        77
   Malignant       0.97      0.89      0.93        37

    accuracy                           0.96       114
   macro avg       0.96      0.94      0.95       114
weighted avg       0.96      0.96      0.96       114



In [27]:
# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200, 300],                   # Number of trees in the forest
    'max_depth': [None, 10, 20, 30, 40],                   # Maximum depth of each tree
    'min_samples_split': [2, 5, 10, 15],                   # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4, 6],                      # Minimum samples required to be a leaf node
    'max_features': ['auto', 'sqrt', 'log2'],              # Number of features to consider for split
    'bootstrap': [True, False],                            # Use bootstrap samples
    'criterion': ['gini', 'entropy'],                      # Split criteria
    'class_weight': [None, 'balanced', 'balanced_subsample']  # Class weighting
}

In [28]:
# Set up RandomizedSearchCV with 100 random parameter combinations to test
random_search = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42), 
                                   param_distributions=param_grid, 
                                   n_iter=100,           # Number of random parameter combinations to try
                                   cv=5,                 # 5-fold cross-validation
                                   scoring='accuracy', 
                                   n_jobs=-1, 
                                   verbose=2,
                                   random_state=42)

In [29]:
# Fit RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


190 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
116 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Dell\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Dell\anaconda3\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\Dell\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Dell\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterE

In [30]:
# Print the best parameters and best score
print("Best Parameters from RandomizedSearchCV:", random_search.best_params_)
print("Best Cross-Validated Accuracy:", random_search.best_score_)

Best Parameters from RandomizedSearchCV: {'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20, 'criterion': 'entropy', 'class_weight': 'balanced_subsample', 'bootstrap': False}
Best Cross-Validated Accuracy: 0.9626373626373625


In [31]:
#Evaluate the tuned model
best_rf = random_search.best_estimator_
y_pred_tuned = best_rf.predict(X_test)


In [32]:
print("\nPerformance of RandomForest with Hyperparameter Tuning:")
print("Accuracy:", accuracy_score(y_test, y_pred_tuned))
print("Classification Report:\n", classification_report(y_test, y_pred_tuned))


Performance of RandomForest with Hyperparameter Tuning:
Accuracy: 0.9649122807017544
Classification Report:
               precision    recall  f1-score   support

      Benign       0.95      1.00      0.97        77
   Malignant       1.00      0.89      0.94        37

    accuracy                           0.96       114
   macro avg       0.98      0.95      0.96       114
weighted avg       0.97      0.96      0.96       114

