In [81]:
# Import the required libraries
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.utils.validation import check_is_fitted


In [82]:
# Fetch the breast cancer dataset bundled with scikit-learn
cancer = load_breast_cancer()

# Build one DataFrame: feature matrix named by `feature_names`,
# with the label vector appended as a trailing 'target' column
df = pd.DataFrame(
    cancer.data,
    columns=cancer.feature_names,
).assign(target=cancer.target)

# Preview the first five rows of the assembled frame
print(df.head())

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0             

In [83]:
# List every column name (30 feature columns plus the appended 'target')
df.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'target'],
      dtype='object')

In [84]:
# Count the missing values in each column
df.isnull().sum()

Unnamed: 0,0
mean radius,0
mean texture,0
mean perimeter,0
mean area,0
mean smoothness,0
mean compactness,0
mean concavity,0
mean concave points,0
mean symmetry,0
mean fractal dimension,0


In [85]:
# Inspect the dtype of each column
df.dtypes

Unnamed: 0,0
mean radius,float64
mean texture,float64
mean perimeter,float64
mean area,float64
mean smoothness,float64
mean compactness,float64
mean concavity,float64
mean concave points,float64
mean symmetry,float64
mean fractal dimension,float64


In [86]:
# Separate the feature matrix (everything except the label) from the label column
X = df.drop(columns='target')
y = df['target']

In [87]:
# Class for cleaning data
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np

class cleaning_data(BaseEstimator, TransformerMixin):
  """Impute missing values column by column using statistics learned in ``fit``.

  Numeric columns are filled with their mean or median; ``object`` /
  ``category`` columns are filled with their most frequent value (mode).
  ``X`` must be a pandas DataFrame, because columns are selected by dtype.

  Parameters
  ----------
  num_strategy : {"mean", "median"}, default="mean"
      Statistic used to fill missing values in numeric columns.
  cat_strategy : {"mode"}, default="mode"
      Statistic used to fill missing values in categorical columns.

  Attributes
  ----------
  fill_numericals_ : pandas.Series
      Fill value per numeric column, indexed by column name.
  fill_categorical_ : pandas.Series or None
      Fill value per categorical column, or ``None`` when there is nothing
      to fill (no categorical columns, or none with any observed value).
  """

  def __init__(self, num_strategy="mean", cat_strategy="mode"):
    self.num_strategy = num_strategy
    self.cat_strategy = cat_strategy

  def fit(self, X, y=None):
    """Learn the per-column fill values from ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Training data.
    y : ignored
        Present only for scikit-learn pipeline compatibility.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``num_strategy`` is not "mean"/"median", or if categorical
        columns are present and ``cat_strategy`` is not "mode".
    """
    # "number" matches every numeric dtype (int32, float32, ...), not only
    # int64/float64, so no numeric column is silently left un-imputed.
    numericals = X.select_dtypes(include="number")

    if self.num_strategy == "mean":
      fill_numericals = numericals.mean()
    elif self.num_strategy == "median":
      fill_numericals = numericals.median()
    else:
      raise ValueError("The strategy of filling can only be mean or median")

    fill_categorical = None
    categoricals = X.select_dtypes(include=["object", "category"])
    if not categoricals.empty:
      if self.cat_strategy != "mode":
        raise ValueError("Can only be filled with the mode")
      modes = categoricals.mode()
      # mode() may return several rows (ties) -> keep the first one.
      # It returns zero rows when every categorical column is entirely NaN;
      # in that case there is nothing to fill with.
      if not modes.empty:
        fill_categorical = modes.iloc[0]

    # Assign learned state only after all validation passed, so a failed
    # fit never leaves the estimator looking half-fitted.
    self.fill_numericals_ = fill_numericals
    self.fill_categorical_ = fill_categorical
    return self

  def transform(self, X):
    """Return a copy of ``X`` with missing values replaced by the learned fills.

    Parameters
    ----------
    X : pandas.DataFrame
        Data to impute; it must contain the columns seen during ``fit``.

    Returns
    -------
    pandas.DataFrame
        Imputed copy; the input frame is not modified.

    Raises
    ------
    sklearn.exceptions.NotFittedError
        If ``fit`` has not been called yet.
    """
    check_is_fitted(self, ["fill_numericals_", "fill_categorical_"])
    X = X.copy()

    fill_groups = [self.fill_numericals_]
    # Categorical fills are absent when fit found nothing to learn.
    if self.fill_categorical_ is not None:
      fill_groups.append(self.fill_categorical_)

    for fills in fill_groups:
      for col, val in fills.items():
        X[col] = X[col].fillna(val)
    return X

In [89]:
# Take the numeric columns from the feature matrix X rather than from df:
# the target is already excluded there, so no `.drop('target')` is needed
# (that call raises KeyError whenever the target column is not int64/float64).
numeric_features = X.select_dtypes(include="number").columns

# Standardise every numeric feature; columns not listed here are dropped
# by ColumnTransformer's default `remainder='drop'`.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
    ]
)

# Impute -> scale -> classify, bundled so the same steps run at fit and predict time
pipeline = Pipeline([
    ("cleaner", cleaning_data()),    # custom imputer defined in the cell above
    ("preprocess", preprocessor),
    ("model", LogisticRegression())
])

# -------------------------
# Train & evaluate on a held-out split
# -------------------------
# 70/30 split; the fixed random_state keeps the reported accuracy reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

pipeline.fit(X_train, y_train)
print("Test accuracy:", pipeline.score(X_test, y_test))

Test accuracy: 0.9824561403508771


In [90]:
# Sanity-check the single-split result with 5-fold cross-validation
# over the full dataset (the pipeline is re-fitted on each fold)
scores = cross_val_score(estimator=pipeline, X=X, y=y, cv=5)
mean_cv_accuracy = scores.mean()
print("CV Accuracy:", mean_cv_accuracy)

CV Accuracy: 0.9806862288464524
