# Classification Problem

The objective is to apply supervised learning techniques to a real-world dataset.

#  Loading and Preprocessing 

In [72]:
# Loading the  Necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import PowerTransformer,StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


In [13]:
# Loading the dataset
data = load_breast_cancer()
data

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [14]:
# Create DataFrame
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target 
df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


In [18]:
# Check for missing values
missing = df.isnull().sum()
missing

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64

Dataset has no missing value

# Feature Scaling
Features like 'mean radius', 'mean area', and 'mean smoothness' have different units and ranges.
here,we using StandardScaler for scaling

In [19]:
# Separate features and target
X = df.drop('target', axis=1)
y = df['target']
X,y

(     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
 0          17.99         10.38          122.80     1001.0          0.11840   
 1          20.57         17.77          132.90     1326.0          0.08474   
 2          19.69         21.25          130.00     1203.0          0.10960   
 3          11.42         20.38           77.58      386.1          0.14250   
 4          20.29         14.34          135.10     1297.0          0.10030   
 ..           ...           ...             ...        ...              ...   
 564        21.56         22.39          142.00     1479.0          0.11100   
 565        20.13         28.25          131.20     1261.0          0.09780   
 566        16.60         28.08          108.30      858.1          0.08455   
 567        20.60         29.33          140.10     1265.0          0.11780   
 568         7.76         24.54           47.92      181.0          0.05263   
 
      mean compactness  mean concavity  mean conca

In [23]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train,X_test

(     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
 284        12.89         15.70           84.08      516.6          0.07818   
 118        15.78         22.91          105.70      782.6          0.11550   
 230        17.05         19.08          113.40      895.0          0.11410   
 357        13.87         16.21           88.52      593.7          0.08743   
 438        13.85         19.60           88.68      592.6          0.08684   
 ..           ...           ...             ...        ...              ...   
 303        10.49         18.61           66.86      334.3          0.10680   
 5          12.45         15.70           82.57      477.1          0.12780   
 479        16.25         19.51          109.80      815.8          0.10260   
 14         13.73         22.61           93.60      578.3          0.11310   
 431        12.40         17.68           81.47      467.8          0.10540   
 
      mean compactness  mean concavity  mean conca

In [27]:
# applying standardscaler on data
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)

In [28]:
scaled_X_test = scaler.transform(X_test)

In [29]:
scaled_X_test

array([[-1.23552028, -0.2350842 , -1.2731732 , ..., -1.75082069,
        -1.60480623, -1.01012747],
       [-0.59374002, -0.30314643, -0.6523471 , ..., -1.32709652,
        -1.18261089, -0.75268322],
       [ 0.37031952,  0.43615019,  0.4312693 , ...,  1.48318281,
         0.15810102,  1.10166939],
       ...,
       [-0.11865592, -0.30784038, -0.16611415, ..., -0.83866197,
        -0.89416446, -0.76507071],
       [-0.26590421,  2.07433761, -0.29706966, ..., -0.5448132 ,
        -0.69595825, -1.07260348],
       [ 0.63703339,  0.20849377,  0.68994684, ...,  1.33315172,
         0.24028408,  0.58408796]])

# LogisticRegression

Logistic Regression is a supervised machine learning algorithm used for classification tasks — especially binary classification
Logistic Regression predicts the probability of an instance belonging to a class 
* It calculates a linear combination of the input features
* Applies the sigmoid function to convert it into a probability between 0 and 1
* Uses a threshold (usually 0.5) to assign the class

  Why LogisticRegression for this dataset?
* Breast cancer dataset is binary classification problem
* Logistic Regression gives coefficients that indicate how each features affects the cancer risk.
* Logistic Regression performs well when the data is linearly separable,It's fast, efficient, and less prone to overfitting when features are well-       behaved
* Unlike hard classification, Logistic Regression gives probabilities.


In [34]:
model = LogisticRegression()
model.fit(scaled_X_train,y_train)

In [39]:
result = model.predict(scaled_X_test)
result,y_test

(array([1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
        1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1,
        1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
        0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
        1, 1, 1, 0]),
 192    1
 327    1
 417    0
 297    0
 34     0
       ..
 423    1
 84     1
 149    1
 543    1
 177    0
 Name: target, Length: 114, dtype: int32)

In [46]:
print("LogisticRegression Perfomance: ")
print("Confusion matrix :",confusion_matrix(y_test, result),"\n \n")
print("Classification Report: ",classification_report(y_test, result)," \n")
print("Accuracy score: ",accuracy_score(y_test, result))

LogisticRegression Perfomance: 
Confusion matrix : [[43  3]
 [ 0 68]] 
 

Classification Report:                precision    recall  f1-score   support

           0       1.00      0.93      0.97        46
           1       0.96      1.00      0.98        68

    accuracy                           0.97       114
   macro avg       0.98      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114
  

Accuracy score:  0.9736842105263158


# Decision Tree Classifier

A Decision Tree Classifier is a flowchart-like structure used for classification. It splits the dataset into branches based on feature values to predict a target class.

1. Choose the Best Feature to Split OnIt selects the feature that provides the best separation between the classes using a metric like:
  * Gini Impurity
  * Entropy / Information Gain

2. Split the Data Based on Feature Thresholds
3. Repeat Recursively
    This process continues, splitting further into branches until:
    All samples in a node belong to the same class or a stopping condition is met (e.g., max depth, min samples)
4. Prediction
   For a new data point, it follows the splits (decisions) down the tree to reach a leaf node, which gives the predicted class.

Why DecisionTreeClassifier for breast cancer dataset
*  Handles numerical features:	The dataset has 30 numerical features
*  No feature scaling needed:	Decision Trees work with raw (unscaled) data
*  Captures non-linear patterns:	Some features may have complex interactions
*  Interpretable:	We can visualize how a decision is made
*  Handles feature interactions:	Breast cancer features like texture + area may interact
*  Works with clean datasets	Sklearn’s dataset has no missing values

In [51]:
# Creating and Training the model
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train,y_train)

In [54]:
# Prediction
y_pred = model.predict(X_test)
y_pred,y_test

(array([1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
        1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
        1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
        1, 1, 1, 0]),
 192    1
 327    1
 417    0
 297    0
 34     0
       ..
 423    1
 84     1
 149    1
 543    1
 177    0
 Name: target, Length: 114, dtype: int32)

In [58]:
print("DecisionTreeClassifier:")
print("Accuracy score:",accuracy_score(y_test,y_pred),"\n")
print("Confusion matrix:",confusion_matrix(y_test,y_pred),"\n")
print("Classification Report:",classification_report(y_test,y_pred))

DecisionTreeClassifier:
Accuracy score: 0.9298245614035088 

Confusion matrix: [[39  7]
 [ 1 67]] 

Classification Report:               precision    recall  f1-score   support

           0       0.97      0.85      0.91        46
           1       0.91      0.99      0.94        68

    accuracy                           0.93       114
   macro avg       0.94      0.92      0.93       114
weighted avg       0.93      0.93      0.93       114



# Random Forest Classifier

Random Forest is an ensemble learning method that builds multiple Decision Trees and combines their results to make better, more robust predictions.
Working:
1.Bootstrapping the Data
  * It creates multiple random subsets (with replacement) from the original dataset — this is called bootstrap sampling.
2.Building Multiple Decision Trees
  * For each subset, a Decision Tree is trained — but:
     * It uses a random subset of features at each split.
     * Each tree is different, introducing diversity.
3.Combining Predictions
  * For classification: uses majority voting across all trees.
  * For regression: averages the predictions.



In [61]:
# Creating and training the model
model = RandomForestClassifier()
model.fit(scaled_X_train,y_train)

In [64]:
# Predict
y_predict = model.predict(scaled_X_test)
y_predict,y_test

(array([1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
        1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
        1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
        1, 1, 1, 0]),
 192    1
 327    1
 417    0
 297    0
 34     0
       ..
 423    1
 84     1
 149    1
 543    1
 177    0
 Name: target, Length: 114, dtype: int32)

In [65]:
print("RandomForestClassifier:")
print("Accuracy score:",accuracy_score(y_test,y_predict),"\n")
print("Confusion matrix:",confusion_matrix(y_test,y_predict),"\n")
print("Classification Report:",classification_report(y_test,y_predict))

RandomForestClassifier:
Accuracy score: 0.956140350877193 

Confusion matrix: [[41  5]
 [ 0 68]] 

Classification Report:               precision    recall  f1-score   support

           0       1.00      0.89      0.94        46
           1       0.93      1.00      0.96        68

    accuracy                           0.96       114
   macro avg       0.97      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



# SupportVectorMachine for Classification

SupporVectoeMachine (SVM) is supervised learining algorithm, used for classification and regression (SVR)
 How SVM Works
1. Goal: Find the best boundary (hyperplane) that separates classes with the maximum margin.
2. Support Vectors
    The data points closest to the hyperplane,These points "support" the margin — if they change, the boundary changes.3.3.Maximize the Margin
3. SVM tries to:
    * Maximize the distance between the hyperplane and the support vectors
    * Maximize the distance between the hyperplane and the support vectors
    * This makes the model more robust and less likely to overfit
4. Handles Non-Linearly Separable Data
    If the data isn’t separable in original dimensions, SVM uses a kernel trick to project it into a higher-dimensional space
    Common kernels:
    * linear
    * rbf (radial basis function / Gaussian)
    * poly (polynomial)


Why SVM is Suitable for the Breast Cancer Dataset
* Binary classification	:Predicts malignant (0) vs benign (1)
* Works well with high-dimensional data:	The dataset has 30 numerical features
* Effective for margin-based separation:	Cancer data often has clear boundaries
* Robust to outliers (soft margin):	Can tolerate some misclassification
* Kernel trick enables non-linear boundaries:	Helps model complex relationships
* Good generalization:	High accuracy on both training & test sets

In [67]:
# Creating nad training the model
model = SVC()
model.fit(scaled_X_train,y_train)

In [70]:
# predicting
y_result = model.predict(scaled_X_test)
y_result,y_test

(array([1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
        1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1,
        1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
        0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
        1, 1, 1, 0]),
 192    1
 327    1
 417    0
 297    0
 34     0
       ..
 423    1
 84     1
 149    1
 543    1
 177    0
 Name: target, Length: 114, dtype: int32)

In [71]:
print("SVC:")
print("Accuracy score:",accuracy_score(y_test,y_result),"\n")
print("Confusion matrix:",confusion_matrix(y_test,y_result),"\n")
print("Classification Report:",classification_report(y_test,y_result))

SVC:
Accuracy score: 0.9736842105263158 

Confusion matrix: [[43  3]
 [ 0 68]] 

Classification Report:               precision    recall  f1-score   support

           0       1.00      0.93      0.97        46
           1       0.96      1.00      0.98        68

    accuracy                           0.97       114
   macro avg       0.98      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



# KNN (K-Nearest Neighbors) 

 KNN is a instance-based learning algorithm used for classification and regression.
 How KNN Works
1. Choose a value of k 
2. For a new data point:
   *  Measure the distance (usually Euclidean) to all points in the training set.
3. Find the k nearest neighbors (closest data points).
4. Take a majority vote (for classification) or average (for regression).
5. Assign the class label based on the majority.

In [74]:
# Creating and training dataset
model = KNeighborsClassifier(n_neighbors=5)
model.fit(scaled_X_train,y_train)

In [76]:
# Prediction
knn_pred = model.predict(scaled_X_test)
knn_pred,y_test

(array([1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
        1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1,
        1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
        1, 1, 1, 0]),
 192    1
 327    1
 417    0
 297    0
 34     0
       ..
 423    1
 84     1
 149    1
 543    1
 177    0
 Name: target, Length: 114, dtype: int32)

In [77]:
print("KNN:")
print("Accuracy score:",accuracy_score(y_test,knn_pred),"\n")
print("Confusion matrix:",confusion_matrix(y_test,knn_pred),"\n")
print("Classification Report:",classification_report(y_test,knn_pred))

KNN:
Accuracy score: 0.9473684210526315 

Confusion matrix: [[40  6]
 [ 0 68]] 

Classification Report:               precision    recall  f1-score   support

           0       1.00      0.87      0.93        46
           1       0.92      1.00      0.96        68

    accuracy                           0.95       114
   macro avg       0.96      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114



#  Model Comparison

In [78]:
# Define models
models = {
      "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM (RBF Kernel)": SVC(kernel='rbf'),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5)
}



In [80]:
# Compare models
results = {}

for name, model in models.items():
    model.fit(scaled_X_train, y_train)
    y_pred = model.predict(scaled_X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"🔹 {name}")
    print(f"Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred, target_names=data.target_names))
    print('-' * 50)


🔹 Logistic Regression
Accuracy: 0.9737
              precision    recall  f1-score   support

   malignant       1.00      0.93      0.97        46
      benign       0.96      1.00      0.98        68

    accuracy                           0.97       114
   macro avg       0.98      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

--------------------------------------------------
🔹 Decision Tree
Accuracy: 0.9298
              precision    recall  f1-score   support

   malignant       0.97      0.85      0.91        46
      benign       0.91      0.99      0.94        68

    accuracy                           0.93       114
   macro avg       0.94      0.92      0.93       114
weighted avg       0.93      0.93      0.93       114

--------------------------------------------------
🔹 Random Forest
Accuracy: 0.9474
              precision    recall  f1-score   support

   malignant       1.00      0.87      0.93        46
      benign       0.92      1

Logistic Regression and SVC produce hight accuracy output.
Decision Tree produce comparatlvely less accuracy result.