In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model  import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [2]:
df = pd.read_csv('./data/breast_cancer_data.csv')
df.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [3]:
X = df.drop('target',axis=1)
X.head(2)

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902


In [4]:
Y=df['target']
Y.head(2)

0    0
1    0
Name: target, dtype: int64

In [5]:
from sklearn.impute import SimpleImputer ## Handling Missing Values
from sklearn.preprocessing import StandardScaler # Handling Feature Scaling
from sklearn.decomposition import PCA # Applying PCA Algorithms
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [6]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler()),
    ('pca',PCA(n_components=2))
    ]

)
preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,X.columns),
])

In [7]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [8]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [9]:
X_train.head(5)

Unnamed: 0,num_pipeline__pca0,num_pipeline__pca1
0,-0.445946,-1.338358
1,-3.916035,-0.279343
2,7.18873,10.240651
3,-3.456324,0.062784
4,-0.140778,2.359469


In [10]:
X_test.head(5)

Unnamed: 0,num_pipeline__pca0,num_pipeline__pca1
0,1.375041,-2.462796
1,6.302611,5.110256
2,-3.170007,1.233102
3,1.678926,2.331138
4,-2.094278,1.950936


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [12]:
def evaluate_model(X_train,y_train,X_test,y_test,models):
        report = {}
        for i in range(len(models)):
            model = list(models.values())[i]
            # Train model
            model.fit(X_train,y_train)

            # Predict Testing data
            y_test_pred =model.predict(X_test)
            accuracy_score1 = accuracy_score(y_test,y_test_pred)  
            # print('Model Training Performance')
            # print(classification_report(y_test,y_pred))
            # print(confusion_matrix(y_test,y_pred))
            # print(accuracy_score(y_test,y_pred))         
            report[list(models.keys())[i]] =  accuracy_score1

        return report

In [15]:
## Train multiple models
## Model Ecaluation
# Create a dictionary of models
models = {
    'LogisticRegressor': LogisticRegression(),
    'SVC': SVC(),
    'RandomForest': RandomForestClassifier(),
    'DecisionTree': DecisionTreeClassifier(),
    'KNeighbors': KNeighborsClassifier()
}
report = evaluate_model(X_train,y_train,X_test,y_test,models)
# print(report)
for i in range(len(list(models))):    
    print(list(models.keys())[i],"=======>",report[list(models.keys())[i]])  
best_model_score = max(sorted(report.values()))
best_model_name = list(report.keys())[
    list(report.values()).index(best_model_score)
]

best_model = models[best_model_name]

print(f'Best Model Found , Model Name : {best_model_name} , accuracy Score : {best_model_score}')
print('\n====================================================================================\n')    

Best Model Found , Model Name : LogisticRegressor , accuracy Score : 0.9473684210526315


