In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_breast_cancer, load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC

In [6]:
# Load the breast-cancer dataset into a feature frame and a named target series.
cancer = load_breast_cancer()
x = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
y = pd.Series(data=cancer.target, name="target")

# Preview the first rows of both, then report the dimensions and the class counts.
print(x.head(), y.head(), sep=" \n ")
print(x.shape, y.shape)
print(y.value_counts())

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst radius  worst texture  worst perimeter  \
0           

In [None]:
#The dataset contains 569 samples and 30 features. The target has 357 benign (1) and 212 malignant (0) cases.

In [8]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep its class ratio.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)
print(*(split.shape for split in (xtrain, xtest, ytrain, ytest)))

(455, 30) (114, 30) (455,) (114,)


In [None]:
#Stratifying on y keeps the class distribution similar in the train and test sets, so neither split is skewed toward one class.

In [9]:
# Standardise the features. The scaler learns its statistics from the training split only
# and is then applied unchanged to the test split, so nothing from the test set is used
# when fitting the preprocessing.
s = StandardScaler()
trainscale = s.fit_transform(xtrain)
testscale = s.transform(xtest)

# Wrap the scaled arrays as DataFrames for inspection. Reusing the index of xtrain/xtest
# keeps every row aligned with its label in ytrain/ytest; a default RangeIndex would
# silently misalign any later join or boolean mask against those series.
df1 = pd.DataFrame(trainscale, columns=xtrain.columns, index=xtrain.index)
df2 = pd.DataFrame(testscale, columns=xtest.columns, index=xtest.index)

# First three features only: training means (expected ~0) and test standard deviations.
print(df1.iloc[:, :3].mean(), df2.iloc[:, :3].std())

mean radius      -4.317426e-15
mean texture      2.246067e-15
mean perimeter   -7.383593e-16
dtype: float64 mean radius       1.037502
mean texture      0.879544
mean perimeter    1.033388
dtype: float64


In [None]:
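#The scaler is fitted on the training data only, so the test data remains unseen and no test-set information leaks into the preprocessing. The training means are ~0, while the test standard deviations are only close to 1.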

In [None]:
# Train a liblinear logistic regression on the standardised training features.
lr = LogisticRegression(random_state=42, solver="liblinear")
lr.fit(X=trainscale, y=ytrain)

In [12]:
# Predict the held-out, scaled test features with the fitted logistic regression.
pre1 = lr.predict(testscale)

# Compute all three summaries first, then print them in order:
# accuracy, confusion matrix, per-class report.
acc1 = accuracy_score(ytest, pre1)
cm1 = confusion_matrix(ytest, pre1)
rp1 = classification_report(
    ytest,
    pre1,
    target_names=["malignant", "benign"],
)
print(acc1, cm1, rp1, sep="\n")

0.9824561403508771
[[41  1]
 [ 1 71]]
              precision    recall  f1-score   support

   malignant       0.98      0.98      0.98        42
      benign       0.99      0.99      0.99        72

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114



In [None]:
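#Accuracy is the proportion of correct predictions; the confusion matrix compares true labels (rows) with predicted labels (columns); the report gives per-class precision, recall and F1. With about 98% accuracy and only one misclassification per class, the performance is accurate enough.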

In [20]:
# Bundle the scaler and the classifier so they are evaluated as a single estimator.
logreg_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(random_state=42, solver="liblinear")),
])

# 5-fold cross-validated accuracy over the full dataset.
cv_scores = cross_val_score(estimator=logreg_pipeline, X=x, y=y, scoring="accuracy", cv=5)

print("CV scores:", cv_scores)
print("Mean accuracy:", np.mean(cv_scores))
print("Std deviation:", np.std(cv_scores))

# Fit the pipeline on the training split and score it on the held-out test split.
pre2 = logreg_pipeline.fit(xtrain, ytrain).predict(xtest)
print(accuracy_score(ytest, pre2))

CV scores: [0.98245614 0.97368421 0.97368421 0.97368421 0.99115044]
Mean accuracy: 0.9789318428815402
Std deviation: 0.006990390328940835
0.9824561403508771


In [None]:
#Cross-validation gives a more robust estimate because the model is evaluated on 5 different folds. The model is accurate (mean accuracy ≈ 0.979) and consistent (standard deviation ≈ 0.007).


CV scores: [0.98245614 0.97368421 0.97368421 0.97368421 0.99115044]
Mean accuracy: 0.9789318428815402
Std deviation: 0.006990390328940835


In [23]:
# Random forest with 100 trees behind a scaler, trained on the training split.
rf_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("rf", RandomForestClassifier(random_state=42, n_estimators=100)),
    ]
)
pre3 = rf_pipeline.fit(xtrain, ytrain).predict(xtest)

# Report test accuracy, the confusion matrix, and the per-class metrics, in that order.
cm2 = confusion_matrix(ytest, pre3)
print(accuracy_score(ytest, pre3), cm2, sep="\n")
print(classification_report(ytest, pre3, target_names=["malignant", "benign"]))

0.956140350877193
[[39  3]
 [ 2 70]]
              precision    recall  f1-score   support

   malignant       0.95      0.93      0.94        42
      benign       0.96      0.97      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [None]:
#In this case the Logistic Regression pipeline performed better than the Random Forest pipeline (test accuracy 0.98 vs 0.96). It also showed higher precision and recall for each of the two classes. Both models were nevertheless precise and accurate in predicting their cases.