In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [2]:
df=pd.read_csv('BreastCancerWc.csv')

print(df.head(5))
df.columns

   Sample code number  Clump Thickness  Uniformity of Cell Size  \
0             1000025                5                        1   
1             1002945                5                        4   
2             1015425                3                        1   
3             1016277                6                        8   
4             1017023                4                        1   

   Uniformity of Cell Shape  Marginal Adhesion  Single Epithelial Cell Size  \
0                         1                  1                            2   
1                         4                  5                            7   
2                         1                  1                            2   
3                         8                  1                            3   
4                         1                  3                            2   

  Bare Nuclei  Bland Chromatin  Normal Nucleoli  Mitoses  Class  
0           1                3                1        1

Index(['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size',
       'Uniformity of Cell Shape', 'Marginal Adhesion',
       'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
       'Normal Nucleoli', 'Mitoses', 'Class'],
      dtype='object')

In [3]:
df.dtypes

Sample code number              int64
Clump Thickness                 int64
Uniformity of Cell Size         int64
Uniformity of Cell Shape        int64
Marginal Adhesion               int64
Single Epithelial Cell Size     int64
Bare Nuclei                    object
Bland Chromatin                 int64
Normal Nucleoli                 int64
Mitoses                         int64
Class                           int64
dtype: object

In [4]:
print((df.values=='?').sum())

16


In [5]:
# a. Data cleaning(Remove NA, ?, Negative values etc.)  

df.replace('?',pd.NA,inplace=True)
df.dropna(inplace=True)

In [6]:
# b. Error correcting(Outlier detection and removal)  


# Convert 'Bare Nuclei' to numeric (handle '?')
df['Bare Nuclei'] = pd.to_numeric(df['Bare Nuclei'], errors='coerce')

# Define feature columns (excluding ID and Class)
feature_cols = df.columns.drop(['Sample code number', 'Class'])

# Keep only rows within 1st to 99th percentile for all features
for col in feature_cols:
    lower = df[col].quantile(0.01)
    upper = df[col].quantile(0.99)
    df = df[(df[col] >= lower) & (df[col] <= upper)]


In [7]:
# c. Data transformation   

scaler=StandardScaler()
features=df.drop(['Sample code number','Class'],axis=1)
feature_scaled=scaler.fit_transform(features)

In [8]:
# d. Build Data model using regression and Naïve Bayes

X=feature_scaled
y=df['Class']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1)

# Logistic Regression
lr=LogisticRegression()
lr.fit(X_train,y_train)
lr_pred=lr.predict(X_test)

# Naive Bayes
nb=GaussianNB()
nb.fit(X_train,y_train)
nb_pred=nb.predict(X_test)

print('Logistic Regression Accuracy:', accuracy_score(y_test,lr_pred))
print('Naive Bayes Accuracy:', accuracy_score(y_test,nb_pred))


Logistic Regression Accuracy: 0.9854014598540146
Naive Bayes Accuracy: 1.0
