In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('BreastCancerWc.csv', header=None)
data.columns = ['Sample Code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitosis', 'Class']

data.head()

Unnamed: 0,Sample Code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitosis,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [3]:
data.shape

(699, 11)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Sample Code number           699 non-null    int64 
 1   Clump Thickness              699 non-null    int64 
 2   Uniformity of Cell Size      699 non-null    int64 
 3   Uniformity of Cell Shape     699 non-null    int64 
 4   Marginal Adhesion            699 non-null    int64 
 5   Single Epithelial Cell Size  699 non-null    int64 
 6   Bare Nuclei                  699 non-null    object
 7   Bland Chromatin              699 non-null    int64 
 8   Normal Nucleoli              699 non-null    int64 
 9   Mitosis                      699 non-null    int64 
 10  Class                        699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


## **Data cleaning (Remove NA, ?, Negative values, etc.)**

In [5]:
data.isna().sum().sum()

np.int64(0)

In [7]:
(data.values == np.nan).sum()

np.int64(0)

In [8]:
(data.values == '?').sum()

np.int64(16)

In [9]:
data = data.replace('?', np.nan)

In [10]:
data.dropna(inplace=True)

In [11]:
(data.values == '?').sum()

np.int64(0)

## **Error correcting (Outlier detection and removal)**

In [12]:
data.shape

(683, 11)

In [13]:
data = data.astype(float)

In [14]:
def remove_outliers_zscore(data, threshold=3):
  zscore = np.abs((data - data.mean()) / data.std())
  out = zscore > 3
  data = data[~out.any(axis=1)]

  return data

filtered_data = remove_outliers_zscore(data)

In [15]:
filtered_data.head()

Unnamed: 0,Sample Code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitosis,Class
0,1000025.0,5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
1,1002945.0,5.0,4.0,4.0,5.0,7.0,10.0,3.0,2.0,1.0,2.0
2,1015425.0,3.0,1.0,1.0,1.0,2.0,2.0,3.0,1.0,1.0,2.0
3,1016277.0,6.0,8.0,8.0,1.0,3.0,4.0,3.0,7.0,1.0,2.0
4,1017023.0,4.0,1.0,1.0,3.0,2.0,1.0,3.0,1.0,1.0,2.0


In [16]:
filtered_data.shape

(630, 11)

In [17]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [27]:
filtered_data['Class'].unique()

array(['Y', 'N'], dtype=object)

In [28]:
filtered_data.loc[filtered_data['Class'] == 2, 'Class'] = 'Y'
filtered_data.loc[filtered_data['Class'] == 4, 'Class'] = 'N'

In [29]:
filtered_data.head()

Unnamed: 0,Sample Code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitosis,Class
0,1000025.0,5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,Y
1,1002945.0,5.0,4.0,4.0,5.0,7.0,10.0,3.0,2.0,1.0,Y
2,1015425.0,3.0,1.0,1.0,1.0,2.0,2.0,3.0,1.0,1.0,Y
3,1016277.0,6.0,8.0,8.0,1.0,3.0,4.0,3.0,7.0,1.0,Y
4,1017023.0,4.0,1.0,1.0,3.0,2.0,1.0,3.0,1.0,1.0,Y


In [48]:
le = LabelEncoder()
le.fit(filtered_data['Class'])
le.classes_

array([0, 1])

In [59]:
data1 = filtered_data

In [60]:
data1 = data1.copy()  # Make it safe to assign values
data1['Class'] = le.transform(data1['Class'])


In [61]:
data1.head()

Unnamed: 0,Sample Code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitosis,Class
0,1000025.0,5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,1
1,1002945.0,5.0,4.0,4.0,5.0,7.0,10.0,3.0,2.0,1.0,1
2,1015425.0,3.0,1.0,1.0,1.0,2.0,2.0,3.0,1.0,1.0,1
3,1016277.0,6.0,8.0,8.0,1.0,3.0,4.0,3.0,7.0,1.0,1
4,1017023.0,4.0,1.0,1.0,3.0,2.0,1.0,3.0,1.0,1.0,1


In [62]:
data1['Class'].unique()

array([1, 0])

## **Build Data model using regression and Naive Bayes methods and compare accuracy of benign and malignant tumors in Breast Cancer Dataset**

In [63]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [64]:
X = data1.drop('Class', axis=1)
y = data1['Class']

In [65]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [68]:
reg = LogisticRegression()
reg.fit(x_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [69]:
reg_pred = reg.predict(x_test)

In [70]:
print('Accuracy of Regression : ', accuracy_score(y_test, reg_pred))

Accuracy of Regression :  0.9576719576719577


In [71]:
navi = GaussianNB()
navi.fit(x_train, y_train)

In [72]:
navi_pred = navi.predict(x_test)

In [73]:
print('Accuracy of Naive Bayes : ', accuracy_score(y_test, navi_pred))

Accuracy of Naive Bayes :  0.783068783068783
