In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# END RESULT: https://breast-cancer-model.herokuapp.com/

Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image.
The linear program used to obtain the separating plane in the 3-dimensional space is that described in: [K. P. Bennett and O. L. Mangasarian: "Robust Linear Programming Discrimination of Two Linearly Inseparable Sets", Optimization Methods and Software 1, 1992, 23-34].

This database is also available through the UW CS ftp server:
ftp ftp.cs.wisc.edu
cd math-prog/cpo-dataset/machine-learn/WDBC/

Also can be found on UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29

Attribute Information:

1) ID number
2) Diagnosis (M = malignant, B = benign)
3-32)

Ten real-valued features are computed for each cell nucleus:

*  radius (mean of distances from center to points on the perimeter)
*  texture (standard deviation of gray-scale values)
*  perimeter
*  area
*  smoothness (local variation in radius lengths)
*  compactness (perimeter^2 / area - 1.0)
*  concavity (severity of concave portions of the contour)
*  concave points (number of concave portions of the contour)
*  symmetry
*  fractal dimension ("coastline approximation" - 1)

The mean, standard error and "worst" or largest (mean of the three
largest values) of these features were computed for each image,
resulting in 30 features. For instance, field 3 is Mean Radius, field
13 is Radius SE, field 23 is Worst Radius.

All feature values are recoded with four significant digits.

Missing attribute values: none

Class distribution: 357 benign, 212 malignant

**PROJECT DETAILS:**
Breast cancer (BC) is one of the most common cancers among women worldwide, representing the majority of new cancer cases and cancer-related deaths according to global statistics, making it a significant public health problem in today’s society. The early diagnosis of BC can improve the prognosis and chance of survival significantly, as it can promote timely clinical treatment to patients. ML techniques are being broadly used in the breast cancer classification problem. They provide high classification accuracy and effective diagnostic capabilities.

Task -

* Data importing, cleaning and Inspecting (check whether any null/duplicate values are present)
* Data Preprocessing
* EDA
* Label Encoding (if required)
* Perform PCA for dimensionality reduction
* Model Building - Select the best performing classification model as final model, based upon highest accuracy score.
* Deploy it using Flask/Streamlit.

In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Data importing, cleaning and Inspecting 

In [None]:
# Read the breast-cancer table from the Kaggle input mount and preview its first rows.
DATA_PATH = "/kaggle/input/breast-cancer-wisconsin-data/data.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Dimensions of the loaded table as a (rows, columns) tuple.
df.shape                                          #569 rows 33 columns

In [None]:
# Per-column dtype and non-null count summary; used here to spot columns with missing values.
df.info()

* Since all data (apart from diagnosis) is in numerical form, label encoding is not required.
* Drop the `Unnamed: 32` column (the only column with null values).
* No null values in other columns.

In [None]:
# Remove the 'Unnamed: 32' column by rebinding df instead of mutating it in place.
# errors='ignore' keeps the cell re-runnable: a second run finds the column
# already gone instead of raising KeyError.
df = df.drop(columns=['Unnamed: 32'], errors='ignore')

Check for duplicate entries: 

In [None]:
# Rows whose id repeats an id seen earlier in the table; an empty frame means no duplicates.
is_repeated_id = df.duplicated(subset='id')
dup = df.loc[is_repeated_id]
dup

No duplicate values.

In [None]:
# Summary statistics of the table with the identifier column left out.
without_id = df.drop('id', axis=1)
without_id.describe()

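The `id` column is left in `df` (the drop in the next cell is commented out); cells that do not need it exclude it on the fly with `df.drop(columns=['id'])`.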

In [None]:
#df.drop(columns=['id'],inplace=True)

# EDA

In [None]:
# Histograms (20 bins each) of the table with the identifier column left out.
hist_source = df.drop('id', axis=1)
hist_source.hist(figsize=(18, 16), bins=20)

In [None]:
# Correlation heatmap of the numeric feature columns.
# `diagnosis` holds the string labels M/B, so it is dropped together with `id`:
# DataFrame.corr() only skipped non-numeric columns silently before pandas 2.0
# and raises on them in newer versions.
fig, ax = plt.subplots(figsize=(20,20))
sns.heatmap(df.drop(columns=['id','diagnosis']).corr(), ax=ax);

In [None]:
# Box plots of all feature columns on a single wide axis, to compare their value ranges.
fig, ax = plt.subplots(1, 1, figsize=(100, 25))
cols = df.drop(columns=['id', 'diagnosis'])
cols.boxplot(ax=ax)

Clearly, the features span very different value ranges and contain outliers.

# MODEL

In [None]:
# Separate the diagnosis target from the predictor columns (identifier excluded).
y = df['diagnosis']
features = df.drop(['diagnosis', 'id'], axis=1)
features.shape

In [None]:
# Standardise the feature columns with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on every row of `features`, i.e. before the
# train/test split further down — confirm that is intended.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(features)

PCA:

In [None]:
# Project the standardised features onto their principal components.
# No n_components is passed, so PCA() runs with scikit-learn's defaults
# (NOTE(review): the default appears to keep every component, i.e. no actual
# dimensionality reduction — confirm against the shape shown below).
pca = PCA()
X = pca.fit_transform(x_scaled) 
X.shape

Test-train split

In [None]:
# Hold out 30% of the rows for testing, with a fixed seed for reproducibility.
# NOTE(review): the split is taken over the raw `features` frame, not over the
# scaled/PCA array `X` built above — confirm this is intended.
split = train_test_split(
    features,
    y,
    test_size=0.30,
    random_state=42,
)
X_train, X_test, y_train, y_test = split

In [None]:
# Fit each candidate classifier on the training split and predict the held-out test split.
# Every model is fitted on (X_train, y_train) so that it is trained in the same
# feature space as X_test and never sees the test rows.

# Logistic regression
lr = LogisticRegression(random_state=0)
lr_pred = lr.fit(X_train, y_train).predict(X_test)

# Linear support-vector classifier
svmclf = svm.LinearSVC(random_state=0)
svm_pred = svmclf.fit(X_train, y_train).predict(X_test)

# Gaussian naive Bayes
gnb = GaussianNB()
gnb_pred = gnb.fit(X_train, y_train).predict(X_test)

# Decision tree
dt = DecisionTreeClassifier(random_state=0)
dt_pred = dt.fit(X_train, y_train).predict(X_test)

# Random forest (was fitted on the full PCA array X, which includes the test rows
# and lives in a different feature space than X_test)
rf = RandomForestClassifier(max_depth=2, random_state=0)
rf_pred = rf.fit(X_train, y_train).predict(X_test)

# k-nearest neighbours (same fix as the random forest)
knn = KNeighborsClassifier()
knn_pred = knn.fit(X_train, y_train).predict(X_test)

In [None]:
# Tabulate the test-set accuracy of every fitted model, rounded to two decimals.
models = ['LR', 'SVM', 'GNB', 'DT', 'RF', 'KNN']
preds = [lr_pred, svm_pred, gnb_pred, dt_pred, rf_pred, knn_pred]

# accuracy_score takes (y_true, y_pred) in that order. The builtin round() is used
# because it works whether the metric comes back as a NumPy scalar or as a plain
# Python float (which has no .round() method).
acc = [round(accuracy_score(y_test, pred), 2) for pred in preds]

scoresdf = pd.DataFrame({'MODEL': models, 'ACCURACY SCORE': acc})
scoresdf

Logistic Regression (LR) has the highest accuracy score.

Evaluate LR model

In [None]:
# Confusion matrix of the logistic-regression predictions against the true test labels.
lr_confusion = confusion_matrix(y_test, lr_pred)
print(lr_confusion)

In [None]:
# Text report of per-class metrics for the logistic-regression predictions on the test split.
lr_report = classification_report(y_test, lr_pred)
print(lr_report)

In [None]:
# Persist the fitted logistic-regression model to model.pkl, then read it back.
# `with` guarantees each file handle is flushed and closed before the next step.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(lr, model_file)

# NOTE: pickle.load can execute arbitrary code; only unpickle files you created yourself.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)