<a href="https://colab.research.google.com/github/Niharika1-hash/Breast-Cancer-Detection-using-Machine-Learning/blob/main/Breast_Cancer_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Project : Breast Cancer Prediction**
***By*** 

***Niharika Poddar : 1RN18CS068***

***Pooja R : 1RN18CS071***






## **Problem Statement**

Given a labelled dataset of breast tumour attributes, train several machine-learning models to predict whether a patient's tumour is malignant (breast cancer) or benign.


**Training and Test Dataset source** : https://www.kaggle.com/uciml/breast-cancer-wisconsin-data

## **Downloading Dataset from Kaggle**

In [7]:
# Install the Kaggle command-line client used by the download step below.
! pip install kaggle



In [8]:
! mkdir ~/.kaggle

In [9]:
# Copy kaggle.json from the working directory into ~/.kaggle/
# (presumably the Kaggle API credentials file; it must be uploaded first).
! cp kaggle.json ~/.kaggle/

In [10]:
# Restrict kaggle.json to owner read/write only.
! chmod 600 ~/.kaggle/kaggle.json

In [16]:
# Download the uciml/breast-cancer-wisconsin-data dataset archive into the working directory.
! kaggle datasets download -d uciml/breast-cancer-wisconsin-data

breast-cancer-wisconsin-data.zip: Skipping, found more recently modified local copy (use --force to force download)


In [21]:
! unzip breast-cancer-wisconsin-data.zip

Archive:  breast-cancer-wisconsin-data.zip
  inflating: data.csv                


## **Dataset Analysis and Preprocessing**


 > ### Import libraries







In [1]:
!pip install sklearn
#importing libraries
import matplotlib.pyplot as plt
import numpy
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix, f1_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler




In [22]:
# List the working directory to confirm data.csv was extracted.
!ls


breast-cancer-wisconsin-data.zip  data.csv  kaggle.json  sample_data




> ### Dataset Description



In [24]:
# Load the extracted CSV into a DataFrame.
df = pd.read_csv("data.csv")

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Data preprocessing: inspect column dtypes and non-null counts to spot
# columns with missing values.
df.info() #to find the number of null values in each column

In [None]:
# Drop every column that contains at least one missing value (the original
# note identifies this as column no. 32), then show the resulting shape.
df = df.dropna(axis=1)
df.shape

In [None]:
# Summary statistics for the numeric columns.
df.describe()


In [None]:
# Count of malignant and benign instances (checks the class balance).
df['diagnosis'].value_counts()

In [None]:
# Bar chart of the class counts. The column is passed by keyword because
# recent seaborn releases treat the first positional argument as `data`,
# not as the x variable.
sns.countplot(x='diagnosis', data=df);



> ### Relabelling Target Values



In [None]:
# Encode the diagnosis labels as integers: M -> 1 and B -> 0
# (LabelEncoder numbers the classes in sorted order).
labelencoder_Y = LabelEncoder()
# Assign by column name so the column is replaced and takes the encoder's
# integer dtype; a positional `df.iloc[:, 1] = ...` writes in place on recent
# pandas and can leave the column as object dtype, which breaks later steps.
df['diagnosis'] = labelencoder_Y.fit_transform(df['diagnosis'])

In [None]:
# Preview the data again to confirm the diagnosis column is now numeric.
df.head()


> ### Data Visualisation

In [None]:
# Pairwise scatter plots for the columns at positions 1-4, coloured by diagnosis.
sns.pairplot(df.iloc[:,1:5],hue="diagnosis")

In [None]:
# Correlation matrix for the columns at positions 1-31 (column 0 is skipped).
df.iloc[:,1:32].corr()

In [None]:
# Heatmap of the correlations among the columns at positions 1-9,
# with each cell annotated as a whole-number percentage.
plt.figure(figsize=(10,10))
sns.heatmap(df.iloc[:,1:10].corr(),annot=True,fmt=".0%")

> ### Splitting Data into Training Dataset and Testing Dataset

In [None]:
# Split the dataset into the independent feature matrix (x) and the
# dependent target vector (y, the column at position 1).
# NOTE(review): the slice 2:31 selects 29 columns (positions 2-30). If the
# frame has a column at position 31, as the 1:32 correlation slice earlier
# suggests, that last feature is left out. The CNN input shape below is
# hard-coded to 29, so confirm before widening the slice.
x= df.iloc[:,2:31].values
y= df.iloc[:,1].values
print(x)

In [None]:
# Split into training (80%) and testing (20%) sets; random_state is fixed
# so the split is reproducible.

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=0)

In [None]:
# Feature scaling: learn the mean and standard deviation from the training
# split only, then apply that same transform to the test split. Fitting a
# second scaler on the test data would put the two splits on different scales
# and leak test-set statistics into the evaluation.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

## **Technique 1: Classifiers**

> ### Import Libraries

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

> ### Logistic Regression


In [None]:

def logistic_regression(x_train,y_train):
  """Fit a logistic-regression classifier and return the fitted model.

  x_train holds the training features and y_train the matching labels.
  random_state is fixed at 0 so repeated runs give the same model.
  """
  classifier = LogisticRegression(random_state=0)
  return classifier.fit(x_train, y_train)


> ### Decision Tree

In [None]:
def decision_tree(x_train,y_train):
  """Fit an entropy-criterion decision tree and return the fitted model.

  x_train holds the training features and y_train the matching labels.
  random_state is fixed at 0 so repeated runs give the same tree.
  """
  classifier = DecisionTreeClassifier(criterion="entropy", random_state=0)
  return classifier.fit(x_train, y_train)

> ### Random Forest

In [None]:
def random_forest(x_train,y_train):
  """Fit a 10-tree, entropy-criterion random forest and return the fitted model.

  x_train holds the training features and y_train the matching labels.
  random_state is fixed at 0 so repeated runs give the same forest.
  """
  classifier = RandomForestClassifier(
    n_estimators=10,
    criterion="entropy",
    random_state=0,
  )
  return classifier.fit(x_train, y_train)

In [None]:
# Train the three classifiers on the same training split. Order matters:
# index 0 = logistic regression, 1 = decision tree, 2 = random forest.
model = [
  trainer(x_train, y_train)
  for trainer in (logistic_regression, decision_tree, random_forest)
]

> ### Accuracy Report


In [None]:
# Evaluate every trained model on the held-out test split.
# Predictions are computed once per model and reused for both metrics.
for i, clf in enumerate(model):
  y_pred = clf.predict(x_test)
  print('\n')
  print("Model",i)
  print(classification_report(y_test,y_pred))
  print('Accuracy : ',accuracy_score(y_test,y_pred))

> ### Prediction


In [None]:
# Predictions of the random forest (third entry in `model`) on the test
# split, printed next to the true labels for a side-by-side comparison.
pred = model[2].predict(x_test)
print('Predicted Values:\n',pred)
print('Actual Values:\n',y_test)

## **Technique 2: Feature Selection with Random Forest**

> ### Splitting Data into Training Dataset and Testing Dataset

In [None]:
# Re-split the unscaled feature matrix x: 70% train, 30% test, with a fixed
# random_state so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

> ### Creation of RFE object and Ranking each Feature

In [None]:

# Recursive feature elimination around a random forest: keep the 16
# top-ranked features, with step=1 between rounds, and fit on the training split.
clf_rf_2 = RandomForestClassifier(random_state=43)
rfe = RFE(
    estimator=clf_rf_2,
    n_features_to_select=16,
    step=1,
).fit(x_train, y_train)


> ### Accuracy Report and Confusion Matrix




In [None]:
# Score the RFE-selected model on the test split. Predictions are computed
# once and shared by every metric instead of re-running predict each time.
y_pred = rfe.predict(x_test)

recall = recall_score(y_test, y_pred)
print('Recall is: ', recall)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy is: ', accuracy)
f1 = f1_score(y_test, y_pred)
print('F1 score is: ', f1)

# Confusion matrix drawn as an annotated heatmap with integer cell counts.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm,annot=True,fmt="d");

## **Technique 3: CNN**

> ### Import Libraries

In [None]:
# Install the pinned TensorFlow GPU build used by the CNN section.
# NOTE(review): this pins an old release under the `tensorflow-gpu` name;
# confirm it still installs on the current runtime.
!pip install tensorflow-gpu==2.0.0

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.layers import Conv1D, MaxPool1D
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

> ### Splitting Data into Training Dataset and Testing Dataset

In [None]:
# Re-split the unscaled feature matrix x: 70% train, 30% test, with a fixed
# random_state so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

> ### Scaling the Training and Testing Dataset


In [None]:
# Standardise with statistics learned from the training split only, then
# append a trailing channel axis so each sample has shape (n_features, 1),
# as the Conv1D layers expect.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

x_train = x_train.reshape(x_train.shape + (1,))
x_test = x_test.reshape(x_test.shape + (1,))

> ### Training the CNN 

In [None]:
# Number of training epochs; also used when plotting the learning curves.
epochs = 300

# 1-D CNN over the feature vector. The input shape is taken from the prepared
# training array, (n_features, 1), rather than hard-coded, so the model keeps
# working if the number of selected feature columns changes.
cnn_model = Sequential()

# Convolution block 1: 32 filters of width 2, light dropout.
cnn_model.add(Conv1D(filters=32,kernel_size = 2,activation = 'relu',input_shape=x_train.shape[1:]))
cnn_model.add(BatchNormalization())
cnn_model.add(Dropout(0.2))

# Convolution block 2: same filters, heavier dropout.
cnn_model.add(Conv1D(filters=32,kernel_size = 2,activation = 'relu'))
cnn_model.add(BatchNormalization())
cnn_model.add(Dropout(0.5))

# Dense classifier head.
cnn_model.add(Flatten())
cnn_model.add(Dense(64, activation = 'relu'))
cnn_model.add(Dropout(0.5))

# A single sigmoid unit outputs one value in [0, 1] per sample, for the binary target.
cnn_model.add(Dense(1, activation = 'sigmoid'))

> ### Training Summary

In [None]:
# Print the layer-by-layer architecture and parameter counts.
cnn_model.summary()

In [None]:
# Compile for binary classification. `learning_rate` is the supported keyword;
# `lr` is a deprecated alias that newer Keras releases reject.
cnn_model.compile(optimizer = Adam(learning_rate = 0.00005), loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
def plot_learningCurve(history,epoch):
  """Plot training vs. validation curves: accuracy first, then loss.

  history is the object returned by model.fit; its `history` dict must
  contain 'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
  epoch is the number of epochs trained; the x-axis runs from 1 to epoch.
  """
  x_axis = range(1, epoch + 1)

  # (train key, validation key, figure title, y-axis label)
  figures = (
    ('accuracy', 'val_accuracy', 'Model accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
  )

  for train_key, val_key, title, y_label in figures:
    plt.plot(x_axis, history.history[train_key])
    plt.plot(x_axis, history.history[val_key])
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train','Val'], loc = 'upper left')
    plt.show()

In [None]:
# Train the CNN for `epochs` epochs. The test split is passed as the
# validation data, so the validation curves are measured on the test set.
history = cnn_model.fit(x_train,y_train,epochs = epochs, validation_data=(x_test,y_test), verbose = 1)

> ### Learning Curve Visualisation

In [None]:
# Plot the accuracy and loss curves recorded during training.
plot_learningCurve(history,epochs)

## **Final Output Comparison**