In [3]:
#import necessary libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score, classification_report



In [4]:
# Load the Breast Cancer Dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
column_names = ["Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape", "Marginal Adhesion", "Single Epithelial Cell Size",
                "Bare Nuclei", "Bland Chromatin","Normal Nucleoli", "Mitoses", "Class"]
df = pd.read_csv(url, names=column_names)

In [5]:
df

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
1000025,5,1,1,1,2,1,3,1,1,2
1002945,5,4,4,5,7,10,3,2,1,2
1015425,3,1,1,1,2,2,3,1,1,2
1016277,6,8,8,1,3,4,3,7,1,2
1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
776715,3,1,1,1,3,2,1,1,1,2
841769,2,1,1,1,2,1,1,1,1,2
888820,5,10,10,3,7,3,8,10,2,4
897471,4,8,6,4,3,4,10,6,1,4


In [6]:
# Data Preprocessing

# Check for missing values and handle them (if any)
# In this dataset, missing values are represented by "?" so replace them with NaN
df.replace('?', pd.NA, inplace=True)
df = df.dropna()

# Convert the 'Bare Nuclei' column to numeric
df.loc[:, 'Bare Nuclei'] = pd.to_numeric(df['Bare Nuclei'], errors='coerce')

# Convert the 'Class' column to binary labels (2 for benign, 4 for malignant)
df.loc[:, 'Class'] = df['Class'].map({2: 0, 4: 1})



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'Bare Nuclei'] = pd.to_numeric(df['Bare Nuclei'], errors='coerce')
  df.loc[:, 'Bare Nuclei'] = pd.to_numeric(df['Bare Nuclei'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'Class'] = df['Class'].map({2: 0, 4: 1})


In [7]:
# Split the dataset into features (X) and target variable (y)
X = df.drop('Class', axis=1)
y = df['Class']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now, X_train, X_test, y_train, and y_test are the training and testing sets.


In [8]:
# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)

In [9]:
# Build the neural network model
model_ann = Sequential()
model_ann.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model_ann.add(Dense(64, activation='relu'))
model_ann.add(Dense(1, activation='sigmoid'))

# Compile the model
model_ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model on the scaled training set
model_ann.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

# Make predictions on the scaled testing set
y_pred_ann = (model_ann.predict(X_test) > 0.5).astype("int32")

# Evaluate the model's performance
accuracy_ann = accuracy_score(y_test, y_pred_ann)
classification_rep_ann = classification_report(y_test, y_pred_ann)

# Print the results
print("ANN Accuracy:", accuracy_ann)
print("\nANN Classification Report:\n", classification_rep_ann)





Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
ANN Accuracy: 0.9708029197080292

ANN Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97        79
           1       0.98      0.95      0.96        58

    accuracy                           0.97       137
   macro avg       0.97      0.97      0.97       137
weighted avg       0.97      0.97      0.97       137



In [10]:
# Create a Decision Tree model
model_decision_tree = DecisionTreeClassifier(random_state=42)

# Train the model on the training set
model_decision_tree.fit(X_train, y_train)

# Make predictions on the testing set
y_pred_decision_tree = model_decision_tree.predict(X_test)

# Evaluate the model's performance
accuracy_decision_tree = accuracy_score(y_test, y_pred_decision_tree)
classification_rep_decision_tree = classification_report(y_test, y_pred_decision_tree)

# Print the results
print("Decision Tree Accuracy:", accuracy_decision_tree)
print("\nDecision Tree Classification Report:\n", classification_rep_decision_tree)




Decision Tree Accuracy: 0.9343065693430657

Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.97      0.94        79
           1       0.96      0.88      0.92        58

    accuracy                           0.93       137
   macro avg       0.94      0.93      0.93       137
weighted avg       0.94      0.93      0.93       137



In [11]:
# Create a Random Forest model
model_random_forest = RandomForestClassifier(random_state=42)

# Train the model on the training set
model_random_forest.fit(X_train, y_train)

# Make predictions on the testing set
y_pred_random_forest = model_random_forest.predict(X_test)

# Evaluate the model's performance
accuracy_random_forest = accuracy_score(y_test, y_pred_random_forest)
classification_rep_random_forest = classification_report(y_test, y_pred_random_forest)

# Print the results
print("Random Forest Accuracy:", accuracy_random_forest)
print("\nRandom Forest Classification Report:\n", classification_rep_random_forest)




Random Forest Accuracy: 0.9562043795620438

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.99      0.96        79
           1       0.98      0.91      0.95        58

    accuracy                           0.96       137
   macro avg       0.96      0.95      0.95       137
weighted avg       0.96      0.96      0.96       137

