In [None]:
# IMPORTING LIBRARIES
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# PREPARING THE DATASET
# Column 0 is skipped (presumably a sample identifier — confirm against the CSV),
# the last column is the class label, and everything in between is a feature.
dataset = pd.read_csv('breast_cancer.csv')
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values

# Binary-encode the target: raw class 2 -> 0, any other value -> 1.
# NOTE(review): the UCI Breast Cancer Wisconsin (Original) data uses
# 2 = benign and 4 = malignant, so after this step 0 = benign and
# 1 = malignant — confirm this CSV follows the same convention.
y = np.where(y == 2, 0, 1)

# FOR PLOTTING THE DATA
# Per-model results, appended to by each model cell and read by the comparison
# chart. Re-running this cell resets all three lists.
models = []
accuracies = []
std_devs = []

# The trailing semicolon keeps the PairGrid repr out of the cell output.
sns.pairplot(dataset);

In [None]:
# SPLITTING THE DATASET INTO TRAINING SET AND TEST SET
# 20% of the rows are held out for testing; random_state is fixed so the
# partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# USING LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression

# Fit on the whole training split so a trained estimator stays available
# under the name classifier_lr.
classifier_lr = LogisticRegression(random_state=0).fit(X_train, y_train)

# 10-fold cross-validation on the training split.
accuracies_lr = cross_val_score(classifier_lr, X_train, y_train, cv=10)

# Record the mean accuracy and its spread (both in percent) for the chart.
models += ['Logistic Regression']
accuracies += [accuracies_lr.mean() * 100]
std_devs += [accuracies_lr.std() * 100]


In [None]:
# USING K-NEAREST NEIGHBOURS
from sklearn.neighbors import KNeighborsClassifier

# Five neighbours with the Minkowski metric at p=2.
classifier_knn = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
).fit(X_train, y_train)

# 10-fold cross-validation on the training split.
accuracies_knn = cross_val_score(classifier_knn, X_train, y_train, cv=10)

# Record the mean accuracy and its spread (both in percent) for the chart.
models += ['K-Nearest Neighbours']
accuracies += [accuracies_knn.mean() * 100]
std_devs += [accuracies_knn.std() * 100]

In [None]:
# USING SUPPORT VECTOR MACHINE
from sklearn.svm import SVC

# Support vector classifier with a linear kernel.
classifier_svm = SVC(kernel='linear', random_state=0).fit(X_train, y_train)

# 10-fold cross-validation on the training split.
accuracies_svm = cross_val_score(classifier_svm, X_train, y_train, cv=10)

# Record the mean accuracy and its spread (both in percent) for the chart.
models += ['Support Vector Machine']
accuracies += [accuracies_svm.mean() * 100]
std_devs += [accuracies_svm.std() * 100]

In [None]:
# USING KERNEL SUPPORT VECTOR MACHINE
from sklearn.svm import SVC

# Same estimator class as the linear SVM cell, but with the RBF kernel.
classifier_ksvm = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)

# 10-fold cross-validation on the training split.
accuracies_ksvm = cross_val_score(classifier_ksvm, X_train, y_train, cv=10)

# Record the mean accuracy and its spread (both in percent) for the chart.
models += ['Kernel Support Vector Machine']
accuracies += [accuracies_ksvm.mean() * 100]
std_devs += [accuracies_ksvm.std() * 100]

In [None]:
# USING RANDOM FOREST CLASSIFIER
from sklearn.ensemble import RandomForestClassifier

# Ten trees split on the entropy criterion; the seed is fixed for repeatability.
classifier_rf = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)

# 10-fold cross-validation on the training split.
accuracies_rf = cross_val_score(classifier_rf, X_train, y_train, cv=10)

# Record the mean accuracy and its spread (both in percent) for the chart.
models += ['Random Forest Classifier']
accuracies += [accuracies_rf.mean() * 100]
std_devs += [accuracies_rf.std() * 100]

In [None]:
# USING NAIVE BAYES
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with library defaults.
classifier_nb = GaussianNB().fit(X_train, y_train)

# 10-fold cross-validation on the training split.
accuracies_nb = cross_val_score(classifier_nb, X_train, y_train, cv=10)

# Record the mean accuracy and its spread (both in percent) for the chart.
models += ['Naive Bayes']
accuracies += [accuracies_nb.mean() * 100]
std_devs += [accuracies_nb.std() * 100]

In [None]:
# USING DECISION TREE CLASSIFIER
from sklearn.tree import DecisionTreeClassifier

# Single tree split on the entropy criterion; the seed is fixed for repeatability.
classifier_dt = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)

# 10-fold cross-validation on the training split.
accuracies_dt = cross_val_score(classifier_dt, X_train, y_train, cv=10)

# Record the mean accuracy and its spread (both in percent) for the chart.
models += ['Decision Tree Classifier']
accuracies += [accuracies_dt.mean() * 100]
std_devs += [accuracies_dt.std() * 100]

In [None]:
# USING XGBOOST
from xgboost import XGBClassifier

# Gradient-boosted trees with the binary logistic objective.
classifier_xg = XGBClassifier(objective='binary:logistic')
classifier_xg.fit(X_train, y_train)

# 10-fold cross-validation on the training split.
accuracies_xg = cross_val_score(classifier_xg, X_train, y_train, cv=10)

# Record the mean accuracy and its spread (both in percent) for the chart.
models += ['XGBoost']
accuracies += [accuracies_xg.mean() * 100]
std_devs += [accuracies_xg.std() * 100]

In [None]:
#USING CATBOOST
from catboost import CatBoostClassifier

# verbose=0 silences CatBoost's per-iteration training log. Without it this
# cell prints that log for the direct fit and again for each of the ten
# cross-validation fits, burying the rest of the notebook output.
classifier_cat = CatBoostClassifier(verbose=0)
classifier_cat.fit(X_train, y_train)

#k-fold cross validation
accuracies_cat = cross_val_score(estimator = classifier_cat, X = X_train, y = y_train, cv = 10)

#DATA GATHERING FOR PLOTTING
# Mean accuracy and its spread, both in percent, for the comparison chart.
models.append('CatBoost')
accuracies.append(accuracies_cat.mean() * 100)
std_devs.append(accuracies_cat.std() * 100)

In [None]:
#PRINTING ACCURACY AND STANDARD DEVIATION
# (display name, cross-validation score array) for every model evaluated above,
# in the order they should be reported.
cv_results = [
    ("Logistic Regression", accuracies_lr),
    ("K-Nearest Neighbours", accuracies_knn),
    ("Support Vector Machine", accuracies_svm),
    ("Kernel Support Vector Machine", accuracies_ksvm),
    ("Random Forest", accuracies_rf),  # was misspelled "Randon Forest" in the accuracy line
    ("Naive Bayes", accuracies_nb),
    ("Decision Tree", accuracies_dt),
    ("XGBoost", accuracies_xg),
    ("CatBoost", accuracies_cat),
]

for position, (label, scores) in enumerate(cv_results):
    # Separator goes between models only, not before the first or after the last.
    if position:
        print("___________________________________________________")
    print("Accuracy with {}: {:.2f} %".format(label, scores.mean()*100))
    print("Standard Deviation with {}: {:.2f} %".format(label, scores.std()*100))

In [None]:
#PLOTTING THE ACCURACY AND STANDARD DEVIATION
import matplotlib.pyplot as plt

# models, accuracies and std_devs must already be populated by the model cells;
# with empty lists the min()/max() calls below raise ValueError.
LABEL_OFFSET = 0.3  # vertical gap (in accuracy points) between a bar/error bar and its label

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(models, accuracies, yerr=std_devs, capsize=7, color='red', edgecolor='black')

plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_ylabel('Accuracy (%)')
ax.set_title('Model Accuracy Comparison (with Standard Deviation)')

# The ± labels sit above the error bars, i.e. at acc + std + LABEL_OFFSET.
# Keep the usual 100% ceiling, but add headroom whenever the highest label
# would otherwise land outside the axes and collide with the title.
highest_label = max(acc + std for acc, std in zip(accuracies, std_devs)) + LABEL_OFFSET
ax.set_ylim(min(accuracies) - 5, max(100, highest_label + 1.5))
ax.grid(axis='y', linestyle='--', alpha=0.6)

# Add accuracy and std deviation labels
for bar, acc, std in zip(bars, accuracies, std_devs):
    bar_x = bar.get_x() + bar.get_width() / 2.0
    bar_height = bar.get_height()

    # Accuracy label just above the bar
    ax.text(bar_x, bar_height + LABEL_OFFSET, f'{acc:.2f}%', ha='center', va='bottom', fontsize=9, color='black')

    # Standard deviation label above the error bar
    ax.text(bar_x, bar_height + std + LABEL_OFFSET, f'±{std:.2f}', ha='center', va='bottom', fontsize=8, color='blue')

fig.tight_layout()
plt.show()

In [None]:
#GOING FORWARD WITH KNN MODEL AS IT HAS BEST ACCURACY
##APPLYING K-FOLDS CROSS VALIDATION & GRID SEARCH

# Baseline: cross-validation result of the untuned KNN model, for comparison
# with the tuned score printed further down.
print("K-Fold validation Accuracy with K-Nearest Neighbours: {:.2f} %".format(accuracies_knn.mean()*100))
print("K-Fold validation Standard Deviation with K-Nearest Neighbours: {:.2f} %".format(accuracies_knn.std()*100))
print("______________________________________________")

#Applying Grid Search for Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

# The grid lists only distinct models. `p` is the power of the Minkowski metric
# and has no effect with 'euclidean' or 'manhattan', while 'minkowski' with
# p=2 / p=1 is the same as 'euclidean' / 'manhattan'. Crossing all three
# metrics with p in {1, 2} therefore fitted every model three times and could
# report meaningless winners such as metric='euclidean', p=1.
parameters = {
    'n_neighbors': list(range(3, 21, 2)),          # odd values 3, 5, ..., 19
    'weights': ['uniform', 'distance'],            # 'uniform' = equal weight, 'distance' = closer neighbors weigh more
    'metric': ['euclidean', 'manhattan'],          # the two distinct distance metrics
}
grid_search = GridSearchCV(estimator = classifier_knn,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)  # use all available cores
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("______________________________________________")
print("Best Accuracy after Grid Search: {:.2f} %".format(best_accuracy*100))
print("Best Parameters after Grid Search:", best_parameters)


In [None]:
#USING KNN WITH BEST PARAMETERS
from sklearn.neighbors import KNeighborsClassifier

# Build the final model from whatever the grid search actually selected,
# rather than from values copied by hand out of an earlier run. Hand-copied
# values silently go stale when the data, the split or the grid changes.
classifier_knn_updated = KNeighborsClassifier(**grid_search.best_params_)
classifier_knn_updated.fit(X_train, y_train)

In [None]:
# CONFUSION MATRIX AND ACCURACY SCORE
# Score the tuned KNN model on the held-out test split.
y_pred = classifier_knn_updated.predict(X_test)

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Left as the final expression so the notebook displays the test accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
#PREDICTING A NEW RESULT
# One unseen sample: nine feature values in the same column order as X.
new_sample = [[8,7,5,10,7,9,5,5,4]]

# Predict once and reuse the result; predict() returns one label per input row.
predicted_label = classifier_knn_updated.predict(new_sample)[0]

# The target was encoded with np.where(y == 2, 0, 1), so label 1 stands for
# the raw class that is not 2. In the UCI Breast Cancer Wisconsin (Original)
# data 2 = benign and 4 = malignant, so label 1 means malignant. The
# previous version printed the two messages the wrong way round.
if predicted_label == 1:
    print("The Tumor is Malignant")
else:
    print("The Tumor is Benign")