In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [57]:
# Load dataset without headers
data = pd.read_csv('~/Downloads/data/wdbc.data', header=None)  # Ensure you have the correct file path

# Assign column names manually
columns = ['ID', 'Diagnosis', 'radius1', 'texture1', 'perimeter1', 'area1', 'smoothness1', 'compactness1',
           'concavity1', 'concave_points1', 'symmetry1', 'fractal_dimension1', 'radius2', 'texture2', 'perimeter2',
           'area2', 'smoothness2', 'compactness2', 'concavity2', 'concave_points2', 'symmetry2', 'fractal_dimension2',
           'radius3', 'texture3', 'perimeter3', 'area3', 'smoothness3', 'compactness3', 'concavity3',
           'concave_points3', 'symmetry3', 'fractal_dimension3']
data.columns = columns

# Drop ID column
data = data.drop(columns=['ID'])

In [59]:
# Convert Diagnosis to numerical values
data['Diagnosis'] = data['Diagnosis'].map({'M': 1, 'B': 0})

In [61]:
# Select only the ten real-valued features (mean, standard error, worst)
selected_features = ['radius1', 'texture1', 'perimeter1',
                     'radius2', 'texture2',
                     'radius3', 'texture3', 'perimeter3', 'area3', 'smoothness3']
X = data[selected_features]
y = data['Diagnosis']


In [63]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [65]:
# Initialize the DecisionTreeClassifier
# max_depth = 5 is a starting point to avoid overfitting
dt_model = DecisionTreeClassifier(max_depth=5, random_state=42)

# Train the model on the training data
dt_model.fit(X_train, y_train)


In [67]:
# Make predictions on the test set
y_pred = dt_model.predict(X_test)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Accuracy Score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Classification Report (precision, recall, f1-score)
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)


Confusion Matrix:
[[69  2]
 [ 2 41]]
Accuracy: 0.9649
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        71
           1       0.95      0.95      0.95        43

    accuracy                           0.96       114
   macro avg       0.96      0.96      0.96       114
weighted avg       0.96      0.96      0.96       114



In [69]:
# Calculating Train and Test Accuracy for signs of overfitting
# Train the model on the training data
dt_model.fit(X_train, y_train)

# Calculate training accuracy
train_accuracy = dt_model.score(X_train, y_train)
print(f"Training Accuracy: {train_accuracy:.4f}")

# Calculate test accuracy
test_accuracy = dt_model.score(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")


Training Accuracy: 0.9912
Test Accuracy: 0.9649


In [71]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Define the hyperparameters to tune
param_grid = {
    'max_depth': [3, 5, 10, None],  # Try different max depths
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples required to be at a leaf node
    'criterion': ['gini', 'entropy']  # Splitting criteria
}

# Initialize the Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)

# Setup GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, 
                           cv=5, n_jobs=-1, verbose=1, scoring='accuracy')

# Fit the grid search
grid_search.fit(X_train, y_train)

# Print the best parameters found by GridSearchCV
print("Best Parameters Found: ", grid_search.best_params_)

# Use the best model from GridSearchCV
best_dt_model = grid_search.best_estimator_

# Calculate training accuracy of the best model
train_accuracy_tuned = best_dt_model.score(X_train, y_train)
print(f"Training Accuracy (Tuned): {train_accuracy_tuned:.4f}")

# Calculate test accuracy of the best model
test_accuracy_tuned = best_dt_model.score(X_test, y_test)
print(f"Test Accuracy (Tuned): {test_accuracy_tuned:.4f}")

# Predict on test data
y_pred = best_dt_model.predict(X_test)

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Compute precision, recall, and F1-score
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters Found:  {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5}
Training Accuracy (Tuned): 0.9802
Test Accuracy (Tuned): 0.9649
Confusion Matrix:
 [[69  2]
 [ 2 41]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97        71
           1       0.95      0.95      0.95        43

    accuracy                           0.96       114
   macro avg       0.96      0.96      0.96       114
weighted avg       0.96      0.96      0.96       114



In [36]:
# Standardize data for KNN and Logistic Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [38]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Initialize the KNN model
# Let's start with k=5 as a baseline
knn_model = KNeighborsClassifier(n_neighbors=5)

# Train the model on the training data
knn_model.fit(X_train_scaled, y_train)

# Calculate training accuracy
train_accuracy_knn = knn_model.score(X_train_scaled, y_train)
print(f"Training Accuracy (KNN): {train_accuracy_knn:.4f}")

# Calculate test accuracy
test_accuracy_knn = knn_model.score(X_test_scaled, y_test)
print(f"Test Accuracy (KNN): {test_accuracy_knn:.4f}")

# Make predictions on the test set
y_pred_knn = knn_model.predict(X_test_scaled)

# Confusion Matrix
conf_matrix_knn = confusion_matrix(y_test, y_pred_knn)
print("Confusion Matrix (KNN):")
print(conf_matrix_knn)

# Classification Report
class_report_knn = classification_report(y_test, y_pred_knn)
print("Classification Report (KNN):")
print(class_report_knn)


Training Accuracy (KNN): 0.9758
Test Accuracy (KNN): 0.9737
Confusion Matrix (KNN):
[[70  1]
 [ 2 41]]
Classification Report (KNN):
              precision    recall  f1-score   support

           0       0.97      0.99      0.98        71
           1       0.98      0.95      0.96        43

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [40]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for KNN
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 11],  # Try different values of k
    'metric': ['euclidean', 'manhattan']  # Try different distance metrics
}

# Initialize the KNN model
knn_model = KNeighborsClassifier()

# Setup GridSearchCV
grid_search_knn = GridSearchCV(estimator=knn_model, param_grid=param_grid_knn, 
                               cv=5, n_jobs=-1, verbose=1, scoring='accuracy')

# Fit the grid search
grid_search_knn.fit(X_train_scaled, y_train)

# Print the best parameters found by GridSearchCV
print("Best Parameters Found for KNN: ", grid_search_knn.best_params_)

# Use the best model from GridSearchCV
best_knn_model = grid_search_knn.best_estimator_

# Calculate training accuracy of the best KNN model
train_accuracy_knn_tuned = best_knn_model.score(X_train_scaled, y_train)
print(f"Training Accuracy (Tuned KNN): {train_accuracy_knn_tuned:.4f}")

# Calculate test accuracy of the best KNN model
test_accuracy_knn_tuned = best_knn_model.score(X_test_scaled, y_test)
print(f"Test Accuracy (Tuned KNN): {test_accuracy_knn_tuned:.4f}")

# Make predictions with the tuned model
y_pred_knn_tuned = best_knn_model.predict(X_test_scaled)

# Confusion Matrix (Tuned)
conf_matrix_knn_tuned = confusion_matrix(y_test, y_pred_knn_tuned)
print("Confusion Matrix (Tuned KNN):")
print(conf_matrix_knn_tuned)

# Classification Report (Tuned)
class_report_knn_tuned = classification_report(y_test, y_pred_knn_tuned)
print("Classification Report (Tuned KNN):")
print(class_report_knn_tuned)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters Found for KNN:  {'metric': 'euclidean', 'n_neighbors': 5}
Training Accuracy (Tuned KNN): 0.9758
Test Accuracy (Tuned KNN): 0.9737
Confusion Matrix (Tuned KNN):
[[70  1]
 [ 2 41]]
Classification Report (Tuned KNN):
              precision    recall  f1-score   support

           0       0.97      0.99      0.98        71
           1       0.98      0.95      0.96        43

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Initialize the Logistic Regression model
# Let's start with the default solver 'liblinear' for smaller datasets
log_reg_model = LogisticRegression(random_state=42, solver='liblinear')

# Train the model on the scaled training data
log_reg_model.fit(X_train_scaled, y_train)

# Calculate training accuracy
train_accuracy_log_reg = log_reg_model.score(X_train_scaled, y_train)
print(f"Training Accuracy (Logistic Regression): {train_accuracy_log_reg:.4f}")

# Calculate test accuracy
test_accuracy_log_reg = log_reg_model.score(X_test_scaled, y_test)
print(f"Test Accuracy (Logistic Regression): {test_accuracy_log_reg:.4f}")

# Make predictions on the test set
y_pred_log_reg = log_reg_model.predict(X_test_scaled)

# Confusion Matrix
conf_matrix_log_reg = confusion_matrix(y_test, y_pred_log_reg)
print("Confusion Matrix (Logistic Regression):")
print(conf_matrix_log_reg)

# Classification Report
class_report_log_reg = classification_report(y_test, y_pred_log_reg)
print("Classification Report (Logistic Regression):")
print(class_report_log_reg)


Training Accuracy (Logistic Regression): 0.9758
Test Accuracy (Logistic Regression): 0.9737
Confusion Matrix (Logistic Regression):
[[70  1]
 [ 2 41]]
Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       0.97      0.99      0.98        71
           1       0.98      0.95      0.96        43

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [49]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Logistic Regression
param_grid_log_reg = {
    'C': [0.1, 1, 10, 100],  # Regularization strength
    'solver': ['liblinear', 'lbfgs'],  # Solvers to test
    'penalty': ['l2', 'l1'],  # Regularization type (L1 and L2)
}

# Initialize the Logistic Regression model
log_reg_model = LogisticRegression(random_state=42)

# Setup GridSearchCV
grid_search_log_reg = GridSearchCV(estimator=log_reg_model, param_grid=param_grid_log_reg, 
                                   cv=5, n_jobs=-1, verbose=1, scoring='accuracy')

# Fit the grid search
grid_search_log_reg.fit(X_train_scaled, y_train)

# Print the best parameters found by GridSearchCV
print("Best Parameters Found for Logistic Regression: ", grid_search_log_reg.best_params_)

# Use the best model from GridSearchCV
best_log_reg_model = grid_search_log_reg.best_estimator_

# Calculate training accuracy of the best Logistic Regression model
train_accuracy_log_reg_tuned = best_log_reg_model.score(X_train_scaled, y_train)
print(f"Training Accuracy (Tuned Logistic Regression): {train_accuracy_log_reg_tuned:.4f}")

# Calculate test accuracy of the best Logistic Regression model
test_accuracy_log_reg_tuned = best_log_reg_model.score(X_test_scaled, y_test)
print(f"Test Accuracy (Tuned Logistic Regression): {test_accuracy_log_reg_tuned:.4f}")

# Make predictions with the tuned model
y_pred_log_reg_tuned = best_log_reg_model.predict(X_test_scaled)

# Confusion Matrix (Tuned)
conf_matrix_log_reg_tuned = confusion_matrix(y_test, y_pred_log_reg_tuned)
print("Confusion Matrix (Tuned Logistic Regression):")
print(conf_matrix_log_reg_tuned)

# Classification Report (Tuned)
class_report_log_reg_tuned = classification_report(y_test, y_pred_log_reg_tuned)
print("Classification Report (Tuned Logistic Regression):")
print(class_report_log_reg_tuned)


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Parameters Found for Logistic Regression:  {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Training Accuracy (Tuned Logistic Regression): 0.9758
Test Accuracy (Tuned Logistic Regression): 0.9912
Confusion Matrix (Tuned Logistic Regression):
[[71  0]
 [ 1 42]]
Classification Report (Tuned Logistic Regression):
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        71
           1       1.00      0.98      0.99        43

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114



20 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\samee\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\samee\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\samee\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^