# In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report

# Part A: Data Preparation
# Loads the raw WDBC table, encodes the label, inspects correlations and
# writes stratified train/validation/test splits to CSV for the later parts.

# 1. Load the WDBC dataset and display the first five rows.
df = pd.read_csv("WisconsinDiagnosticBreastCancer.csv")
print("First five rows of the dataset:")
print(df.head())

# 2. Drop the 'id' column, replace the 'diagnosis' column values with binary (M=1, B=0), and check for missing values.
df = df.drop('id', axis=1)
# Series.map produces the integer codes directly. Series.replace on an object
# column depends on silent downcasting, which pandas >= 2.2 deprecates (it
# emits a FutureWarning). With map, any label other than 'M'/'B' becomes NaN
# and is therefore reported by the missing-value check below instead of
# silently surviving as a string.
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})
print("\nMissing values in the dataset:")
print(df.isnull().sum())

# 3. Compute and visualize the correlation matrix using a heatmap.
correlation_matrix = df.corr()
plt.figure(figsize=(12, 10))
# annot=False: cell values are not drawn on the heatmap.
sns.heatmap(correlation_matrix, annot=False, cmap="coolwarm")
plt.title("Correlation Matrix of WDBC Dataset")
plt.show()

# 4. Split the dataset into training (60%), validation (20%), and testing (20%) sets for each class label (diagnosis).
#    Save them as WDBC_Train.csv, WDBC_Validation.csv, and WDBC_Test.csv, respectively.
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']

# First hold out 40% of the rows, then halve that hold-out into validation and
# test (20% each of the full data). Both splits are stratified on the label
# and use a fixed random_state so the split is reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Re-attach the label as the last column of each split before saving.
train_df = pd.concat([X_train, y_train], axis=1)
valid_df = pd.concat([X_valid, y_valid], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

train_df.to_csv("WDBC_Train.csv", index=False)
valid_df.to_csv("WDBC_Validation.csv", index=False)
test_df.to_csv("WDBC_Test.csv", index=False)

print("\nTraining, validation, and test datasets saved as CSV files.")

# Part B: Standardization
# Fits a StandardScaler on the training features only, applies it to all three
# splits, and writes the scaled splits (label column appended last) to CSV.

# 1. Load the training data from CSV file (WDBC_Train.csv).
train_df = pd.read_csv("WDBC_Train.csv")
y_train = train_df['diagnosis']
X_train = train_df.drop(columns='diagnosis')

# 2. Calculate the mean and standard deviation of each training data attribute except the class attribute (diagnosis).
# NOTE: these two Series are kept for inspection only; the scaler below
# computes its own statistics and does not read them.
mean = X_train.mean()
std = X_train.std()

# 3. Standardize all the train data attributes except the class attribute. Save the standardized training data as WDBC_Scaled_Train.csv.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
train_scaled_df = pd.concat(
    [X_train_scaled_df, y_train.reset_index(drop=True)],
    axis="columns",
)
train_scaled_df.to_csv("WDBC_Scaled_Train.csv", index=False)

# 4. Use the mean and standard deviation of training data to perform the standardization of validation and test data.
#    Save the standardized validation and test data as WDBC_Scaled_Validation.csv and WDBC_Scaled_Test.csv, respectively.
valid_df = pd.read_csv("WDBC_Validation.csv")
test_df = pd.read_csv("WDBC_Test.csv")

y_valid = valid_df['diagnosis']
X_valid = valid_df.drop(columns='diagnosis')
y_test = test_df['diagnosis']
X_test = test_df.drop(columns='diagnosis')

# Only transform here: the scaler keeps the parameters fitted on the training split.
X_valid_scaled = scaler.transform(X_valid)
X_valid_scaled_df = pd.DataFrame(X_valid_scaled, columns=X_valid.columns)
valid_scaled_df = pd.concat(
    [X_valid_scaled_df, y_valid.reset_index(drop=True)],
    axis="columns",
)

X_test_scaled = scaler.transform(X_test)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)
test_scaled_df = pd.concat(
    [X_test_scaled_df, y_test.reset_index(drop=True)],
    axis="columns",
)

valid_scaled_df.to_csv("WDBC_Scaled_Validation.csv", index=False)
test_scaled_df.to_csv("WDBC_Scaled_Test.csv", index=False)

print("\nStandardized training, validation, and test datasets saved as CSV files.")

# Part C: Principal Component Analysis (PCA)
# Fits PCA on the unscaled training features, inspects its eigenvalues and the
# covariance of the projected data, then writes 2- and 10-component
# projections of all three splits to CSV.

# 1. Load the training data from CSV file (WDBC_Train.csv).
train_df = pd.read_csv("WDBC_Train.csv")
X_train = train_df.drop('diagnosis', axis=1)
y_train = train_df['diagnosis']

valid_df = pd.read_csv("WDBC_Validation.csv")
X_valid = valid_df.drop('diagnosis', axis=1)
y_valid = valid_df['diagnosis']

test_df = pd.read_csv("WDBC_Test.csv")
X_test = test_df.drop('diagnosis', axis=1)
y_test = test_df['diagnosis']

# 2. Perform the PCA on all attributes except the class attribute.
#    You need to fit and transform the PCA on the original training data. Then, observe the eigenvalues and corresponding eigenvectors.
pca = PCA()
pca.fit(X_train)

# (a) Plot the eigenvalues in the descending order of their values.
# explained_variance_ holds the eigenvalues of the training covariance matrix;
# scikit-learn returns the components ordered by decreasing explained variance,
# so no extra sort is needed.
eigenvalues = pca.explained_variance_
plt.figure()
plt.plot(np.arange(1, len(eigenvalues) + 1), eigenvalues, marker='o')
plt.xlabel('principal component')
plt.ylabel('eigenvalue')
plt.title('PCA: Eigenvalues in Descending Order')
plt.grid(True)
plt.show()

# Supplementary view: cumulative share of variance retained as components are added.
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.title('PCA: Cumulative Explained Variance')
plt.grid(True)
plt.show()

# (b) Compute the covariance matrix of transformed data and observe its nature and variances.
X_train_transformed = pca.transform(X_train)
# np.cov treats rows as variables, hence the transpose (components -> rows).
covariance_matrix = np.cov(X_train_transformed.T)
print("\nCovariance Matrix of Transformed Data:")
print(covariance_matrix)

# 3. Reduce to l=2 components: fit on the training split only, then project
#    validation and test with the same fitted model. Labels are appended as the
#    last column; the component columns are saved under their integer positions.
pca_2 = PCA(n_components=2)
X_train_pca_2 = pca_2.fit_transform(X_train)
X_valid_pca_2 = pca_2.transform(X_valid)
X_test_pca_2 = pca_2.transform(X_test)

train_pca_2_df = pd.concat([pd.DataFrame(X_train_pca_2), y_train.reset_index(drop=True)], axis=1)
valid_pca_2_df = pd.concat([pd.DataFrame(X_valid_pca_2), y_valid.reset_index(drop=True)], axis=1)
test_pca_2_df = pd.concat([pd.DataFrame(X_test_pca_2), y_test.reset_index(drop=True)], axis=1)

train_pca_2_df.to_csv("WDBC_PCA2_Train.csv", index=False)
valid_pca_2_df.to_csv("WDBC_PCA2_Validation.csv", index=False)
test_pca_2_df.to_csv("WDBC_PCA2_Test.csv", index=False)

# 4. Reduce to l=10 components, following the same fit-on-train / transform-others scheme.
pca_10 = PCA(n_components=10)
X_train_pca_10 = pca_10.fit_transform(X_train)
X_valid_pca_10 = pca_10.transform(X_valid)
X_test_pca_10 = pca_10.transform(X_test)

train_pca_10_df = pd.concat([pd.DataFrame(X_train_pca_10), y_train.reset_index(drop=True)], axis=1)
valid_pca_10_df = pd.concat([pd.DataFrame(X_valid_pca_10), y_valid.reset_index(drop=True)], axis=1)
test_pca_10_df = pd.concat([pd.DataFrame(X_test_pca_10), y_test.reset_index(drop=True)], axis=1)

train_pca_10_df.to_csv("WDBC_PCA10_Train.csv", index=False)
valid_pca_10_df.to_csv("WDBC_PCA10_Validation.csv", index=False)
test_pca_10_df.to_csv("WDBC_PCA10_Test.csv", index=False)

print("\nPCA transformed training, validation, and test datasets saved as CSV files.")

# Part D: K-Nearest Neighbors (KNN) classification on original data
# Trains KNN for several K on the unscaled features, picks the K with the
# highest validation accuracy, and reports that model's metrics on the test split.

# 1. Load train, validation, and test data from WDBC_Train.csv, WDBC_Validation.csv, and WDBC_Test.csv, respectively.
train_df = pd.read_csv("WDBC_Train.csv")
valid_df = pd.read_csv("WDBC_Validation.csv")
test_df = pd.read_csv("WDBC_Test.csv")

# Separate the feature columns from the 'diagnosis' label in each split.
X_train, y_train = train_df.drop(columns='diagnosis'), train_df['diagnosis']
X_valid, y_valid = valid_df.drop(columns='diagnosis'), valid_df['diagnosis']
X_test, y_test = test_df.drop(columns='diagnosis'), test_df['diagnosis']

# 2. Implement KNN classification with K=1, 7, 11.
k_values = [1, 7, 11]
knn_classifiers = {}
for k in k_values:
    # fit() returns the fitted estimator, so it can be stored directly.
    knn_classifiers[k] = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

# 3. Evaluate and compare the performance metrics on validation data.
validation_results = {}
for k in k_values:
    y_pred = knn_classifiers[k].predict(X_valid)
    scores = {
        'confusion_matrix': confusion_matrix(y_valid, y_pred),
        'accuracy': accuracy_score(y_valid, y_pred),
        'precision': precision_score(y_valid, y_pred),
        'recall': recall_score(y_valid, y_pred),
        'f1_score': f1_score(y_valid, y_pred),
    }
    validation_results[k] = scores

    print(f"\nKNN with K={k} - Validation Data:")
    print("Confusion Matrix:\n", scores['confusion_matrix'])
    print("Accuracy:", scores['accuracy'])
    print("Precision:", scores['precision'])
    print("Recall:", scores['recall'])
    print("F1-score:", scores['f1_score'])

# 4. Plot the classification accuracy vs. K for validation data and choose the best K for the KNN classifier.
accuracies = []
for k in k_values:
    accuracies.append(validation_results[k]['accuracy'])

plt.plot(k_values, accuracies, marker='o')
plt.title("KNN Accuracy vs. K Value (Validation Data)")
plt.xlabel("K Value")
plt.ylabel("Accuracy")
plt.xticks(k_values)
plt.grid(True)
plt.show()

# argmax returns the first maximum, so ties resolve to the smallest K.
best_index = int(np.argmax(accuracies))
best_k = k_values[best_index]
print(f"\nBest K for KNN classifier: {best_k}")

# 5. Now, use the KNN classifier of best K to evaluate and compare the performance metrics on the test data.
best_knn = knn_classifiers[best_k]
y_pred_test = best_knn.predict(X_test)
cm_test = confusion_matrix(y_test, y_pred_test)
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)

print(f"\nKNN with Best K={best_k} - Test Data:")
print("Confusion Matrix:\n", cm_test)
for label, value in (
    ("Accuracy:", accuracy_test),
    ("Precision:", precision_test),
    ("Recall:", recall_test),
    ("F1-score:", f1_test),
):
    print(label, value)

# Part E: KNN classification on standardized data
# Same procedure as for the original data, but reading the standardized splits:
# fit KNN for each K, select K by validation accuracy, report on the test split.

# 1. Load train, validation, and test data from WDBC_Scaled_Train.csv, WDBC_Scaled_Validation.csv, and WDBC_Scaled_Test.csv, respectively.
train_df = pd.read_csv("WDBC_Scaled_Train.csv")
valid_df = pd.read_csv("WDBC_Scaled_Validation.csv")
test_df = pd.read_csv("WDBC_Scaled_Test.csv")

# Labels first, then everything except the label as features.
y_train = train_df['diagnosis']
y_valid = valid_df['diagnosis']
y_test = test_df['diagnosis']
X_train = train_df.drop(columns='diagnosis')
X_valid = valid_df.drop(columns='diagnosis')
X_test = test_df.drop(columns='diagnosis')

# 2. Implement KNN classification with K=1, 7, 11.
k_values = [1, 7, 11]
knn_classifiers = {}
for k in k_values:
    model = KNeighborsClassifier(n_neighbors=k)
    knn_classifiers[k] = model.fit(X_train, y_train)

# 3. Evaluate and compare the performance metrics on validation data.
validation_results = {}
for k, knn in knn_classifiers.items():
    y_pred = knn.predict(X_valid)
    cm = confusion_matrix(y_valid, y_pred)
    accuracy = accuracy_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred)
    recall = recall_score(y_valid, y_pred)
    f1 = f1_score(y_valid, y_pred)

    validation_results[k] = dict(
        confusion_matrix=cm,
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1_score=f1,
    )

    print(f"\nKNN with K={k} - Validation Data:")
    print("Confusion Matrix:\n", cm)
    for label, value in (
        ("Accuracy:", accuracy),
        ("Precision:", precision),
        ("Recall:", recall),
        ("F1-score:", f1),
    ):
        print(label, value)

# 4. Plot the classification accuracy vs. K for validation data and choose the best K for the KNN classifier.
accuracies = [validation_results[k]['accuracy'] for k in k_values]
plt.plot(k_values, accuracies, marker='o')
plt.title("KNN Accuracy vs. K Value (Validation Data - Standardized Data)")
plt.xlabel("K Value")
plt.ylabel("Accuracy")
plt.xticks(k_values)
plt.grid(True)
plt.show()

# np.argmax picks the first occurrence of the maximum accuracy.
best_k = k_values[int(np.argmax(accuracies))]
print(f"\nBest K for KNN classifier (Standardized Data): {best_k}")

# 5. Now, use the KNN classifier of best K to evaluate and compare the performance metrics on the test data.
best_knn = knn_classifiers[best_k]
y_pred_test = best_knn.predict(X_test)
cm_test = confusion_matrix(y_test, y_pred_test)
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)

print(f"\nKNN with Best K={best_k} - Test Data (Standardized Data):")
print("Confusion Matrix:\n", cm_test)
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1-score:", f1_test)

# Part F: KNN classification on PCA (l=2) transformed data
# Fits KNN for each K on the 2-component projection, selects K by validation
# accuracy, and reports the selected model's metrics on the test split.

# 1. Load train, validation, and test data from WDBC_PCA2_Train.csv, WDBC_PCA2_Validation.csv, and WDBC_PCA2_Test.csv, respectively.
train_df = pd.read_csv("WDBC_PCA2_Train.csv")
valid_df = pd.read_csv("WDBC_PCA2_Validation.csv")
test_df = pd.read_csv("WDBC_PCA2_Test.csv")

# Every column except 'diagnosis' is a feature (here: the projected components).
X_train = train_df.drop(columns='diagnosis')
X_valid = valid_df.drop(columns='diagnosis')
X_test = test_df.drop(columns='diagnosis')
y_train = train_df['diagnosis']
y_valid = valid_df['diagnosis']
y_test = test_df['diagnosis']

# 2. Implement KNN classification with K=1, 7, 11.
k_values = [1, 7, 11]
knn_classifiers = {}
for k in k_values:
    knn_classifiers[k] = KNeighborsClassifier(n_neighbors=k)
    knn_classifiers[k].fit(X_train, y_train)

# 3. Evaluate and compare the performance metrics on validation data.
validation_results = {}
for k in k_values:
    knn = knn_classifiers[k]
    y_pred = knn.predict(X_valid)
    cm = confusion_matrix(y_valid, y_pred)
    accuracy = accuracy_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred)
    recall = recall_score(y_valid, y_pred)
    f1 = f1_score(y_valid, y_pred)

    result_keys = ('confusion_matrix', 'accuracy', 'precision', 'recall', 'f1_score')
    result_values = (cm, accuracy, precision, recall, f1)
    validation_results[k] = dict(zip(result_keys, result_values))

    print(f"\nKNN with K={k} - Validation Data (PCA l=2):")
    print("Confusion Matrix:\n", cm)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)

# 4. Plot the classification accuracy vs. K for validation data and choose the best K for the KNN classifier.
accuracies = [validation_results[k]['accuracy'] for k in k_values]
plt.plot(k_values, accuracies, marker='o')
plt.xticks(k_values)
plt.xlabel("K Value")
plt.ylabel("Accuracy")
plt.title("KNN Accuracy vs. K Value (Validation Data - PCA l=2)")
plt.grid(True)
plt.show()

# Index of the first highest validation accuracy selects the K to keep.
best_position = int(np.argmax(accuracies))
best_k = k_values[best_position]
print(f"\nBest K for KNN classifier (PCA l=2): {best_k}")

# 5. Now, use the KNN classifier of best K to evaluate and compare the performance metrics on the test data.
best_knn = knn_classifiers[best_k]
y_pred_test = best_knn.predict(X_test)
cm_test = confusion_matrix(y_test, y_pred_test)
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)

print(f"\nKNN with Best K={best_k} - Test Data (PCA l=2):")
print("Confusion Matrix:\n", cm_test)
for label, value in (
    ("Accuracy:", accuracy_test),
    ("Precision:", precision_test),
    ("Recall:", recall_test),
    ("F1-score:", f1_test),
):
    print(label, value)

# Part G: KNN classification on PCA (l=10) transformed data
# Fits KNN for each K on the 10-component projection, selects K by validation
# accuracy, and reports the selected model's metrics on the test split.

# 1. Load train, validation, and test data from WDBC_PCA10_Train.csv, WDBC_PCA10_Validation.csv, and WDBC_PCA10_Test.csv, respectively.
train_df = pd.read_csv("WDBC_PCA10_Train.csv")
valid_df = pd.read_csv("WDBC_PCA10_Validation.csv")
test_df = pd.read_csv("WDBC_PCA10_Test.csv")

# Every column except 'diagnosis' is a feature (here: the projected components).
X_train = train_df.drop('diagnosis', axis=1)
y_train = train_df['diagnosis']
X_valid = valid_df.drop('diagnosis', axis=1)
y_valid = valid_df['diagnosis']
X_test = test_df.drop('diagnosis', axis=1)
y_test = test_df['diagnosis']

# 2. Implement KNN classification with K=1, 7, 11.
k_values = [1, 7, 11]
knn_classifiers = {}
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    knn_classifiers[k] = knn

# 3. Evaluate and compare the performance metrics on validation data.
validation_results = {}
for k, knn in knn_classifiers.items():
    y_pred = knn.predict(X_valid)
    cm = confusion_matrix(y_valid, y_pred)
    accuracy = accuracy_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred)
    recall = recall_score(y_valid, y_pred)
    f1 = f1_score(y_valid, y_pred)

    validation_results[k] = {
        'confusion_matrix': cm,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    }

    print(f"\nKNN with K={k} - Validation Data (PCA l=10):")
    print("Confusion Matrix:\n", cm)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)

# 4. Plot the classification accuracy vs. K for validation data and choose the best K for the KNN classifier.
accuracies = [validation_results[k]['accuracy'] for k in k_values]
plt.plot(k_values, accuracies, marker='o')
plt.xlabel("K Value")
plt.ylabel("Accuracy")
plt.title("KNN Accuracy vs. K Value (Validation Data - PCA l=10)")
plt.xticks(k_values)
plt.grid(True)
plt.show()

# np.argmax returns the first maximum, so ties resolve to the smallest K.
best_k = k_values[np.argmax(accuracies)]
print(f"\nBest K for KNN classifier (PCA l=10): {best_k}")

# 5. Now, use the KNN classifier of best K to evaluate and compare the performance metrics on the test data.
best_knn = knn_classifiers[best_k]
y_pred_test = best_knn.predict(X_test)
cm_test = confusion_matrix(y_test, y_pred_test)
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)

print(f"\nKNN with Best K={best_k} - Test Data (PCA l=10):")
print("Confusion Matrix:\n", cm_test)
print("Accuracy:", accuracy_test)
# Precision, recall and F1 are computed above; report them as well so this
# test summary lists the same five metrics as the validation report.
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1-score:", f1_test)


# Output: ModuleNotFoundError: No module named 'pandas'