**Importing necessary libraies**

In [37]:
# Necessary Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from scipy.stats import zscore

**Loading the dataset**

In [40]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
columns = ["ID", "Diagnosis"] + [f"feature_{i}" for i in range(1, 31)]
data = pd.read_csv(url, header=None, names=columns)

**Dropping unecessary columns**

In [42]:
data = data.drop("ID", axis=1)

**Encodes the "Diagnosis" column into numerical labels**

In [43]:
label_encoder = LabelEncoder()
data["Diagnosis"] = label_encoder.fit_transform(data["Diagnosis"])

**Feature Scaling Using StandardScaler**

In [45]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Assuming 'data' is your DataFrame
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Apply scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data[numerical_columns])

# Create a new DataFrame with scaled features
data_scaled = pd.DataFrame(scaled_features, columns=numerical_columns)

# Optionally, add the target column (assuming it's "Diagnosis")
data_scaled["Diagnosis"] = data["Diagnosis"]

# View the scaled data
print(data_scaled.head())


   feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0   1.097064  -2.073335   1.269934   0.984375   1.568466   3.283515   
1   1.829821  -0.353632   1.685955   1.908708  -0.826962  -0.487072   
2   1.579888   0.456187   1.566503   1.558884   0.942210   1.052926   
3  -0.768909   0.253732  -0.592687  -0.764464   3.283553   3.402909   
4   1.750297  -1.151816   1.776573   1.826229   0.280372   0.539340   

   feature_7  feature_8  feature_9  feature_10  ...  feature_22  feature_23  \
0   2.652874   2.532475   2.217515    2.255747  ...   -1.359293    2.303601   
1  -0.023846   0.548144   0.001392   -0.868652  ...   -0.369203    1.535126   
2   1.363478   2.037231   0.939685   -0.398008  ...   -0.023974    1.347475   
3   1.915897   1.451707   2.867383    4.910919  ...    0.133984   -0.249939   
4   1.371011   1.428493  -0.009560   -0.562450  ...   -1.466770    1.338539   

   feature_24  feature_25  feature_26  feature_27  feature_28  feature_29  \
0    2.001237    1.30

**Outlier Detection for Individual Features using Z-scores (Z > 3) with counts and percentages calculated for each feature.**

In [47]:
# Outlier Detection for Individual Features
from scipy.stats import zscore

# Calculate Z-scores directly on the DataFrame's numerical columns
z_scores = data_scaled[numerical_columns].apply(zscore)

# Initialize dictionaries to store outlier count and percentage for each feature
outlier_counts = {}
outlier_percentages = {}

# Loop through each feature to calculate outliers individually
for feature in numerical_columns:
    feature_outliers = np.sum(z_scores[feature] > 3)  # Count of outliers in this feature
    outlier_percentage = (feature_outliers / data_scaled.shape[0]) * 100  # Percentage of outliers

    # Store in dictionaries
    outlier_counts[feature] = feature_outliers
    outlier_percentages[feature] = outlier_percentage

    print(f"Feature: {feature}")
    print(f"Outlier Count: {feature_outliers}")
    print(f"Outlier Percentage: {outlier_percentage:.2f}%\n")

# Print summaries for all features
print("Outlier Counts by Feature:", outlier_counts)

Feature: feature_1
Outlier Count: 5
Outlier Percentage: 0.88%

Feature: feature_2
Outlier Count: 4
Outlier Percentage: 0.70%

Feature: feature_3
Outlier Count: 7
Outlier Percentage: 1.23%

Feature: feature_4
Outlier Count: 8
Outlier Percentage: 1.41%

Feature: feature_5
Outlier Count: 4
Outlier Percentage: 0.70%

Feature: feature_6
Outlier Count: 9
Outlier Percentage: 1.58%

Feature: feature_7
Outlier Count: 9
Outlier Percentage: 1.58%

Feature: feature_8
Outlier Count: 6
Outlier Percentage: 1.05%

Feature: feature_9
Outlier Count: 5
Outlier Percentage: 0.88%

Feature: feature_10
Outlier Count: 7
Outlier Percentage: 1.23%

Feature: feature_11
Outlier Count: 7
Outlier Percentage: 1.23%

Feature: feature_12
Outlier Count: 9
Outlier Percentage: 1.58%

Feature: feature_13
Outlier Count: 8
Outlier Percentage: 1.41%

Feature: feature_14
Outlier Count: 6
Outlier Percentage: 1.05%

Feature: feature_15
Outlier Count: 7
Outlier Percentage: 1.23%

Feature: feature_16
Outlier Count: 12
Outlier Per

**Outlier Detection using Z-scores (Z > 3) and removal of outliers from the dataset.**

In [48]:

# Calculate Z-scores for numerical columns
z_scores = np.abs(zscore(data_scaled[numerical_columns]))

# Identify rows with any Z-score greater than 3 (outliers)
outliers = (z_scores > 3).any(axis=1)

# Filter out rows with outliers
data_no_outliers = data_scaled[~outliers]

# Print the number of outliers and the shape of the datasets
print(f"Number of outliers detected: {np.sum(outliers)}")
print(f"Shape of original data: {data_scaled.shape}")
print(f"Shape of data after removing outliers: {data_no_outliers.shape}")

# Optionally, display the first few rows of the cleaned data
print(data_no_outliers.head())


Number of outliers detected: 74
Shape of original data: (569, 31)
Shape of data after removing outliers: (495, 31)
   feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
1   1.829821  -0.353632   1.685955   1.908708  -0.826962  -0.487072   
2   1.579888   0.456187   1.566503   1.558884   0.942210   1.052926   
4   1.750297  -1.151816   1.776573   1.826229   0.280372   0.539340   
5  -0.476375  -0.835335  -0.387148  -0.505650   2.237421   1.244335   
6   1.170908   0.160649   1.138125   1.095295  -0.123136   0.088295   

   feature_7  feature_8  feature_9  feature_10  ...  feature_22  feature_23  \
1  -0.023846   0.548144   0.001392   -0.868652  ...   -0.369203    1.535126   
2   1.363478   2.037231   0.939685   -0.398008  ...   -0.023974    1.347475   
4   1.371011   1.428493  -0.009560   -0.562450  ...   -1.466770    1.338539   
5   0.866302   0.824656   1.005402    1.890005  ...   -0.313836   -0.115009   
6   0.300072   0.646935  -0.064325   -0.762332  ...    0.322883

**Initialize and Train AdaBoost Classifier**

In [62]:
# Initialize AdaBoost Classifier with SAMME algorithm
ada_model = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=42, algorithm='SAMME')

# Train the model
ada_model.fit(X_train, y_train)


**Make Predictions**

In [68]:
# Predict on the test set
y_pred = ada_model.predict(X_test)

**Calculate Evaluation Metrics:**

In [71]:
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)


**Print the evaluation Results**

In [73]:
# Print the evaluation metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix:
[[61  0]
 [ 0 38]]
