In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
import seaborn as sns
import matplotlib.pyplot as plt

# ------------------------------
# 1. Load Dataset
# ------------------------------
# Fetch the bundled breast-cancer dataset and assemble the feature columns
# plus the class label ('target') into a single DataFrame.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

print("Dataset shape:", df.shape)
df.head()


Dataset shape: (569, 31)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [None]:
# Count NaN cells across the entire frame (all rows, all columns)
total_missing = df.isna().sum().sum()
print("Missing values:\n", total_missing)

# Discard every row holding at least one NaN (leaves the data unchanged
# when the count printed above is 0)
df = df.dropna(axis=0, how="any")


Missing values:
 0


In [None]:
# Separate the label column from the feature matrix
y = df['target']
X = df.drop(columns=['target'])


In [None]:
# 1. Min-Max Scaling (Manual)
# Formula: X_scaled = (X - X_min) / (X_max - X_min)
# -------------------------------

def min_max_scaling(df):
    """
    Rescale every column of ``df`` to the [0, 1] range.

    Formula (per column): X_scaled = (X - X_min) / (X_max - X_min)

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature table. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``. A constant
        column (X_max == X_min) is mapped to 0.0 rather than NaN.
    """
    col_min = df.min()
    col_range = df.max() - col_min
    # A zero range would turn the whole column into 0/0 = NaN. Dividing by 1
    # instead leaves the (already zero) numerator as 0.0.
    safe_range = col_range.where(col_range != 0, 1)
    return (df - col_min) / safe_range

# Scale the feature matrix and preview the first rows of the result
X_minmax_manual = min_max_scaling(X)

print("After Min-Max Scaling:", X_minmax_manual.head(), sep="\n")


# -------------------------------
# 2. Standardization (Manual)
# Formula: X_std = (X - mean) / std
# -------------------------------

def standardize(df, ddof=1):
    """
    Standardize every column of ``df`` to zero mean and unit standard deviation.

    Formula (per column): X_std = (X - mean) / std

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature table. The input frame is not modified.
    ddof : int, default 1
        Delta degrees of freedom used for the standard deviation. The default
        (1) is the sample standard deviation, which is what pandas' ``.std()``
        computes; pass 0 for the population standard deviation.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``. A constant
        column is mapped to (approximately) 0.0 rather than NaN.
    """
    col_mean = df.mean()
    col_std = df.std(ddof=ddof)
    # A constant column has zero spread, so dividing by its std would give
    # NaN (or +/-1 noise when rounding leaves a tiny non-zero std). Detect it
    # through max == min and divide by 1 so the centred values stay ~0.
    is_constant = (df.max() == df.min()) | (col_std == 0)
    safe_std = col_std.mask(is_constant, 1)
    return (df - col_mean) / safe_std

# Standardize the feature matrix and preview the first rows of the result
X_standardized_manual = standardize(X)

print("\nAfter Standardization:", X_standardized_manual.head(), sep="\n")

# -------------------------------
# Verify results
# -------------------------------
# Collapse each frame to scalar summaries first, then report them together.
original_min, original_max = X.min().min(), X.max().max()
scaled_min = X_minmax_manual.min().min()
scaled_max = X_minmax_manual.max().max()
mean_of_means = round(X_standardized_manual.mean().mean(), 2)
mean_of_stds = round(X_standardized_manual.std().mean(), 2)

print("\nCheck scaling results:")
print("Original Min:", original_min, "Original Max:", original_max)
print("Min-Max scaled range:", scaled_min, "to", scaled_max)
print("Standardized mean (approx):", mean_of_means)
print("Standardized std (approx):", mean_of_stds)

After Min-Max Scaling:
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0     0.521037      0.022658        0.545989   0.363733         0.593753   
1     0.643144      0.272574        0.615783   0.501591         0.289880   
2     0.601496      0.390260        0.595743   0.449417         0.514309   
3     0.210090      0.360839        0.233501   0.102906         0.811321   
4     0.629893      0.156578        0.630986   0.489290         0.430351   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0          0.792037        0.703140             0.731113       0.686364   
1          0.181768        0.203608             0.348757       0.379798   
2          0.431017        0.462512             0.635686       0.509596   
3          0.811361        0.565604             0.522863       0.776263   
4          0.347893        0.463918             0.518390       0.378283   

   mean fractal dimension  ...  worst radius  worst texture  worst pe

In [None]:
# Step 1: Standardize the data manually

X_std = standardize(X)

# -------------------------------
# Step 2: Compute Covariance Matrix
# -------------------------------
# Cov(X) = (1 / (n-1)) * X^T * X   (for column-centred X)
# rowvar=False tells NumPy that the columns, not the rows, are the variables.
cov_matrix = np.cov(X_std.to_numpy(), rowvar=False)

print("Covariance Matrix shape:", cov_matrix.shape)

# -------------------------------
# Step 3: Compute Eigenvalues and Eigenvectors
# -------------------------------
# A covariance matrix is symmetric, so use eigh rather than eig: it is
# designed for symmetric/Hermitian input and always returns real eigenvalues,
# whereas eig can return complex values with round-off imaginary parts.
# Eigenvector signs are arbitrary, so individual projected columns may be
# sign-flipped relative to another solver; the variances are unaffected.
eigen_values, eigen_vectors = np.linalg.eigh(cov_matrix)

# -------------------------------
# Step 4: Sort Eigenvalues in Descending Order
# -------------------------------
# eigh returns eigenvalues in ascending order; PCA wants the largest first.
sorted_indices = np.argsort(eigen_values)[::-1]
eigen_values = eigen_values[sorted_indices]
eigen_vectors = eigen_vectors[:, sorted_indices]

# -------------------------------
# Step 5: Choose top k components (say k=5)
# -------------------------------
k = 5
eigen_vectors_k = eigen_vectors[:, :k]

# -------------------------------
# Step 6: Project data onto these k components
# -------------------------------
X_pca_manual = X_std.to_numpy() @ eigen_vectors_k

print("\nPCA reduced data shape:", X_pca_manual.shape)

# -------------------------------
# Step 7: Explained Variance Ratio
# -------------------------------
# Each eigenvalue is the variance along its component; dividing by the total
# gives the fraction of variance that component explains.
explained_variance_ratio = eigen_values / np.sum(eigen_values)
print(f"\nExplained Variance Ratio (Top {k} components):")
print(explained_variance_ratio[:k])

# -------------------------------
# Step 8: Verify
# -------------------------------
# Report against k (not a hard-coded 5) so the summary stays correct when k changes.
print(f"\nTotal Variance Captured (first {k} components):",
      round(np.sum(explained_variance_ratio[:k]) * 100, 2), "%")

Covariance Matrix shape: (30, 30)

PCA reduced data shape: (569, 5)

Explained Variance Ratio (Top 5 components):
[0.44272026 0.18971182 0.09393163 0.06602135 0.05495768]

Total Variance Captured (first 5 components): 84.73 %


In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer

# -------------------------------------------
# Load dataset
# -------------------------------------------
# Reload the dataset; the returned Bunch supports key access as well as
# attribute access, so the fields are read by key here.
data = load_breast_cancer()
X = pd.DataFrame(data["data"], columns=data["feature_names"])
y = pd.Series(data["target"])

# --------------------------------------------------
# 1. Function: Correlation-Based Feature Filtering
# --------------------------------------------------
def correlation_filter(df, threshold=0.9):
    """
    Remove features whose absolute Pearson correlation with an earlier
    feature exceeds ``threshold``.

    For every pair of correlated columns the one appearing later in ``df`` is
    dropped; the earlier one is kept. Non-numeric columns cannot be
    correlated, so they are ignored by the filter and always retained.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table. The input frame is not modified.
    threshold : float, default 0.9
        Columns with an absolute correlation strictly greater than this value
        (against any earlier column) are dropped.

    Returns
    -------
    filtered_df : pandas.DataFrame
        Copy of ``df`` without the dropped columns.
    to_drop : list
        Names of the dropped columns, in their original column order.
    """
    # Correlate only numeric/bool columns: DataFrame.corr() raises on string
    # or other object columns in recent pandas versions.
    numeric = df.select_dtypes(include=["number", "bool"])
    corr_matrix = numeric.corr().abs()

    # Keep only the strict upper triangle so each pair is examined once and
    # the diagonal (self-correlation of 1.0) is never considered.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # NaN entries (masked cells, constant columns) compare as False.
    to_drop = upper.columns[(upper > threshold).any(axis=0)].tolist()
    filtered_df = df.drop(columns=to_drop)
    print(f"Removed {len(to_drop)} correlated features (threshold={threshold})")
    return filtered_df, to_drop


# --------------------------------------------------
# 2. Function: Chi-Square Feature Selection (Manual)
# --------------------------------------------------
def chi_square_selection(X, y, k=10, bins=10):
    """
    Manual chi-square feature selection on binned continuous features.

    Each feature is discretised into equal-width bins, a contingency table of
    bin vs. class label is built, and the chi-square statistic
        chi2 = sum((O - E)^2 / E)
    is computed, where O is the observed count of a (bin, class) cell and E is
    the count expected if bin and class were independent.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature table. The caller's frame is not modified.
    y : array-like
        Class labels, one per row of ``X``. Rows are paired with labels by
        position, not by index label.
    k : int, default 10
        Number of top-scoring features to return.
    bins : int, default 10
        Number of equal-width bins used to discretise each feature.

    Returns
    -------
    selected_features : pandas.Index
        Names of the ``k`` highest-scoring features, best first.
    chi_scores : numpy.ndarray
        Chi-square score of every column of ``X``, in column order.

    Raises
    ------
    ValueError
        If ``y`` does not have exactly one label per row of ``X``.
    """
    # Pair rows with labels positionally. Passing two Series to pd.crosstab
    # would align them on index labels and silently mismatch (or drop) rows
    # whenever X and y carry different indexes.
    y_values = np.asarray(y)
    if len(y_values) != len(X):
        raise ValueError(
            f"X has {len(X)} rows but y has {len(y_values)} labels"
        )

    # Rescale every feature to [0, 1] before binning. A constant column has a
    # zero range; divide by 1 there so it becomes all zeros instead of NaN.
    col_min = X.min()
    col_range = X.max() - col_min
    X = (X - col_min) / col_range.where(col_range != 0, 1)

    chi_scores = []

    # Compute chi-square for each feature
    for col in X.columns:
        # Convert continuous feature into discrete, equal-width bins
        binned = pd.cut(X[col], bins=bins, labels=False)

        # Observed frequencies: contingency table of feature bin vs. class
        contingency_table = pd.crosstab(binned.to_numpy(), y_values)
        observed = contingency_table.to_numpy(dtype=float)

        if observed.size == 0:
            # No usable (bin, label) pairs, e.g. an all-NaN feature
            chi_scores.append(0.0)
            continue

        # Expected frequencies under independence: row_total * col_total / N.
        # crosstab only emits bins and classes that actually occur, so every
        # marginal total (and hence every expected count) is strictly positive.
        expected = np.outer(observed.sum(axis=1), observed.sum(axis=0)) / observed.sum()

        # Calculate chi-square statistic
        chi_scores.append(float(np.sum((observed - expected) ** 2 / expected)))

    # Rank features by chi-square score, highest first
    chi_scores = np.array(chi_scores)
    top_indices = np.argsort(chi_scores)[::-1][:k]
    selected_features = X.columns[top_indices]

    print(f"Top {k} features selected by Chi-Square Test:")
    print(selected_features.to_list())

    return selected_features, chi_scores


# --------------------------------------------------
# Apply Manual Correlation Filter
# --------------------------------------------------
# Drop one feature from every highly correlated pair and report which went
X_corr_filtered, dropped = correlation_filter(df=X, threshold=0.9)
print("\nDropped Correlated Features:\n", dropped)

# --------------------------------------------------
# Apply Manual Chi-Square Selection
# --------------------------------------------------
# Rank the surviving features against the class labels and keep the best ten
selected_features, chi_scores = chi_square_selection(X=X_corr_filtered, y=y, k=10)


Removed 10 correlated features (threshold=0.9)

Dropped Correlated Features:
 ['mean perimeter', 'mean area', 'mean concave points', 'perimeter error', 'area error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst concave points']
Top 10 features selected by Chi-Square Test:
['mean radius', 'mean concavity', 'worst concavity', 'radius error', 'mean compactness', 'worst compactness', 'mean texture', 'concave points error', 'worst smoothness', 'worst symmetry']
