<a href="https://colab.research.google.com/github/Sparkashok/Machine-Learning/blob/main/ML_FeatureSelection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

# Load the Iris dataset, then pull out the sample matrix and the feature labels
iris = load_iris()
data, feature_names = iris.data, iris.feature_names

# 1. Compute variance without using inbuilt function
def compute_variance(data, ddof=1):
    """Return the per-column variance of *data*, computed from first principles.

    The variance is the sum of squared deviations from the column mean,
    divided by ``n - ddof`` where ``n`` is the number of rows.

    Args:
        data: Array-like of samples (rows) by features (columns). Lists and
            nested sequences are accepted and converted with ``np.asarray``.
        ddof: Delta degrees of freedom subtracted from ``n`` in the divisor.
            The default of 1 gives the unbiased sample variance; 0 gives the
            population variance.

    Returns:
        The variance along axis 0 (one value per column for 2-D input).
        When ``n - ddof`` is zero the division yields NaN/inf instead of
        raising, as with any NumPy division by zero.
    """
    # Coerce up front so plain Python sequences expose `.shape` and broadcast.
    samples = np.asarray(data)
    deviations = samples - np.mean(samples, axis=0)
    squared_sum = np.sum(deviations ** 2, axis=0)
    return squared_sum / (samples.shape[0] - ddof)

feature_variance = compute_variance(data)
# argsort sorts ascending; reversing it ranks features from largest variance down.
sorted_variance_indices = np.argsort(feature_variance)[::-1]
print("Feature Variance in Descending Order:")
for idx in sorted_variance_indices:
    label, value = feature_names[idx], feature_variance[idx]
    print(f"{label}: {value:.5f}")

# 2. Compute correlation matrix without using inbuilt function
def compute_correlation_matrix(data):
    """Return the Pearson correlation matrix of the columns of *data*.

    Computed from first principles (no ``np.corrcoef``): the sample
    covariance of every column pair divided by the product of the two
    columns' sample standard deviations.

    Args:
        data: 2-D array-like of samples (rows) by features (columns). Lists
            and nested sequences are accepted and converted with
            ``np.asarray``.

    Returns:
        A square ``(num_features, num_features)`` array whose ``[i, j]``
        entry is the correlation between columns ``i`` and ``j``. A column
        with zero variance produces NaN entries (division by zero) rather
        than an exception.
    """
    samples = np.asarray(data)
    num_samples = samples.shape[0]

    # Centre each column once; the means do not depend on the column pair.
    centered = samples - np.mean(samples, axis=0)

    # One matrix product yields every pairwise sum of cross-deviations, so the
    # symmetric entries are not recomputed separately.
    covariance = centered.T @ centered / (num_samples - 1)

    # The diagonal of the covariance matrix holds each column's variance.
    std = np.sqrt(np.diag(covariance))
    return covariance / np.outer(std, std)

correlation_matrix = compute_correlation_matrix(data)
# Heading and matrix go out in one call; the newline separator keeps them on separate lines.
print("\nCorrelation Matrix:", correlation_matrix, sep="\n")

# 2a. Find the highest correlation pair (excluding diagonal)
num_features = data.shape[1]
# Start below every possible value: -1 is itself a valid correlation, so using
# it as the sentinel would leave highest_pair unset when the best pair is -1.
highest_corr = -np.inf
highest_pair = None
# Scan only entries above the diagonal (j > i): this visits each feature pair
# once and skips every feature's correlation with itself.
# The comparison is on the signed value, so this finds the largest positive
# correlation rather than the largest magnitude.
for i in range(num_features):
    for j in range(i + 1, num_features):
        candidate = correlation_matrix[i, j]
        if candidate > highest_corr:
            highest_corr = candidate
            highest_pair = (feature_names[i], feature_names[j])

print(f"\nHighest Correlation Pair: {highest_pair} with value {highest_corr:.2f}")

# 2b. Compute and sort the average correlation of features
# Row-wise mean of the absolute correlations; each row's mean includes its own
# diagonal entry (the feature paired with itself).
average_correlation = np.abs(correlation_matrix).mean(axis=1)
# Reverse the ascending argsort to list the most-correlated features first.
sorted_avg_corr_indices = np.argsort(average_correlation)[::-1]
print("\nAverage Correlation of Features in Descending Order:")
for idx in sorted_avg_corr_indices:
    label, value = feature_names[idx], average_correlation[idx]
    print(f"{label}: {value:.5f}")


Feature Variance in Descending Order:
petal length (cm): 3.11628
sepal length (cm): 0.68569
petal width (cm): 0.58101
sepal width (cm): 0.18998

Correlation Matrix:
[[ 1.         -0.11756978  0.87175378  0.81794113]
 [-0.11756978  1.         -0.4284401  -0.36612593]
 [ 0.87175378 -0.4284401   1.          0.96286543]
 [ 0.81794113 -0.36612593  0.96286543  1.        ]]

Highest Correlation Pair: ('petal length (cm)', 'petal width (cm)') with value 0.96

Average Correlation of Features in Descending Order:
petal length (cm): 0.81576
petal width (cm): 0.78673
sepal length (cm): 0.70182
sepal width (cm): 0.47803
