## Data Preprocessing: Converting Categorical Data to Numerical

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Read the dataset from the CSV file that sits next to the notebook.
df = pd.read_csv('Prakriti_With_Features.csv')

# --- Target (y) ---
# 'Dosha' is the class label. LDA needs it as integer codes, so run it through
# a LabelEncoder; the fitted encoder is kept so the codes can be mapped back
# to the original label strings via label_encoder.inverse_transform(...).
y_categorical = df['Dosha']
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_categorical)

# --- Features (X) ---
# Every column other than the label is treated as a predictor.
X = df.drop(columns='Dosha')

# Expand the categorical predictors into indicator columns. drop_first=True
# omits the first level of each categorical column, so k levels become k - 1
# indicators.
X_numerical = pd.get_dummies(X, drop_first=True)

# Put every column on zero mean / unit variance; PCA and LDA are both
# sensitive to feature scale. The fitted scaler is kept for reuse.
scaler = StandardScaler()
scaler.fit(X_numerical)
X_std = scaler.transform(X_numerical)

# Report the shape of the matrix the later sections work with.
print(f"Original number of features after one-hot encoding: {X_std.shape[1]}")
print(f"Number of samples: {X_std.shape[0]}")

## 1. Reducing Features Using Principal Components (PCA)

In [None]:
from sklearn.decomposition import PCA

# A float n_components between 0 and 1 is a variance target rather than a
# count: PCA keeps as many leading components as are needed to retain 99% of
# the variance. whiten=True additionally rescales each retained component to
# unit variance, which can help some downstream algorithms.
pca = PCA(
    whiten=True,
    n_components=0.99,
)

# Learn the projection from the standardized features and apply it in one step.
X_pca = pca.fit_transform(X_std)

# Compare the dimensionality before and after the projection.
print('Original number of features:', X_std.shape[1])
print('Reduced number of features:', X_pca.shape[1])

## 2. Reducing Features When Data Is Linearly Inseparable (Kernel PCA)

In [None]:
from sklearn.decomposition import KernelPCA

# Apply Kernel PCA with an RBF kernel, keeping only the leading component.
#
# gamma=None lets scikit-learn use its default of 1 / n_features. That choice
# matters here: the RBF kernel is exp(-gamma * ||x - x'||^2), and on a
# standardized matrix with many one-hot columns the squared distance between
# two distinct rows is on the order of 2 * n_features. A large fixed gamma
# (e.g. 15) would push every off-diagonal kernel entry to ~0, leaving a
# near-identity kernel matrix whose "leading" component is arbitrary.
#
# random_state pins the result across Restart & Run All: scikit-learn may pick
# the ARPACK eigensolver automatically, and that solver starts from a random
# vector.
kpca = KernelPCA(kernel="rbf", gamma=None, n_components=1, random_state=42)
X_kpca = kpca.fit_transform(X_std)

# Compare the dimensionality before and after the projection.
print('Original number of features:', X_std.shape[1])
print('Reduced number of features:', X_kpca.shape[1])

## 3. Reducing Features by Maximizing Class Separability (LDA)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# LDA is supervised: it looks for directions that best separate the classes,
# and can produce at most n_classes - 1 of them. Here we ask for just one.
lda = LinearDiscriminantAnalysis(n_components=1)

# Fit on the standardized features together with the encoded labels, and
# project the same samples onto the discriminant axis in a single call.
X_lda = lda.fit_transform(X_std, y)

# Compare the dimensionality before and after the projection.
print('Original number of features:', X_std.shape[1])
print('Reduced number of features:', X_lda.shape[1])

# Report explained_variance_ratio_ for the retained discriminant component.
print('\nVariance explained by the new feature:', lda.explained_variance_ratio_)