In [1]:
##############################################################################
## EE559 Final Project ===> Mushroom Classification.
## Created by Sudesh Kumar Santhosh Kumar and Thejesh Chandar Rao.
## Date: 6th May, 2023
## Tested in Python 3.10.9 using conda environment version 22.9.0.
##############################################################################

# Final Project -> Classification of Mushrooms (EE - 559)

## Feature Transformation.

# Importing all necessary libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from utils.Transform import standardizeData
from utils.Transform import transformTrainData_PCA
from utils.Transform import transformTrainData_LDA, transformTrainData_IterLDA

## Loading all the datasets from the FeatureSelected directory, which holds the features selected with the Pearson correlation coefficient technique.

In [3]:
# Read the Pearson-selected training split (noted by the authors as containing 131 features).
train_data_pearson_v1 = pd.read_csv(
    "./dataset/FeatureSelected/Statistical/Pearson/mushroom_train_v1.csv"
)

# Target labels: the 'class' column on its own.
y_train = train_data_pearson_v1['class']

# Feature matrix: every column except the 'class' label.
X_train_pearson_v1 = train_data_pearson_v1.drop(columns='class')


## Perform Transformation of Selected Features to lower dimensions using PCA and LDA.

### Standardizing the Input Features.

### Performing Principal Component Analysis and reducing the number of features before training the models.

In [4]:
# Run the PCA helper on the Pearson-selected features once per target
# dimensionality (15, 30, 45, 60 and 90 components, in that order) and bind
# each result to its own name so the following cells can refer to it directly.
(
    X_train_pearson_v1_PCA_15,
    X_train_pearson_v1_PCA_30,
    X_train_pearson_v1_PCA_45,
    X_train_pearson_v1_PCA_60,
    X_train_pearson_v1_PCA_90,
) = [
    transformTrainData_PCA(X_train_pearson_v1, components=n_components)
    for n_components in (15, 30, 45, 60, 90)
]

## Combining each PCA-transformed X_train array with the y_train Series into a DataFrame and saving it as a dataset for PCA.

In [5]:
# Build and save one labelled dataset per PCA configuration.
#
# Every configuration goes through the same three steps: append the class
# labels as a trailing column, wrap the result in a DataFrame whose columns are
# "feature 1" .. "feature N" plus "class", and write it to a CSV file named
# after the component count. The column names are generated from the component
# count instead of being typed out by hand for each configuration.

# Component count -> PCA-transformed feature matrix (insertion order is the
# processing order).
pca_feature_sets = {
    15: X_train_pearson_v1_PCA_15,
    30: X_train_pearson_v1_PCA_30,
    45: X_train_pearson_v1_PCA_45,
    60: X_train_pearson_v1_PCA_60,
    90: X_train_pearson_v1_PCA_90,
}

# The labels as a single column; identical for every configuration, so it is
# computed once outside the loop.
labels_column = y_train.values.reshape(-1, 1)

# Component count -> stacked (features + class) array.
pca_datasets = {}

for n_features, X_pca in pca_feature_sets.items():
    col_names = [f"feature {i}" for i in range(1, n_features + 1)]
    col_names.append('class')

    # Stack the classes column-wise next to the features.
    pca_datasets[n_features] = np.hstack((X_pca, labels_column))

    # pd.DataFrame raises if the array does not have exactly
    # n_features + 1 columns, so a mismatch with the PCA output is caught here.
    df = pd.DataFrame(pca_datasets[n_features], columns=col_names)

    # Save the DataFrame to a CSV file (without the index column).
    df.to_csv(
        f"./dataset/FeatureTransformed/PCA/mushroom_train_pearson_v1_PCA_{n_features}.csv",
        index=False,
    )

# Keep the per-configuration names available for any code that refers to them.
train_data_pearson_v1_PCA_15 = pca_datasets[15]
train_data_pearson_v1_PCA_30 = pca_datasets[30]
train_data_pearson_v1_PCA_45 = pca_datasets[45]
train_data_pearson_v1_PCA_60 = pca_datasets[60]
train_data_pearson_v1_PCA_90 = pca_datasets[90]

### Performing Linear Discriminant Analysis and reducing the number of features before training the models.

In [6]:
# Run the project's LDA helper on the Pearson-selected features and their labels.
X_train_pearson_v1_LDA = transformTrainData_LDA(X_train_pearson_v1, y_train)

# Append the class labels as a trailing column next to the transformed features.
train_data_pearson_v1_LDA = np.hstack(
    (X_train_pearson_v1_LDA, y_train.values.reshape(-1, 1))
)

# Two column names are supplied, so the stacked array must have exactly two
# columns: one transformed feature plus the class label.
df = pd.DataFrame(
    train_data_pearson_v1_LDA,
    columns=['feature 1', 'class'],
)

# Write the labelled dataset to disk without the index column.
df.to_csv(
    f"./dataset/FeatureTransformed/LDA/mushroom_train_pearson_v1_LDA.csv",
    index=False,
)