In [None]:
##############################################################################
## EE559 Final Project ===> Mushroom Classification.
## Created by Sudesh Kumar Santhosh Kumar and Thejesh Chandar Rao.
## Date: 6th May, 2023
## Tested in Python 3.10.9 using conda environment version 22.9.0.
##############################################################################

# Final Project -> Classification of Mushrooms (EE - 559)

## Feature Engineering and Data Cleansing

# Importing all necessary libraries

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Loading the dataset v1 from Encoding Directory which has the Encoded Features. (Every feature is numerical)

In [None]:
# Read version 1 of the encoded training set from the Encoded directory.
train_csv_path = "./dataset/Encoded/mushroom_train_v1.csv"
train_data = pd.read_csv(train_csv_path)

print("Train Data Encoded-> 1: ")
# Bare last expression so the notebook renders the first rows as a table.
train_data.head()


## Separating Input Features and class labels from the dataset. Also Encoding the class labels from "p" -> 1 and "e" -> 0

In [None]:
# Pull the raw label column out first, then keep every other column as features.
y_train = train_data['class']
X_train = train_data.drop(columns=['class'])

# Integer-encode the labels: "e" -> 0 and "p" -> 1.
# NOTE(review): Series.map turns any value missing from the dict into NaN —
# confirm the 'class' column only ever contains "e" and "p".
class_map = {"e": 0, "p": 1}
y_train_enc = y_train.map(class_map)

In [None]:
# Bucket the feature columns by dtype: "object" columns count as categorical,
# "float64" columns as numerical. Columns of any other dtype land in neither list.
feature_dtypes = X_train.dtypes
categoricalFeatures = [name for name, kind in feature_dtypes.items() if kind == "object"]
numericalFeatures = [name for name, kind in feature_dtypes.items() if kind == "float64"]

# Distinct raw labels, the index of each label's first occurrence, and how often each occurs.
classes, class_index, class_count = np.unique(
    y_train,
    return_index=True,
    return_counts=True,
)

### Checking the Dataset for Categorical Features

In [None]:
# Report which columns fell into each dtype bucket, followed by the bucket sizes.
for summary_line in (
    f"Categorical Features: {categoricalFeatures}",
    f"Numerical Features: {numericalFeatures}",
    f"Total number of Categorical Features: {len(categoricalFeatures)}",
    f"Total number of Numerical Features: {len(numericalFeatures)}",
):
    print(summary_line)


### Computing the Pearson correlation coefficient between each feature and y_train, and plotting the absolute values of the coefficients

In [None]:
# Pearson correlation of every feature column with the integer-encoded label.
correlations = X_train.corrwith(y_train_enc, method='pearson')

# Rank by strength only: take magnitudes, then order from strongest to weakest.
abs_correlations = correlations.abs()
correlations_sorted = abs_correlations.sort_values(ascending=False)

# Horizontal bar chart with one bar per feature; tall figure so the labels stay readable.
plt.figure(figsize=(10, 20))
ax = sns.barplot(
    x=correlations_sorted.values,
    y=correlations_sorted.index,
    palette='viridis',
)
ax.set(
    xlabel='Pearson Correlation Co-efficient',
    ylabel='Feature',
    title='Correlation Co-efficients with Output Label',
)
plt.show()


### Selecting the top features whose absolute Pearson coefficient exceeds a threshold, for each value in a list of thresholds

In [None]:
thresholds = [0.100, 0.125, 0.150, 0.175, 0.200]

# Output folder for the Pearson-selected training sets. It is created up front
# because DataFrame.to_csv does not create missing parent directories and would
# otherwise raise OSError on a fresh checkout.
pearson_output_dir = Path("./dataset/FeatureSelected/Pearson")
pearson_output_dir.mkdir(parents=True, exist_ok=True)

# One feature subset, and one saved CSV version (1-based), per threshold.
for (i, threshold) in enumerate(thresholds):
    # Select the features whose absolute correlation coefficient is greater than the threshold
    top_features = correlations_sorted[correlations_sorted > threshold].index.tolist()

    X_train_top_pearson = X_train[top_features]

    # Re-attach the encoded labels so features and labels are saved together.
    train_data_top_pearson = pd.concat([X_train_top_pearson, y_train_enc], axis=1)

    print(f"Pearson Co-efficient Technique version: {i+1} & Threshold value: {threshold}")
    print(f"Shape of Training Data after performing Feature Selection using Correlation: {train_data_top_pearson.shape}")
    print(f"Number of Features before performing Feature Selection using Correlation: {train_data.shape[1] - 1} features")
    print(f"Number of Features after performing Feature Selection using Correlation: {train_data_top_pearson.shape[1] - 1} features")
    print()

    # Save this threshold's selection as version i+1 (mushroom_train_v{i+1}.csv)
    # so it can be used for Feature Transformation.
    train_data_top_pearson.to_csv(pearson_output_dir / f"mushroom_train_v{i+1}.csv", index=False)




## Selecting the top D' of the D = 147 features by training a simple linear model on each feature individually.

In [None]:
# Score each feature in isolation: fit a one-feature logistic regression and
# record its accuracy on the same training data it was fitted on.
feature_performances_linear = []

for feature in X_train.columns:
    # Double brackets keep the selection a single-column DataFrame (2-D input).
    single_feature_frame = X_train[[feature]]

    # A fresh estimator per feature, so nothing fitted carries over between iterations.
    model = LogisticRegression(random_state=42).fit(single_feature_frame, y_train)

    # Training-set predictions and the resulting accuracy for this feature alone.
    y_pred = model.predict(single_feature_frame)
    accuracy = accuracy_score(y_train, y_pred)

    feature_performances_linear.append((feature, accuracy))

# Order the (feature, accuracy) pairs from best to worst accuracy; ties keep column order.
feature_performances_linear.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
d_primes_linear = [12, 8, 4, 2]

# Output folder for the model-ranked training sets. It is created up front
# because DataFrame.to_csv does not create missing parent directories and would
# otherwise raise OSError on a fresh checkout.
# NOTE(review): the folder is called "LinearRegression" although the ranking model
# is LogisticRegression; the path is kept as-is so existing consumers still find the files.
linear_output_dir = Path("./dataset/FeatureSelected/LinearRegression")
linear_output_dir.mkdir(parents=True, exist_ok=True)

# One feature subset, and one saved CSV version (1-based), per d_prime value.
for i, d_prime in enumerate(d_primes_linear):
    # Take the d_prime best-ranked features (the list is sorted by accuracy, best first).
    top_features_linear = [x[0] for x in feature_performances_linear[:d_prime]]

    X_train_top_linear = X_train[top_features_linear]

    # Re-attach the encoded labels so features and labels are saved together.
    train_data_top_linear = pd.concat([X_train_top_linear, y_train_enc], axis=1)

    print(f"Linear Model Technique version: {i+1} & d_prime: {d_prime}")
    print(f"Shape of Training Data after performing Feature Selection using Linear Model: {train_data_top_linear.shape}")
    print(f"Number of Features before performing Feature Selection using Linear Model: {train_data.shape[1] - 1} features")
    print(f"Number of Features after performing Feature Selection using Linear Model: {train_data_top_linear.shape[1] - 1} features")
    print()

    # Save this d_prime's selection as version i+1 (mushroom_train_v{i+1}.csv)
    # so it can be used for Feature Transformation.
    train_data_top_linear.to_csv(linear_output_dir / f"mushroom_train_v{i+1}.csv", index=False)

## 