# **Feature Engineering**

In [1]:
# Importing necessary libraries:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, chi2

In [2]:
# Reading the training dataset into a pandas dataframe:
data_train = pd.read_csv('mushroom_train.csv')
# Calculating the correlation of the numerical features:
corr_matrix = data_train.corr()
print("THe correlation between the numerical columns of the features-\n", corr_matrix)

THe correlation between the numerical columns of the features-
               cap-diameter  stem-height  stem-width
cap-diameter      1.000000     0.423171    0.695804
stem-height       0.423171     1.000000    0.436069
stem-width        0.695804     0.436069    1.000000


  corr_matrix = data_train.corr()


In [3]:
# Separating the categorical data:
categorical_col = ['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-color', 'has-ring', 'ring-type', 'habitat', 'season','class']
categorical_data_train = data_train[categorical_col]
encoded_data_train = pd.get_dummies(categorical_data_train, prefix = categorical_col, prefix_sep = '_') # One-hot encoding.

**Standardizing the numerical features**

In [4]:
numerical_col_train = data_train[['cap-diameter', 'stem-height', 'stem-width']] # Separating the numerical columns.
scaler = StandardScaler()
numerical_col_scaled_train = scaler.fit_transform(numerical_col_train)
numerical_data_train = pd.DataFrame(numerical_col_scaled_train, columns = numerical_col_train.columns)

In [5]:
# Concatenating the columns (standardized numerical columns and the one-hot encoded categorical data) into one dataframe:
final_data_train = pd.concat([numerical_data_train, encoded_data_train], axis = 1)
df_temp = pd.DataFrame({'zeros': [0] * 42748})
df_temp.loc[final_data_train.iloc[:, -2] == 1, 'zeros'] = 0
df_temp.loc[final_data_train.iloc[:, -1] == 1, 'zeros'] = 1
final_data_train = final_data_train.drop(final_data_train.columns[-2:], axis = 1)
final_data_train_watch = pd.concat([final_data_train, df_temp], axis = 1)
final_data_train_watch = final_data_train_watch.rename(columns = {'<old_column_name>': 'label'})
new_data_train_watch = final_data_train_watch.reset_index()

# **Pearson's correlation**

In [6]:
# Separating the numerical columns:
cols_of_interest = ['stem-width', 'stem-height', 'cap-diameter']
corr_matrix = final_data_train.corr(method = 'pearson') # Computing the correlation matrix.
corr_filtered = corr_matrix[(corr_matrix.index.isin(cols_of_interest)) | (corr_matrix.columns.isin(cols_of_interest))] # Filtering out the correlation matrix to only include with 'stem-width', 'stem-height', or 'cap-diameter' columns.
# Creating a copy of the filtered correlation matrix to sort and to remove self-correlations:
corr_sorted = corr_filtered.abs().unstack().sort_values(ascending = False).reset_index()
corr_sorted = corr_sorted[corr_sorted['level_0'] != corr_sorted['level_1']]
corr_sorted.columns = ['Feature-1', 'Feature-2', 'Correlation coefficient']
corr_sorted_filtered = corr_sorted[(corr_sorted['Feature-1'].isin(cols_of_interest)) | (corr_sorted['Feature-2'].isin(cols_of_interest))]

# Printing the top 10 positively and top 10 negatively correlated features:
print("Top 10 positive correlations:\n", corr_sorted_filtered.head(10))
print("\nTop 10 negative correlations:\n", corr_sorted_filtered.tail(10)[::-1])

Top 10 positive correlations:
             Feature-1     Feature-2  Correlation coefficient
3        cap-diameter    stem-width                 0.695804
4          stem-width  cap-diameter                 0.695804
5         stem-height    stem-width                 0.436069
6          stem-width   stem-height                 0.436069
7         stem-height  cap-diameter                 0.423171
8        cap-diameter   stem-height                 0.423171
9         ring-type_m   stem-height                 0.406376
10  gill-attachment_p    stem-width                 0.400621
11  gill-attachment_p  cap-diameter                 0.353896
12        ring-type_f   stem-height                 0.332793

Top 10 negative correlations:
           Feature-1     Feature-2  Correlation coefficient
275    gill-color_o    stem-width                 0.000822
274     cap-color_e  cap-diameter                 0.000841
273    stem-color_u   stem-height                 0.000941
272     ring-type_l    stem-wi

In [7]:
new_data_train_watch = pd.concat([new_data_train_watch, df_temp], axis = 1)
# Reset the index of the DataFrame and creating a new column 'index':
new_data_train_watch.reset_index(inplace = True)
new_data_train_watch.rename(columns = {'index': 'index_column'}, inplace = True)

**Adding new features**

In [8]:
selected_rows_shape_m = new_data_train_watch[new_data_train_watch['ring-type_m'] == 1] # Selecting the rows where 'ring-type_m' is 1.
selected_data_m = pd.DataFrame(selected_rows_shape_m, columns = new_data_train_watch.columns) # Create a new dataframe with the selected rows.
selected_data_m = new_data_train_watch.loc[new_data_train_watch['ring-type_m'] == 1].copy()
# Computing the minimum, maximum and average of the corresponding numerical columns:
stem_width_min = selected_data_m['stem-width'].min()
stem_width_max = selected_data_m['stem-width'].max()
stem_width_mean = selected_data_m['stem-width'].mean()
selected_data_m['stem_width_ring_type_m_min'] = stem_width_min
selected_data_m['stem_width_ring_type_m_max'] = stem_width_max
selected_data_m['stem_width_ring_type_m_mean'] = stem_width_mean

selected_data_m = selected_data_m.rename(columns = {'zeros': 'label1'})
selected_data_m = selected_data_m.loc[:, ~selected_data_m.columns.duplicated()] # Deleting the duplicate <'label'> column.
selected_data_m = selected_data_m[['index_column', 'stem_width_ring_type_m_min', 'stem_width_ring_type_m_max', 'stem_width_ring_type_m_mean', 'label1']]

In [9]:
selected_rows_shape_p = new_data_train_watch[new_data_train_watch['gill-attachment_p'] == 1] 
selected_data = pd.DataFrame(selected_rows_shape_p, columns = new_data_train_watch.columns)
selected_data_p = new_data_train_watch.loc[new_data_train_watch['gill-attachment_p'] == 1].copy()

stem_width_min = selected_data_p['stem-width'].min()
stem_width_max = selected_data_p['stem-width'].max()
stem_width_mean = selected_data_p['stem-width'].mean()
selected_data_p['stem_width_gill_attachment_p_min'] = stem_width_min
selected_data_p['stem_width_gill_attachment_p_max'] = stem_width_max
selected_data_p['stem_width_gill_attachment_p_mean'] = stem_width_mean

selected_data_p = selected_data_p.rename(columns = {'zeros': 'label2'})
selected_data_p = selected_data_p.loc[:, ~selected_data_p.columns.duplicated()]
selected_data_p = selected_data_p[['index_column', 'stem_width_gill_attachment_p_min', 'stem_width_gill_attachment_p_max', 'stem_width_gill_attachment_p_mean', 'label2']]

In [10]:
selected_rows_shape_f = new_data_train_watch[new_data_train_watch['ring-type_f'] == 1]
selected_data = pd.DataFrame(selected_rows_shape_f, columns = new_data_train_watch.columns)
selected_data_f = new_data_train_watch.loc[new_data_train_watch['ring-type_f'] == 1].copy()

stem_height_min = selected_data_f['stem-height'].min()
stem_height_max = selected_data_f['stem-height'].max()
stem_height_mean = selected_data_f['stem-height'].mean()
selected_data_f['stem_height_ring_type_f_min'] = stem_height_min
selected_data_f['stem_height_ring_type_f_max'] = stem_height_max
selected_data_f['stem_height_ring_type_f_mean'] = stem_height_mean

selected_data_f = selected_data_f.rename(columns = {'zeros': 'label3'})
selected_data_f = selected_data_f.loc[:, ~selected_data_f.columns.duplicated()]
selected_data_f = selected_data_f[['index_column', 'stem_height_ring_type_f_min', 'stem_height_ring_type_f_max', 'stem_height_ring_type_f_mean', 'label3']]

In [11]:
selected_rows_shape_o = new_data_train_watch[new_data_train_watch['gill-color_o'] == 1]
selected_data = pd.DataFrame(selected_rows_shape_o, columns = new_data_train_watch.columns)
selected_data_o = new_data_train_watch.loc[new_data_train_watch['gill-color_o'] == 1].copy()

stem_width_min = selected_data_o['stem-width'].min()
stem_width_max = selected_data_o['stem-width'].max()
stem_width_mean = selected_data_o['stem-width'].mean()
selected_data_o['stem_width_gill_color_o_min'] = stem_width_min
selected_data_o['stem_width_gill_color_o_max'] = stem_width_max
selected_data_o['stem_width_gill_color_o_mean'] = stem_width_mean

selected_data_o = selected_data_o.rename(columns = {'zeros': 'label4'})
selected_data_o = selected_data_o.loc[:, ~selected_data_o.columns.duplicated()]
selected_data_o = selected_data_o[['index_column', 'stem_width_gill_color_o_min', 'stem_width_gill_color_o_max', 'stem_width_gill_color_o_mean', 'label4']]

In [12]:
selected_rows_shape_e = new_data_train_watch[new_data_train_watch['cap-color_e'] == 1]
selected_data = pd.DataFrame(selected_rows_shape_e, columns = new_data_train_watch.columns)
selected_data_e = new_data_train_watch.loc[new_data_train_watch['cap-color_e'] == 1].copy()

cap_diameter_min = selected_data_e['cap-diameter'].min()
cap_diameter_max = selected_data_e['cap-diameter'].max()
cap_diameter_mean = selected_data_e['cap-diameter'].mean()
selected_data_e['cap_diameter_cap_color_e_min'] = cap_diameter_min
selected_data_e['cap_diameter_cap_color_e_max'] = cap_diameter_max
selected_data_e['cap_diameter_cap_color_e_mean'] = cap_diameter_mean

selected_data_e = selected_data_e.rename(columns = {'zeros': 'label5'})
selected_data_e = selected_data_e.loc[:, ~selected_data_e.columns.duplicated()]
selected_data_e = selected_data_e[['index_column', 'cap_diameter_cap_color_e_min', 'cap_diameter_cap_color_e_max', 'cap_diameter_cap_color_e_mean', 'label5']]

In [13]:
selected_rows_shape_p_only = new_data_train_watch[new_data_train_watch['cap-shape_p'] == 1]
selected_data = pd.DataFrame(selected_rows_shape_p_only, columns = new_data_train_watch.columns)
selected_data_p_only = new_data_train_watch.loc[new_data_train_watch['cap-shape_p'] == 1].copy()

cap_diameter_min = selected_data_p_only['cap-diameter'].min()
cap_diameter_max = selected_data_p_only['cap-diameter'].max()
cap_diameter_mean = selected_data_p_only['cap-diameter'].mean()
selected_data_p_only['cap_diameter_p_min'] = cap_diameter_min
selected_data_p_only['cap_diameter_p_max'] = cap_diameter_max
selected_data_p_only['cap_diameter_p_mean'] = cap_diameter_mean

selected_data_p_only = selected_data_p_only.rename(columns = {'zeros': 'label6'})
selected_data_p_only = selected_data_p_only.loc[:, ~selected_data_p_only.columns.duplicated()]
selected_data_p_only = selected_data_p_only[['index_column', 'cap_diameter_p_min', 'cap_diameter_p_max', 'cap_diameter_p_mean', 'label6']]

In [14]:
dataframes = [new_data_train_watch, selected_data_m, selected_data_p, selected_data_f, selected_data_o, selected_data_e, selected_data_p_only]
concatenated_df = pd.concat(dataframes, axis = 1, join = 'outer') # Concatenating all the created dataframes based on a common index.
concatenated_df['common_index'] = concatenated_df.index
concatenated_df.fillna(0, inplace = True) # Replacing any Not-a-Number values with zeroes.
concatenated_df.set_index('common_index', inplace = True)
concatenated_df = concatenated_df.drop('index_column', axis = 1)
concatenated_df = concatenated_df.drop('level_0', axis = 1) # Dropping the common index column.
print("DataFrame with the all augmented features:\n", concatenated_df.columns)

DataFrame with the all augmented features:
 Index(['cap-diameter', 'stem-height', 'stem-width', 'cap-shape_b',
       'cap-shape_c', 'cap-shape_f', 'cap-shape_o', 'cap-shape_p',
       'cap-shape_s', 'cap-shape_x',
       ...
       'stem_width_gill_color_o_mean', 'label4',
       'cap_diameter_cap_color_e_min', 'cap_diameter_cap_color_e_max',
       'cap_diameter_cap_color_e_mean', 'label5', 'cap_diameter_p_min',
       'cap_diameter_p_max', 'cap_diameter_p_mean', 'label6'],
      dtype='object', length=118)


In [15]:
# Defining a function to check if any of the label columns has '1':
def has_label(row):
    label_cols = ['label1', 'label2', 'label3', 'label4', 'label5', 'label6']
    
    for col in label_cols:
        
        if row[col] == 1:
            return 1
    
    return 0

In [16]:
concatenated_df['label'] = concatenated_df.apply(has_label, axis = 1)
concatenated_df = concatenated_df.drop(['label1', 'label2', 'label3', 'label4','label5','label6'], axis = 1)
concatenated_df = concatenated_df.drop(['zeros'], axis = 1)
concatenated_df_train = concatenated_df # Final training dataframe.
label_column_train = concatenated_df_train.iloc[:, -1] # Labels of the final training dataframe. 

# **Principal Component Analysis**

In [17]:
pca = PCA(n_components = 5) # Defining the number of features to pass to the PCA function.
pcs = pca.fit_transform(concatenated_df_train)
top_feature_names = []

for comps in range(pca.n_components_):
    component_loadings = abs(pca.components_[comps])
    sorted_indices = component_loadings.argsort()[::-1]
    top_feature_indices = sorted_indices[:pca.n_components]
    top_feature_names.append(concatenated_df_train.columns[top_feature_indices].tolist())

all_top_feature_names = [name for sublist in top_feature_names for name in sublist] # Concatenating the lists of top feature names into a single list.
feature_counts = pd.Series(all_top_feature_names).value_counts().sort_values(ascending = False) # Counting the frequency of each feature and sorting in descending order.
top_features = feature_counts.index[:pca.n_components].tolist() # Extracting the most frequent features.
print("The optimal features determined by PCA-\n", top_features)

The optimal features determined by PCA-
 ['cap-diameter', 'stem-height', 'stem_height_ring_type_f_max', 'stem_width_gill_attachment_p_max', 'stem-width']


In [18]:
# Accessing the indices of the columns that correspond to the top features and creating a new dataframe of only top features:
top_feature_indices = [concatenated_df_train.columns.get_loc(feature) for feature in top_features[:pca.n_components]]
top_features_data_train = concatenated_df_train.iloc[:, top_feature_indices]
print(top_features_data_train)

              cap-diameter  stem-height  stem_height_ring_type_f_max  \
common_index                                                           
0                -0.332216    -0.161276                      5.35784   
1                -0.742182    -0.274093                      0.00000   
2                 0.905345     0.132642                      0.00000   
3                 0.393845    -0.636294                      5.35784   
4                 0.160126     0.542345                      5.35784   
...                    ...          ...                          ...   
42743            -0.657890    -0.481913                      0.00000   
42744             0.420665    -0.585823                      5.35784   
42745             7.495450    -0.247373                      5.35784   
42746             0.803811     0.287022                      0.00000   
42747            -0.824558    -0.909429                      5.35784   

              stem_width_gill_attachment_p_max  stem-width  
co

**Standardizing the testing dataset**

In [19]:
data_test = pd.read_csv('mushroom_test.csv')
categorical_col = ['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-color', 'has-ring', 'ring-type', 'habitat', 'season','class']
categorical_data_test = data_test[categorical_col]
encoded_data_test = pd.get_dummies(categorical_data_test, prefix = categorical_col, prefix_sep = '_') # One-hot encoding the categorical values.
numerical_col_test = data_test[['cap-diameter', 'stem-height', 'stem-width']]
scaler = StandardScaler()
numerical_col_scaled_test = scaler.fit_transform(numerical_col_test)
numerical_data_test = pd.DataFrame(numerical_col_scaled_test, columns = numerical_col_test.columns)
final_data_test = pd.concat([numerical_data_test, encoded_data_test], axis = 1)

In [20]:
df_temp = pd.DataFrame({'zeros': [0] * (final_data_test.shape[0])})
df_temp.loc[final_data_test.iloc[:,-2] == 1, 'zeros'] = 0
df_temp.loc[final_data_test.iloc[:,-1] == 1, 'zeros'] = 1
final_data_test = final_data_test.drop(final_data_test.columns[-2:], axis = 1)
final_data_test_watch = pd.concat([final_data_test, df_temp], axis = 1)
final_data_test_watch = final_data_test_watch.rename(columns={'<old_column_name>': 'label'})
new_data_test_watch= final_data_test.reset_index()
new_data_test_watch.rename(columns = {'index': 'index_column'}, inplace = True)
new_data_test_watch = pd.concat([new_data_test_watch, df_temp], axis = 1)
new_data_test_watch.reset_index(inplace = True)
new_data_test_watch.rename(columns = {'index': 'index_column'}, inplace = True)

**Adding new features to the testing dataset**

In [21]:
selected_rows_shape_m_test = new_data_test_watch[new_data_test_watch['ring-type_m'] == 1]
selected_data_m_test = pd.DataFrame(selected_rows_shape_m_test, columns = new_data_test_watch.columns)
selected_data_m_test = new_data_test_watch.loc[new_data_test_watch['ring-type_m'] == 1].copy()

stem_width_min_test = selected_data_m_test['stem-width'].min()
stem_width_max_test = selected_data_m_test['stem-width'].max()
stem_width_mean_test = selected_data_m_test['stem-width'].mean()
selected_data_m_test['stem_width_ring_type_m_min'] = stem_width_min_test
selected_data_m_test['stem_width_ring_type_m_max'] = stem_width_max_test
selected_data_m_test['stem_width_ring_type_m_mean'] = stem_width_mean_test

selected_data_m_test = selected_data_m_test.rename(columns = {'zeros': 'label1'})
selected_data_m_test = selected_data_m_test.loc[:, ~selected_data_m_test.columns.duplicated()]
selected_data_m_test = selected_data_m_test[['index_column', 'stem_width_ring_type_m_min', 'stem_width_ring_type_m_max', 'stem_width_ring_type_m_mean', 'label1']]

In [22]:
selected_rows_shape_p_test = new_data_test_watch[new_data_test_watch['gill-attachment_p'] == 1]
selected_data_test = pd.DataFrame(selected_rows_shape_p_test, columns = new_data_test_watch.columns)
selected_data_p_test = new_data_test_watch.loc[new_data_test_watch['gill-attachment_p'] == 1].copy()

stem_width_min_test = selected_data_p_test['stem-width'].min()
stem_width_max_test = selected_data_p_test['stem-width'].max()
stem_width_mean_test = selected_data_p_test['stem-width'].mean()
selected_data_p_test['stem_width_gill_attachment_p_min'] = stem_width_min
selected_data_p_test['stem_width_gill_attachment_p_max'] = stem_width_max
selected_data_p_test['stem_width_gill_attachment_p_mean'] = stem_width_mean

selected_data_p_test = selected_data_p_test.rename(columns = {'zeros': 'label2'})
selected_data_p_test = selected_data_p_test.loc[:, ~selected_data_p_test.columns.duplicated()]
selected_data_p_test = selected_data_p_test[['index_column', 'stem_width_gill_attachment_p_min', 'stem_width_gill_attachment_p_max', 'stem_width_gill_attachment_p_mean', 'label2']]

In [23]:
selected_rows_shape_f_test = new_data_test_watch[new_data_test_watch['ring-type_f'] == 1]
selected_data_test = pd.DataFrame(selected_rows_shape_f_test, columns = new_data_test_watch.columns)
selected_data_f_test = new_data_test_watch.loc[new_data_test_watch['ring-type_f'] == 1].copy()

stem_height_min_test = selected_data_f_test['stem-height'].min()
stem_height_max_test = selected_data_f_test['stem-height'].max()
stem_height_mean_test = selected_data_f_test['stem-height'].mean()
selected_data_f_test['stem_height_ring_type_f_min'] = stem_height_min_test
selected_data_f_test['stem_height_ring_type_f_max'] = stem_height_max_test
selected_data_f_test['stem_height_ring_type_f_mean'] = stem_height_mean_test

selected_data_f_test = selected_data_f_test.rename(columns = {'zeros': 'label3'})
selected_data_f_test = selected_data_f_test.loc[:, ~selected_data_f_test.columns.duplicated()]
selected_data_f_test = selected_data_f_test[['index_column', 'stem_height_ring_type_f_min', 'stem_height_ring_type_f_max', 'stem_height_ring_type_f_mean', 'label3']]

In [24]:
selected_rows_shape_o_test = new_data_test_watch[new_data_test_watch['gill-color_o'] == 1]
selected_data_test = pd.DataFrame(selected_rows_shape_o_test, columns = new_data_test_watch.columns)
selected_data_o_test = new_data_test_watch.loc[new_data_test_watch['gill-color_o'] == 1].copy()

stem_width_min_test = selected_data_o_test['stem-width'].min()
stem_width_max_test = selected_data_o_test['stem-width'].max()
stem_width_mean_test = selected_data_o_test['stem-width'].mean()
selected_data_o_test['stem_width_gill_color_o_min'] = stem_width_min_test
selected_data_o_test['stem_width_gill_color_o_max'] = stem_width_max_test
selected_data_o_test['stem_width_gill_color_o_mean'] = stem_width_mean_test

selected_data_o_test = selected_data_o_test.rename(columns = {'zeros': 'label4'})
selected_data_o_test = selected_data_o_test.loc[:, ~selected_data_o_test.columns.duplicated()]
selected_data_o_test = selected_data_o_test[['index_column', 'stem_width_gill_color_o_min', 'stem_width_gill_color_o_max', 'stem_width_gill_color_o_mean', 'label4']]

In [25]:
selected_rows_shape_e_test = new_data_test_watch[new_data_test_watch['cap-color_e'] == 1]
selected_data_test = pd.DataFrame(selected_rows_shape_e_test, columns = new_data_test_watch.columns)
selected_data_e_test = new_data_test_watch.loc[new_data_test_watch['cap-color_e'] == 1].copy()

cap_diameter_min_test = selected_data_e_test['cap-diameter'].min()
cap_diameter_max_test = selected_data_e_test['cap-diameter'].max()
cap_diameter_mean_test = selected_data_e_test['cap-diameter'].mean()
selected_data_e_test['cap_diameter_cap_color_e_min'] = cap_diameter_min_test
selected_data_e_test['cap_diameter_cap_color_e_max'] = cap_diameter_max_test
selected_data_e_test['cap_diameter_cap_color_e_mean'] = cap_diameter_mean_test

selected_data_e_test = selected_data_e_test.rename(columns = {'zeros': 'label5'})
selected_data_e_test = selected_data_e_test.loc[:, ~selected_data_e_test.columns.duplicated()]
selected_data_e_test = selected_data_e_test[['index_column', 'cap_diameter_cap_color_e_min', 'cap_diameter_cap_color_e_max', 'cap_diameter_cap_color_e_mean', 'label5']]

In [26]:
selected_rows_shape_p_only_test = new_data_test_watch[new_data_test_watch['cap-shape_p'] == 1]
selected_data_test = pd.DataFrame(selected_rows_shape_p_only_test, columns = new_data_test_watch.columns)
selected_data_p_only_test = new_data_test_watch.loc[new_data_test_watch['cap-shape_p'] == 1].copy()

cap_diameter_min_test = selected_data_p_only_test['cap-diameter'].min()
cap_diameter_max_test = selected_data_p_only_test['cap-diameter'].max()
cap_diameter_mean_test = selected_data_p_only_test['cap-diameter'].mean()
selected_data_p_only_test['cap_diameter_p_min'] = cap_diameter_min_test
selected_data_p_only_test['cap_diameter_p_max'] = cap_diameter_max_test
selected_data_p_only_test['cap_diameter_p_mean'] = cap_diameter_mean_test

selected_data_p_only_test = selected_data_p_only_test.rename(columns = {'zeros': 'label6'})
selected_data_p_only_test = selected_data_p_only_test.loc[:, ~selected_data_p_only_test.columns.duplicated()]
selected_data_p_only_test = selected_data_p_only_test[['index_column', 'cap_diameter_p_min', 'cap_diameter_p_max', 'cap_diameter_p_mean', 'label6']]

In [27]:
dataframes_test = [new_data_test_watch, selected_data_m_test, selected_data_p_test, selected_data_f_test, selected_data_o_test, selected_data_e_test, selected_data_p_only_test]
concatenated_df_test = pd.concat(dataframes_test, axis = 1, join = 'outer')
concatenated_df_test = concatenated_df_test.drop('index_column', axis = 1)

In [28]:
concatenated_df_test['common_index'] = concatenated_df_test.index
concatenated_df_test.fillna(0, inplace = True)
concatenated_df_test.set_index('common_index', inplace = True)

def has_label(row):
    label_cols = ['label1', 'label2', 'label3', 'label4', 'label5', 'label6']
    
    for col in label_cols:
        
        if row[col] == 1:
            return 1
    return 0

concatenated_df_test['label'] = concatenated_df_test.apply(has_label, axis = 1)
concatenated_df_test = concatenated_df_test.drop(['label1', 'label2', 'label3', 'label4','label5','label6'], axis = 1)
concatenated_df_test = concatenated_df_test.drop(['zeros'], axis = 1)
last_column_test = concatenated_df_test.iloc[:, -1]
last_column_test = concatenated_df_test.iloc[:, -1].reset_index(drop = True)

In [29]:
# Extracting the indices of the columns in top PCA features training dataset to append to the testing dataset:
top_feature_indices_test = [concatenated_df_test.columns.get_loc(feature) for feature in top_features[:pca.n_components]]
top_features_data_test = concatenated_df_test.iloc[:, top_feature_indices_test]
print("The final testing dataset-\n", top_features_data_test)

The final testing dataset-
               cap-diameter  stem-height  stem_height_ring_type_f_max  \
common_index                                                           
0                 1.335793     0.323775                     5.756592   
1                 2.051180    -0.055604                     5.756592   
2                -0.983626    -0.378668                     5.756592   
3                 1.592885    -0.420163                     5.756592   
4                 1.710253     5.036366                     0.000000   
...                    ...          ...                          ...   
18316             0.089454     0.409728                     0.000000   
18317             2.710678     3.231355                     5.756592   
18318            -0.769383     0.566814                     5.756592   
18319             1.484832     0.540139                     5.756592   
18320            -0.588673    -0.239365                     5.756592   

              stem_width_gill_attac

In [30]:
# Defining the X & y training components: 
X_train = top_features_data_train
true_labels = label_column_train
# Defining the X & y testing components:
X_test = top_features_data_test
y_test = pd.DataFrame(last_column_test)
y_test = y_test['label'].values

**Logistic Regression on PCA reduced dataset**

In [31]:
# Performing Logistic Regression on the final dataframes:
logreg = LogisticRegression(max_iter = 1000)
logreg.fit(X_train, true_labels)
# Predicting the class labels of the testing dataset:
predicted_labels = logreg.predict(X_test)
accuracy = accuracy_score(predicted_labels, y_test) # Calculating the accuracy.
print(f"The accuracy of logistic regression = {accuracy * 100}%")
print(classification_report(y_test, predicted_labels))

The accuracy of logistic regression = 60.95191310517984%
              precision    recall  f1-score   support

           0       0.74      0.38      0.50      9448
           1       0.56      0.86      0.68      8873

    accuracy                           0.61     18321
   macro avg       0.65      0.62      0.59     18321
weighted avg       0.65      0.61      0.59     18321



**Random Forest Classifier on PCA reduced dataset**

In [32]:
rf = RandomForestClassifier()
rf.fit(X_train, true_labels)
predicted_labels_rf = rf.predict(X_test)
accuracy_rf = accuracy_score(predicted_labels_rf, y_test)
print(f"The accuracy of random forest classifier = {accuracy_rf * 100}%")
print(classification_report(y_test, predicted_labels_rf))

The accuracy of random forest classifier = 79.66268216800393%
              precision    recall  f1-score   support

           0       0.81      0.79      0.80      9448
           1       0.78      0.81      0.79      8873

    accuracy                           0.80     18321
   macro avg       0.80      0.80      0.80     18321
weighted avg       0.80      0.80      0.80     18321



**Support Vector Machine on PCA reduced dataset**

In [33]:
svm = SVC(kernel = 'rbf', C = 1, gamma = 3)
svm.fit(X_train, true_labels)
predicted_labels_svm = svm.predict(X_test)
print(f"The accuracy of support vector machine = {accuracy_score(predicted_labels_svm, y_test) * 100}%")
print(classification_report(y_test, predicted_labels_svm))

The accuracy of support vector machine = 78.7948256099558%
              precision    recall  f1-score   support

           0       0.76      0.85      0.81      9448
           1       0.82      0.72      0.77      8873

    accuracy                           0.79     18321
   macro avg       0.79      0.79      0.79     18321
weighted avg       0.79      0.79      0.79     18321



# **Univariate Feature Selection**

In [34]:
ufs = SelectKBest(score_func = chi2, k = 10)
ufs.fit(abs(concatenated_df_train), label_column_train)
X_train_selected = ufs.transform(concatenated_df_train)
X_test_selected = ufs.transform(concatenated_df_test)

**Logistic Regression on UFS data**

In [39]:
lr = LogisticRegression()
lr.fit(X_train_selected, label_column_train)
predicted_labels_lr_ufs = lr.predict(X_test_selected)
accuracy_lr_ufs = accuracy_score(last_column_test, predicted_labels_lr_ufs)
print(f"The accuracy of logistic regression = {accuracy_lr_ufs * 100}%")
print(classification_report(last_column_test, predicted_labels_lr_ufs))

The accuracy of logistic regression = 100.0%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9448
           1       1.00      1.00      1.00      8873

    accuracy                           1.00     18321
   macro avg       1.00      1.00      1.00     18321
weighted avg       1.00      1.00      1.00     18321



**Support Vector Machine on UFS data**

In [40]:
svm_ufs = SVC(kernel = 'rbf', C = 1, gamma = 3)
svm_ufs.fit(X_train_selected, label_column_train)
predicted_labels_lr_svm = svm_ufs.predict(X_test_selected)
print(f"The accuracy of support vector machine = {accuracy_score(predicted_labels_lr_svm, last_column_test) * 100}%")
print(classification_report(last_column_test, predicted_labels_lr_svm))

The accuracy of support vector machine = 96.94339828611976%
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      9448
           1       1.00      0.94      0.97      8873

    accuracy                           0.97     18321
   macro avg       0.97      0.97      0.97     18321
weighted avg       0.97      0.97      0.97     18321



**Random Forest Classifier on UFS data**

In [41]:
rf_ufs = RandomForestClassifier()
rf_ufs.fit(X_train_selected, label_column_train)
predicted_labels_rf_ufs = rf_ufs.predict(X_test_selected)
accuracy_rf_ufs = accuracy_score(predicted_labels_rf_ufs, last_column_test)
print(f"The accuracy of random forest classifier = {accuracy_rf_ufs * 100}%")
print(classification_report(last_column_test, predicted_labels_rf_ufs))

The accuracy of random forest classifier = 100.0%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9448
           1       1.00      1.00      1.00      8873

    accuracy                           1.00     18321
   macro avg       1.00      1.00      1.00     18321
weighted avg       1.00      1.00      1.00     18321

