# FEATURE SELECTION

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the CSV file into a DataFrame
path = r"C:\SECB3203 PROJECT\alzheimer_dataset.csv"  
df = pd.read_csv(path)

# Display the original DataFrame
print("Original DataFrame:")
print(df)

# Label encode non-numerical categorical variables ('Group' and 'M/F')
label_encoder = LabelEncoder()
df['Group'] = label_encoder.fit_transform(df['Group'])
df['M/F'] = label_encoder.fit_transform(df['M/F'])

# Calculate the correlation matrix
correlation_matrix = df.corr()

# Display the correlation matrix
print("\nCorrelation Matrix:")
print(correlation_matrix)

# Set the threshold for correlation coefficient
correlation_threshold = 0.5  # You can adjust this threshold based on your requirements

# Identify highly correlated features
highly_correlated_features = set()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > correlation_threshold:
            colname = correlation_matrix.columns[i]
            highly_correlated_features.add(colname)

# Drop highly correlated features
df_selected = df.drop(highly_correlated_features, axis=1)

# Display the DataFrame after feature selection
print("\nDataFrame after feature selection using correlation coefficient:")
print(df_selected)


Original DataFrame:
           Group M/F  Age  EDUC  SES  MMSE  CDR  eTIV   nWBV    ASF
0    Nondemented   M   87    14  2.0  27.0  0.0  1987  0.696  0.883
1    Nondemented   M   88    14  2.0  30.0  0.0  2004  0.681  0.876
2       Demented   M   75    12  2.0  23.0  0.5  1678  0.736  1.046
3       Demented   M   76    12  2.0  28.0  0.5  1738  0.713  1.010
4       Demented   M   80    12  2.0  22.0  0.5  1698  0.701  1.034
..           ...  ..  ...   ...  ...   ...  ...   ...    ...    ...
368     Demented   M   82    16  1.0  28.0  0.5  1693  0.694  1.037
369     Demented   M   86    16  1.0  26.0  0.5  1688  0.675  1.040
370  Nondemented   F   61    13  2.0  30.0  0.0  1319  0.801  1.331
371  Nondemented   F   63    13  2.0  30.0  0.0  1327  0.796  1.323
372  Nondemented   F   65    13  2.0  30.0  0.0  1333  0.801  1.317

[373 rows x 10 columns]

Correlation Matrix:
          Group       M/F       Age      EDUC       SES      MMSE       CDR  \
Group  1.000000 -0.143918 -0.049267  0.

In [7]:
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split

# Read the CSV file into a DataFrame
file_path = r"C:\SECB3203 PROJECT\alzheimer_dataset.csv"  # Replace 'your_file.csv' with the actual path to your CSV file
df = pd.read_csv(file_path)

# Display the original DataFrame
print("Original DataFrame:")
print(df)

# Assuming 'Group' is your target variable
target_variable = 'Group'

# Separate features and target variable
X = df.drop(target_variable, axis=1)
y = df[target_variable]

# Convert non-numerical categorical variables to numerical using LabelEncoder
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for column in X.select_dtypes(include='object').columns:
    X[column] = label_encoder.fit_transform(X[column])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate Information Gain for each feature
info_gain = mutual_info_classif(X_train, y_train)

# Create a DataFrame to display feature names and their corresponding Information Gain
info_gain_df = pd.DataFrame({'Feature': X.columns, 'Information Gain': info_gain})
info_gain_df = info_gain_df.sort_values(by='Information Gain', ascending=False)

# Display the DataFrame sorted by Information Gain
print("\nFeature Information Gain:")
print(info_gain_df)

# Set a threshold for Information Gain
info_gain_threshold = 0.1  # You can adjust this threshold based on your requirements

# Select features with Information Gain above the threshold
selected_features = info_gain_df[info_gain_df['Information Gain'] > info_gain_threshold]['Feature']

# Display the selected features
print("\nSelected Features:")
print(selected_features)

# Create a DataFrame with only the selected features
df_selected = df[selected_features]

# Display the DataFrame after feature selection
print("\nDataFrame after feature selection using Information Gain:")
print(df_selected)

Original DataFrame:
           Group M/F  Age  EDUC  SES  MMSE  CDR  eTIV   nWBV    ASF
0    Nondemented   M   87    14  2.0  27.0  0.0  1987  0.696  0.883
1    Nondemented   M   88    14  2.0  30.0  0.0  2004  0.681  0.876
2       Demented   M   75    12  2.0  23.0  0.5  1678  0.736  1.046
3       Demented   M   76    12  2.0  28.0  0.5  1738  0.713  1.010
4       Demented   M   80    12  2.0  22.0  0.5  1698  0.701  1.034
..           ...  ..  ...   ...  ...   ...  ...   ...    ...    ...
331     Demented   M   82    16  1.0  28.0  0.5  1693  0.694  1.037
332     Demented   M   86    16  1.0  26.0  0.5  1688  0.675  1.040
333  Nondemented   F   61    13  2.0  30.0  0.0  1319  0.801  1.331
334  Nondemented   F   63    13  2.0  30.0  0.0  1327  0.796  1.323
335  Nondemented   F   65    13  2.0  30.0  0.0  1333  0.801  1.317

[336 rows x 10 columns]

Feature Information Gain:
  Feature  Information Gain
5     CDR          0.649243
4    MMSE          0.278402
7    nWBV          0.163312


In [8]:
import pandas as pd
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the CSV file into a DataFrame
file_path = r"C:\SECB3203 PROJECT\alzheimer_dataset.csv"  # Replace 'your_file.csv' with the actual path to your CSV file
df = pd.read_csv(file_path)

# Display the original DataFrame
print("Original DataFrame:")
print(df)

# Assuming 'Group' is your target variable
target_variable = 'Group'

# Separate features and target variable
X = df.drop(target_variable, axis=1)
y = df[target_variable]

# Convert non-numerical categorical variables to numerical using LabelEncoder
label_encoder = LabelEncoder()
for column in X.select_dtypes(include='object').columns:
    X[column] = label_encoder.fit_transform(X[column])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate chi-square statistics and p-values for each feature
chi2_stat, p_values = chi2(X_train, y_train)

# Create a DataFrame to display feature names, chi-square statistics, and p-values
chi2_df = pd.DataFrame({'Feature': X.columns, 'Chi2_Statistic': chi2_stat, 'P-Value': p_values})
chi2_df = chi2_df.sort_values(by='Chi2_Statistic', ascending=False)

# Display the DataFrame sorted by chi-square statistics
print("\nFeature Chi-Square Statistics and P-Values:")
print(chi2_df)

# Set a threshold for p-value
p_value_threshold = 0.05  # You can adjust this threshold based on your requirements

# Select features with p-values below the threshold
selected_features = chi2_df[chi2_df['P-Value'] < p_value_threshold]['Feature']

# Display the selected features
print("\nSelected Features:")
print(selected_features)

# Create a DataFrame with only the selected features
df_selected = df[selected_features]

# Display the DataFrame after feature selection
print("\nDataFrame after feature selection using Chi-Square:")
print(df_selected)

Original DataFrame:
           Group M/F  Age  EDUC  SES  MMSE  CDR  eTIV   nWBV    ASF
0    Nondemented   M   87    14  2.0  27.0  0.0  1987  0.696  0.883
1    Nondemented   M   88    14  2.0  30.0  0.0  2004  0.681  0.876
2       Demented   M   75    12  2.0  23.0  0.5  1678  0.736  1.046
3       Demented   M   76    12  2.0  28.0  0.5  1738  0.713  1.010
4       Demented   M   80    12  2.0  22.0  0.5  1698  0.701  1.034
..           ...  ..  ...   ...  ...   ...  ...   ...    ...    ...
331     Demented   M   82    16  1.0  28.0  0.5  1693  0.694  1.037
332     Demented   M   86    16  1.0  26.0  0.5  1688  0.675  1.040
333  Nondemented   F   61    13  2.0  30.0  0.0  1319  0.801  1.331
334  Nondemented   F   63    13  2.0  30.0  0.0  1327  0.796  1.323
335  Nondemented   F   65    13  2.0  30.0  0.0  1333  0.801  1.317

[336 rows x 10 columns]

Feature Chi-Square Statistics and P-Values:
  Feature  Chi2_Statistic       P-Value
5     CDR       96.328765  9.730598e-23
4    MMSE     