# Missing Values Ratio

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import accuracy_score

# Load the dataset from the CSV file in the working directory
diabetes_data = pd.read_csv('diabetes.csv')

# Percentage of missing entries in each column
missing_values_ratio = diabetes_data.isna().mean().mul(100)

# Columns with more than 30% of their values missing are dropped
too_sparse = missing_values_ratio.gt(30)
features_to_remove = missing_values_ratio.loc[too_sparse].index
reduced_data = diabetes_data.drop(features_to_remove, axis=1)

print(f"Features removed due to missing values: {features_to_remove}")
print(f"Reduced dataset shape: {reduced_data.shape}")

Features removed due to missing values: Index([], dtype='object')
Reduced dataset shape: (768, 9)


## High Correlation Filter

In [None]:
# Pairwise correlation between every pair of columns
correlation_matrix = diabetes_data.corr()

# Two columns are treated as redundant when the magnitude of their
# correlation exceeds this value. The absolute value is used so that strongly
# negative relationships (e.g. -0.95) are reported as well as positive ones.
corr_threshold = 0.8

for feature in correlation_matrix.columns:
    column = correlation_matrix[feature]

    # Strong in either direction, excluding the feature's correlation with itself
    is_strong = column.abs() > corr_threshold
    is_other = column.index != feature
    high_corr = column[is_strong & is_other]

    # The signed coefficients are printed so the direction stays visible
    if not high_corr.empty:
        print(f"Highly correlated with {feature}:\n{high_corr}\n")

## Low Variance Filter

In [None]:
# Per-column variance of the dataset
variance = diabetes_data.var()

# Columns whose variance falls below this cut-off are considered uninformative
threshold = 0.01

# Collect the low-variance columns and drop them from the dataset
is_low_variance = variance.lt(threshold)
low_variance_features = variance.loc[is_low_variance].index
reduced_data = diabetes_data.drop(columns=low_variance_features)

print(f"Features removed due to low variance: {low_variance_features}")


Features removed due to low variance: Index([], dtype='object')


## Forward Feature Selection

In [None]:
# Prepare features (X) and target (y).
# Reads from `diabetes_data` (loaded in the first cell) so the cell works on a
# fresh kernel; the target is taken to be the last column.
feature_names = diabetes_data.columns[:-1]
X = diabetes_data.iloc[:, :-1].values
y = diabetes_data.iloc[:, -1].values  # already 1-D, no ravel() needed

# Split data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

print('Training dataset shape:', X_train.shape, y_train.shape)
print('Testing dataset shape:', X_test.shape, y_test.shape)

# Build Random Forest classifier to use in feature selection
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

# Forward selection: start from no features and greedily add the one that
# gives the best 5-fold cross-validated accuracy until 7 are chosen.
sfs_forward = SequentialFeatureSelector(
    clf,
    n_features_to_select=7,  # number of features to keep; can be changed
    direction='forward',
    scoring='accuracy',
    cv=5,
)

# Perform Forward Feature Selection (uses the training split only)
sfs_forward.fit(X_train, y_train)

# Column positions of the selected features, mapped back to their names.
# The positions refer to columns of X, so they are looked up in the
# feature-only name list rather than in the full column list.
selected_features_forward = sfs_forward.get_support(indices=True)
print(f"Selected features from forward selection: {feature_names[selected_features_forward].tolist()}")

# Train and evaluate the model using the selected features
X_train_selected_forward = sfs_forward.transform(X_train)
X_test_selected_forward = sfs_forward.transform(X_test)

clf.fit(X_train_selected_forward, y_train)
y_pred_forward = clf.predict(X_test_selected_forward)
accuracy_forward = accuracy_score(y_test, y_pred_forward)
print(f"Model accuracy with selected features (forward selection): {accuracy_forward}")


Training dataset shape: (576, 8) (576,)
Testing dataset shape: (192, 8) (192,)
Selected features from forward selection: ['Pregnancies', 'Glucose', 'BloodPressure', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
Model accuracy with selected features (forward selection): 0.734375


## Backward Feature Elimination

In [None]:
# Prepare features (X) and target (y).
# Reads from `diabetes_data` (loaded in the first cell) so the cell works on a
# fresh kernel; the target is taken to be the last column.
feature_names = diabetes_data.columns[:-1]
X = diabetes_data.iloc[:, :-1].values
y = diabetes_data.iloc[:, -1].values  # already 1-D, no ravel() needed

# Split data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

print('Training dataset shape:', X_train.shape, y_train.shape)
print('Testing dataset shape:', X_test.shape, y_test.shape)

# Build Random Forest classifier to use in feature selection
clf1 = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

# Backward elimination: start from all features and greedily remove the one
# whose removal hurts 5-fold cross-validated accuracy the least until 7 remain.
sfs_backward = SequentialFeatureSelector(
    clf1,
    n_features_to_select=7,  # number of features to keep; can be changed
    direction='backward',
    scoring='accuracy',
    cv=5,
)

# Perform Backward Feature Selection (uses the training split only)
sfs_backward.fit(X_train, y_train)

# Column positions of the selected features, mapped back to their names.
# The positions refer to columns of X, so they are looked up in the
# feature-only name list rather than in the full column list.
selected_features_backward = sfs_backward.get_support(indices=True)
print(f"Selected features from backward selection: {feature_names[selected_features_backward].tolist()}")

# Train and evaluate the model using the selected features
X_train_selected_backward = sfs_backward.transform(X_train)
X_test_selected_backward = sfs_backward.transform(X_test)

clf1.fit(X_train_selected_backward, y_train)
# Predict with clf1, the model fitted on the line above, so the reported
# accuracy belongs to the backward-selected feature set and not to a
# classifier left over from another cell.
y_pred_backward = clf1.predict(X_test_selected_backward)
accuracy_backward = accuracy_score(y_test, y_pred_backward)
print(f"Model accuracy with selected features (backward selection): {accuracy_backward}")

Training dataset shape: (576, 8) (576,)
Testing dataset shape: (192, 8) (192,)
Selected features from backward selection: ['Pregnancies', 'Glucose', 'BloodPressure', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
Model accuracy with selected features (backward selection): 0.734375


## Random Forest

In [None]:
# Prepare features (X) and target (y).
# Reads from `diabetes_data` (loaded in the first cell) so the cell works on a
# fresh kernel; the target is taken to be the last column.
feature_names = diabetes_data.columns[:-1]
X = diabetes_data.iloc[:, :-1].values
y = diabetes_data.iloc[:, -1].values

# Split data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Build Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

# Fit the model on the training data (all features)
clf.fit(X_train, y_train)

# Get feature importance scores, one per column of X
importances = clf.feature_importances_

# Create a DataFrame to view features and their importance scores.
# It gets a default 0..n-1 index, so each row's index label equals the
# position of that feature's column in X.
feature_importances = pd.DataFrame({
    'Feature': feature_names,  # target column already excluded
    'Importance': importances
})

# Sort features by importance (index labels are kept, so they still point
# at the original column positions)
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

# Print ranked features
print("Feature Importance Scores:")
print(feature_importances)

# Select top 5 most important features
top_5 = feature_importances.head(5)
top_5_features = top_5['Feature'].values
print(f"\nTop 5 important features: {top_5_features}")

# Keep only the columns of the top 5 features, using their positions in X
top_5_positions = top_5.index.to_numpy()
X_train_top5 = X_train[:, top_5_positions]
X_test_top5 = X_test[:, top_5_positions]

# Train and evaluate the model using top 5 features.
# Note: this refits `clf`, replacing the all-features model fitted above.
clf.fit(X_train_top5, y_train)
y_pred_top5 = clf.predict(X_test_top5)
accuracy_top5 = accuracy_score(y_test, y_pred_top5)

print(f"\nModel accuracy with top 5 important features: {accuracy_top5}")


Feature Importance Scores:
                    Feature  Importance
1                   Glucose    0.278761
5                       BMI    0.158058
7                       Age    0.139093
6  DiabetesPedigreeFunction    0.118225
2             BloodPressure    0.088883
0               Pregnancies    0.074491
3             SkinThickness    0.071575
4                   Insulin    0.070913

Top 5 important features: ['Glucose' 'BMI' 'Age' 'DiabetesPedigreeFunction' 'BloodPressure']

Model accuracy with top 5 important features: 0.7395833333333334
