In [2]:
# Step 1: Load Data
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Fixed seed for the mutual-information estimate. Without it the scores (and
# therefore which features clear each threshold) can differ between runs.
IG_RANDOM_STATE = 42

data = pd.read_csv("diabetes.csv")

# Step 2: Preprocess
# Replace missing Glucose / BMI entries with the column mean.
# NOTE(review): SimpleImputer treats only NaN as missing by default. If this
# CSV encodes missing measurements as 0 instead of NaN, this step is a no-op
# -- confirm against the data and convert 0 -> NaN first if so.
# NOTE(review): the imputer (and the feature ranking below) is fitted on the
# full dataset before any train/test split, so test rows influence both.
# Consider fitting them on the training portion only.
imputer = SimpleImputer(strategy='mean')
data[['Glucose','BMI']] = imputer.fit_transform(data[['Glucose','BMI']])

# Step 3: Feature Selection (IG)
# X holds every column except the label; y is the "Outcome" label column.
X = data.drop("Outcome", axis=1)
y = data["Outcome"]
# Mutual information between each feature and the label, ranked high -> low.
ig = mutual_info_classif(X, y, random_state=IG_RANDOM_STATE)
ig_scores = pd.Series(ig, index=X.columns).sort_values(ascending=False)

# Step 4: Apply Thresholds
# Map each threshold to the list of features whose score is >= that threshold.
# A list may be empty if no feature reaches the threshold.
thresholds = [0.01, 0.05, 0.1]
selected_features = {thresh: ig_scores[ig_scores >= thresh].index.tolist() for thresh in thresholds}

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Seed used for the train/test split and the decision tree so the reported
# accuracies are reproducible across runs.
SPLIT_RANDOM_STATE = 42

# Classifiers to compare, keyed by the display name used in `results`.
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=SPLIT_RANDOM_STATE),
    "k-NN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB()
}

# One record per (threshold, model) pair: {"Threshold", "Model", "Accuracy"}.
results = []
for thresh, features in selected_features.items():
    # A threshold above every score selects no features; there is nothing to
    # fit on a zero-column frame, so skip it explicitly.
    if not features:
        print("Threshold {}: no features selected, skipping.".format(thresh))
        continue

    X_sub = X[features]
    # The same seed and stratification are used for every threshold, so each
    # feature subset is evaluated on the same train/test rows and the
    # accuracies are comparable across thresholds. `stratify=y` keeps the
    # class proportions of y in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        X_sub,
        y,
        test_size=0.3,
        random_state=SPLIT_RANDOM_STATE,
        stratify=y,
    )

    for name, model in models.items():
        # fit() re-trains the estimator on the current subset.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        results.append({
            "Threshold": thresh,
            "Model": name,
            "Accuracy": accuracy_score(y_test, y_pred)
        })