**Experiment 1: Load a Sample Dataset**

In [None]:
import pandas as pd

# Location of the diabetes CSV. This is an absolute path, so the file must
# exist at exactly this location; edit DATA_PATH if your copy lives elsewhere.
DATA_PATH = "/content/Diabetes (1).csv"

# Read the CSV into a DataFrame and preview the first five rows.
df = pd.read_csv(DATA_PATH)
print(df.head())


   pregnancies  glucose  diastolic  triceps  insulin   bmi    dpf  age  \
0            6      148         72       35        0  33.6  0.627   50   
1            1       85         66       29        0  26.6  0.351   31   
2            8      183         64        0        0  23.3  0.672   32   
3            1       89         66       23       94  28.1  0.167   21   
4            0      137         40       35      168  43.1  2.288   33   

   diabetes  
0         1  
1         0  
2         1  
3         0  
4         1  


**Experiment 2: Create a Sample Questionnaire-Based, Domain-Specific Dataset**

In [None]:
import pandas as pd

# Build a small questionnaire-style dataset: one tuple per respondent group,
# with the answers listed in the same order as the column names.
survey_columns = ("Age Group", "Exercise Regularly", "Smokes", "Has Diabetes")
survey_rows = [
    ("18-25", "Yes", "No", "No"),
    ("26-40", "No", "Yes", "No"),
    ("41-60", "Yes", "Yes", "Yes"),
    ("60+", "No", "No", "Yes"),
]

# Transpose the rows into a column-name -> list-of-answers mapping.
data = {
    column: list(answers)
    for column, answers in zip(survey_columns, zip(*survey_rows))
}

questionnaire_df = pd.DataFrame(data)
print(questionnaire_df)


  Age Group Exercise Regularly Smokes Has Diabetes
0     18-25                Yes     No           No
1     26-40                 No    Yes           No
2     41-60                Yes    Yes          Yes
3       60+                 No     No          Yes


**Experiment 3: Data Cleaning Techniques**

In [None]:
import numpy as np

# In these columns a value of 0 is treated as "not recorded": mark those
# entries as missing, then fill every missing cell with its column median.
cols_with_zero_invalid = ['glucose', 'diastolic', 'triceps', 'insulin', 'bmi']

zeros_as_missing = df[cols_with_zero_invalid].replace(0, np.nan)
df[cols_with_zero_invalid] = zeros_as_missing

# Medians are taken after the zeros were masked, so they ignore the placeholders.
column_medians = df.median()
df.fillna(value=column_medians, inplace=True)

# Sanity check: every column should now report zero missing values.
missing_per_column = df.isnull().sum()
print(missing_per_column)


pregnancies    0
glucose        0
diastolic      0
triceps        0
insulin        0
bmi            0
dpf            0
age            0
diabetes       0
dtype: int64


**Experiment 4: Calculate Information Gain**

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Separate the predictors from the `diabetes` target column.
X = df.drop("diabetes", axis=1)
y = df["diabetes"]

# mutual_info_classif adds small random noise to continuous features, so the
# scores differ from run to run unless a seed is given. Use the same seed as
# the train/test split elsewhere in this notebook to keep results reproducible.
info_gain = mutual_info_classif(X, y, random_state=42)

# Report one score per feature, in column order.
for col, score in zip(X.columns, info_gain):
    print(f"Information Gain for {col}: {score:.4f}")


Information Gain for pregnancies: 0.0301
Information Gain for glucose: 0.1329
Information Gain for diastolic: 0.0282
Information Gain for triceps: 0.0000
Information Gain for insulin: 0.0243
Information Gain for bmi: 0.0997
Information Gain for dpf: 0.0165
Information Gain for age: 0.0786


**Experiment 5: Implement Naive Bayes, Decision Tree, and KNN**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Clean the dataset: zeros in these columns stand for missing measurements.
# np.nan (not pd.NA) is used as the placeholder so the columns stay numeric;
# pd.NA inside a NumPy-backed numeric column converts it to object dtype,
# which df.median() and the scikit-learn estimators cannot work with.
cols_with_zero_invalid = ['glucose', 'diastolic', 'triceps', 'insulin', 'bmi']
df[cols_with_zero_invalid] = df[cols_with_zero_invalid].replace(0, np.nan)
df.fillna(df.median(), inplace=True)

# Prepare data
X = df.drop("diabetes", axis=1)
y = df["diabetes"]

# Split into train and test sets (80/20, seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def fit_and_report(model, title):
    """Fit `model` on the training split and report its test performance.

    Prints a banner containing `title` followed by the classification report
    for the held-out test split, and returns the test-set predictions.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(f"===== {title} Classification Report =====")
    print(classification_report(y_test, predictions))
    return predictions


# --- Naive Bayes ---
nb = GaussianNB()
pred_nb = fit_and_report(nb, "Naive Bayes")

# --- Decision Tree ---
# Seeded so the fitted tree (and therefore its scores) is the same on every run.
dt = DecisionTreeClassifier(random_state=42)
pred_dt = fit_and_report(dt, "Decision Tree")

# --- K-Nearest Neighbors ---
# NOTE(review): KNN is distance-based and the features are used unscaled here;
# consider standardising them before fitting.
knn = KNeighborsClassifier(n_neighbors=5)
pred_knn = fit_and_report(knn, "KNN")


===== Naive Bayes Classification Report =====
              precision    recall  f1-score   support

           0       0.81      0.80      0.81        99
           1       0.65      0.67      0.66        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154

===== Decision Tree Classification Report =====
              precision    recall  f1-score   support

           0       0.78      0.77      0.77        99
           1       0.59      0.60      0.59        55

    accuracy                           0.71       154
   macro avg       0.68      0.68      0.68       154
weighted avg       0.71      0.71      0.71       154

===== KNN Classification Report =====
              precision    recall  f1-score   support

           0       0.77      0.71      0.74        99
           1       0.54      0.62      0.58        55

    accuracy                           0.68       

**Experiment 6: Compare Classification Results**

In [None]:
from sklearn.metrics import accuracy_score

# Print the test-set accuracy of each classifier, one line per model.
labelled_predictions = (
    ("Accuracy - Naive Bayes:", pred_nb),
    ("Accuracy - Decision Tree:", pred_dt),
    ("Accuracy - KNN:", pred_knn),
)
for label, predictions in labelled_predictions:
    print(label, accuracy_score(y_test, predictions))


Accuracy - Naive Bayes: 0.7532467532467533
Accuracy - Decision Tree: 0.7077922077922078
Accuracy - KNN: 0.6753246753246753


**Experiment 7: Outlier Detection and Removal**

In [None]:
# Use the IQR method to detect outliers.
# The fences are computed on the feature columns only. `diabetes` is the 0/1
# class label, not a measurement: if one class made up fewer than 25% of the
# rows, the label's IQR would be 0 and every row of that class would be
# flagged as an "outlier" and dropped.
outlier_features = df.drop(columns="diabetes")

Q1 = outlier_features.quantile(0.25)
Q3 = outlier_features.quantile(0.75)
IQR = Q3 - Q1

# A value is an outlier when it lies more than 1.5 * IQR outside the quartiles.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = (outlier_features < lower_fence) | (outlier_features > upper_fence)

# Keep only the rows that have no outlying value in any feature column.
df_no_outliers = df[~is_outlier.any(axis=1)]

print("Original shape:", df.shape)
print("After removing outliers:", df_no_outliers.shape)


Original shape: (768, 9)
After removing outliers: (375, 9)


**Experiment 8: Apriori Algorithm for Frequent Itemset Mining**

In [None]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Convert numerical features into categorical items for the transaction format.
# Each column is split into two equal-width bins ("Low" / "High") and every
# level is tagged with its column name, e.g. "glucose=High". Without the tag
# all columns would share the same two items ("Low", "High"), and the mined
# itemsets would say nothing about which feature they refer to.
df_apriori = df.apply(
    lambda col: f"{col.name}=" + pd.cut(col, bins=2, labels=["Low", "High"]).astype(str)
)

# One transaction per row: the list of "column=level" items for that row.
transactions = df_apriori.values.tolist()

# One-hot encode the transactions into a boolean item matrix.
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)

df_trans = pd.DataFrame(te_ary, columns=te.columns_)

# Keep itemsets that occur in at least 30% of the rows.
frequent_itemsets = apriori(df_trans, min_support=0.3, use_colnames=True)

print(frequent_itemsets)


    support     itemsets
0  0.733073       (High)
1  1.000000        (Low)
2  0.733073  (Low, High)


**Experiment 9: Basic Statistics Using a Statistical Tool (Pandas)**

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
summary_stats = df.describe()
print(summary_stats)


       pregnancies     glucose   diastolic     triceps     insulin  \
count   768.000000  768.000000  768.000000  768.000000  768.000000   
mean      3.845052  121.656250   72.386719   29.108073  140.671875   
std       3.369578   30.438286   12.096642    8.791221   86.383060   
min       0.000000   44.000000   24.000000    7.000000   14.000000   
25%       1.000000   99.750000   64.000000   25.000000  121.500000   
50%       3.000000  117.000000   72.000000   29.000000  125.000000   
75%       6.000000  140.250000   80.000000   32.000000  127.250000   
max      17.000000  199.000000  122.000000   99.000000  846.000000   

              bmi         dpf         age    diabetes  
count  768.000000  768.000000  768.000000  768.000000  
mean    32.455208    0.471876   33.240885    0.348958  
std      6.875177    0.331329   11.760232    0.476951  
min     18.200000    0.078000   21.000000    0.000000  
25%     27.500000    0.243750   24.000000    0.000000  
50%     32.300000    0.372500   2