<a href="https://colab.research.google.com/github/WahyuKhairi06/DIF62130_A_25_2311531009_Wahyu-Khairi/blob/main/Praktikum%203/BernoulliNB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**Tahap 1: Memuat dan Memeriksa Dataset**

In [96]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report


# Load dataset
file_path = "/content/alzheimers_disease_data.csv"
df = pd.read_csv(file_path)




# **Load dataset**

In [97]:
# Menampilkan informasi dataset
print(df.head())       # Melihat 5 baris pertama


   PatientID  Age  Gender  Ethnicity  EducationLevel        BMI  Smoking  \
0       4751   73       0          0               2  22.927749        0   
1       4752   89       0          0               0  26.827681        0   
2       4753   73       0          3               1  17.795882        0   
3       4754   74       1          0               1  33.800817        1   
4       4755   89       0          0               0  20.716974        0   

   AlcoholConsumption  PhysicalActivity  DietQuality  ...  MemoryComplaints  \
0           13.297218          6.327112     1.347214  ...                 0   
1            4.542524          7.619885     0.518767  ...                 0   
2           19.555085          7.844988     1.826335  ...                 0   
3           12.209266          8.428001     7.435604  ...                 0   
4           18.454356          6.310461     0.795498  ...                 0   

   BehavioralProblems       ADL  Confusion  Disorientation  \
0     

In [98]:
print(df.info())       # Melihat tipe data dan missing values


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

In [99]:
print(df.describe())   # Statistik ringkasan

         PatientID          Age       Gender    Ethnicity  EducationLevel  \
count  2149.000000  2149.000000  2149.000000  2149.000000     2149.000000   
mean   5825.000000    74.908795     0.506282     0.697534        1.286645   
std     620.507185     8.990221     0.500077     0.996128        0.904527   
min    4751.000000    60.000000     0.000000     0.000000        0.000000   
25%    5288.000000    67.000000     0.000000     0.000000        1.000000   
50%    5825.000000    75.000000     1.000000     0.000000        1.000000   
75%    6362.000000    83.000000     1.000000     1.000000        2.000000   
max    6899.000000    90.000000     1.000000     3.000000        3.000000   

               BMI      Smoking  AlcoholConsumption  PhysicalActivity  \
count  2149.000000  2149.000000         2149.000000       2149.000000   
mean     27.655697     0.288506           10.039442          4.920202   
std       7.217438     0.453173            5.757910          2.857191   
min      15.00

# **Tahap 2: Preprocessing Dataset**

In [100]:
# Pilih fitur target (misalnya, kolom 'Diagnosis')
target_column = 'Diagnosis'  # Ganti dengan nama kolom yang sesuai
df[target_column] = df[target_column].map({'Alzheimer': 1, 'Healthy': 0})  # Ubah label ke 0/1

# Hapus kolom yang tidak relevan
df.drop(columns=['Patient_ID'], inplace=True, errors='ignore')
df.drop(columns=['DoctorInCharge'], inplace=True, errors='ignore')

# Konversi fitur BMI menjadi biner (BMI >= 25 -> 1)
df['BMI'] = (df['BMI'] >= 25).astype(int)

# AlcoholConsumption > 2 drinks/week -> 1
df['AlcoholConsumption'] = (df['AlcoholConsumption'] > 2).astype(int)

# PhysicalActivity >= 3 kali/minggu -> 1
df['PhysicalActivity'] = (df['PhysicalActivity'] >= 3).astype(int)

# DietQuality >= 7 -> 1 (bagus), else 0
df['DietQuality'] = (df['DietQuality'] >= 7).astype(int)

# FunctionalAssessment < 7 -> 1 (fungsi terganggu), else 0
df['FunctionalAssessment'] = (df['FunctionalAssessment'] < 7).astype(int)

# ADL < 6 -> 1 (butuh bantuan), else 0
df['ADL'] = (df['ADL'] < 6).astype(int)

# Binerisasi fitur numerik dengan threshold (contoh: Age >= 65)
df['Age'] = (df['Age'] >= 65).astype(int)

# Tampilkan dataset setelah preprocessing
print(df.head())


   PatientID  Age  Gender  Ethnicity  EducationLevel  BMI  Smoking  \
0       4751    1       0          0               2    0        0   
1       4752    1       0          0               0    1        0   
2       4753    1       0          3               1    0        0   
3       4754    1       1          0               1    1        1   
4       4755    1       0          0               0    0        0   

   AlcoholConsumption  PhysicalActivity  DietQuality  ...  \
0                   1                 1            0  ...   
1                   1                 1            0  ...   
2                   1                 1            0  ...   
3                   1                 1            1  ...   
4                   1                 1            0  ...   

   FunctionalAssessment  MemoryComplaints  BehavioralProblems  ADL  Confusion  \
0                     1                 0                   0    1          0   
1                     0                 0         

# **Tahap 3: Pisahkan Data untuk Training dan Testing**

In [101]:
# Pisahkan fitur (X) dan label (y)
X = df.drop(columns=[target_column])
y = df[target_column]

# Split data menjadi training (80%) dan testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)


Training set size: (1719, 33)
Testing set size: (430, 33)


# **Tahap 4: Melatih Model Bernoulli Naïve Bayes**

In [102]:
imputer = SimpleImputer(strategy='most_frequent')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)


from sklearn.impute import SimpleImputer
imputer_y = SimpleImputer(strategy='most_frequent')


if y_train.isnull().all():
    y_train = np.zeros_like(y_train)
else:
    y_train = imputer_y.fit_transform(y_train.values.reshape(-1, 1))
    y_train = y_train.ravel()


# Inisialisasi model BernoulliNB
model = BernoulliNB()

# Latih model dengan data training
model.fit(X_train, y_train)

print("Model telah dilatih!")

Model telah dilatih!


# **5. Evaluasi Model**

In [103]:
imputer_y_test = SimpleImputer(strategy='most_frequent')
# Check if y_test contains only NaN values
if y_test.isnull().all():
    # If all values are NaN, fill with a default value (e.g., 0) to avoid an empty array
    y_test_imputed = np.zeros_like(y_test, dtype=int)  # Use int type to match y_pred
else:
    y_test_imputed = imputer_y_test.fit_transform(y_test.values.reshape(-1, 1))
    y_test_imputed = y_test_imputed.ravel()


# Continue with your code
# Prediksi pada data testing
y_pred = model.predict(X_test)

# Calculate accuracy with the imputed y_test
accuracy = accuracy_score(y_test_imputed, y_pred)
print(f"Akurasi Model: {accuracy:.2f}")

# Tampilkan laporan klasifikasi
print(classification_report(y_test_imputed, y_pred))

Akurasi Model: 1.00
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       430

    accuracy                           1.00       430
   macro avg       1.00      1.00      1.00       430
weighted avg       1.00      1.00      1.00       430

