In [1]:
# from google.colab import drive
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# drive.mount('/content/drive')

dataset_path = 'anova_selector.csv'
data = pd.read_csv(dataset_path)

X = data.drop('obesity', axis=1)  # Features
y = data['obesity']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the models
# 1. Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier()
gb_classifier.fit(X_train, y_train)

# 2. Logistic Regression
logistic_classifier = LogisticRegression()
logistic_classifier.fit(X_train, y_train)

# 3. Random Forest Classifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)

# 4. Support Vector Machine (SVM)
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

# Make predictions
y_pred_gb = gb_classifier.predict(X_test)
y_pred_logistic = logistic_classifier.predict(X_test)
y_pred_rf = rf_classifier.predict(X_test)
y_pred_svm = svm_classifier.predict(X_test)

# Evaluate the models
# Calculate accuracy and print classification reports
accuracy_gb = accuracy_score(y_test, y_pred_gb)
report_gb = classification_report(y_test, y_pred_gb)

accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
report_logistic = classification_report(y_test, y_pred_logistic)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf)

accuracy_svm = accuracy_score(y_test, y_pred_svm)
report_svm = classification_report(y_test, y_pred_svm)

# Print the evaluation results
print("Gradient Boosting Classifier:")
print(f"Accuracy: {accuracy_gb}")
print("Classification Report:")
print(report_gb)

print("Logistic Regression:")
print(f"Accuracy: {accuracy_logistic}")
print("Classification Report:")
print(report_logistic)

print("Random Forest Classifier:")
print(f"Accuracy: {accuracy_rf}")
print("Classification Report:")
print(report_rf)

print("Support Vector Machine:")
print(f"Accuracy: {accuracy_svm}")
print("Classification Report:")
print(report_svm)

Gradient Boosting Classifier:
Accuracy: 0.9333333333333333
Classification Report:
              precision    recall  f1-score   support

          No       1.00      0.86      0.92         7
         Yes       0.89      1.00      0.94         8

    accuracy                           0.93        15
   macro avg       0.94      0.93      0.93        15
weighted avg       0.94      0.93      0.93        15

Logistic Regression:
Accuracy: 0.8
Classification Report:
              precision    recall  f1-score   support

          No       0.83      0.71      0.77         7
         Yes       0.78      0.88      0.82         8

    accuracy                           0.80        15
   macro avg       0.81      0.79      0.80        15
weighted avg       0.80      0.80      0.80        15

Random Forest Classifier:
Accuracy: 0.8666666666666667
Classification Report:
              precision    recall  f1-score   support

          No       0.86      0.86      0.86         7
         Yes       

In [2]:
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Load your methylation dataset
data = pd.read_csv('anova_selector.csv')

X = data.drop('obesity', axis=1)  # Features
y = data['obesity']  # Target variable

# Preprocess the data (standardize features)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize models
models = [
    ("Gradient Boosting", GradientBoostingClassifier()),
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("Random Forest", RandomForestClassifier()),
    ("Support Vector Machine", SVC())
]

# Perform cross-validation
num_folds = 5  # Number of folds
results = []

for name, model in models:
    kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
    cv_results = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
    results.append((name, cv_results))

# Display the cross-validation results
for name, cv_results in results:
    print(f"{name}: Mean Accuracy = {cv_results.mean()}, Standard Deviation = {cv_results.std()}")


Gradient Boosting: Mean Accuracy = 0.9152380952380952, Standard Deviation = 0.08348421714594716
Logistic Regression: Mean Accuracy = 0.9714285714285715, Standard Deviation = 0.03499271061118824
Random Forest: Mean Accuracy = 0.9438095238095239, Standard Deviation = 0.05323119471421761
Support Vector Machine: Mean Accuracy = 0.9438095238095239, Standard Deviation = 0.05323119471421761
