In [None]:
!pip install pandas

In [None]:
!pip install openpyxl

In [None]:
import pandas as pd
import numpy as np

# Load the dataset; the path is relative to the current working directory.
data = pd.read_excel("OSA_extreme_male_Gemini.xlsx")

# Fail fast with a clear message if a column indexed by the modeling and
# plotting cells is missing, instead of hitting a KeyError much later.
required_cols = ["Age", "Height", "Weight", "BMI", "Cervical", "IAH", "OSA"]
missing_cols = [col for col in required_cols if col not in data.columns]
if missing_cols:
    raise KeyError(f"Dataset is missing expected columns: {missing_cols}")

# Quick look at what was loaded: dimensions, first rows, column labels.
print("Shape:", data.shape)
print(data.head())
print(data.columns)

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Baseline classifier: standardized features fed into logistic regression,
# evaluated with 5-fold stratified cross-validation.
feat_cols = ["Age", "Height", "Weight", "BMI", "Cervical"]

X_clf = data.loc[:, feat_cols]
# Binary target: 1 where the OSA column equals "Severe", 0 otherwise.
y_clf = data["OSA"].eq("Severe").astype(int)

cv_clf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

clf_pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=2000)),
])

clf_scores = cross_validate(
    clf_pipe, X_clf, y_clf,
    cv=cv_clf,
    scoring=["accuracy", "f1", "roc_auc"],
)

# Report the mean of each metric across the folds.
for label, key in (
    ("CLF CV Acc:", "test_accuracy"),
    ("CLF CV F1 :", "test_f1"),
    ("CLF CV AUC:", "test_roc_auc"),
):
    print(label, clf_scores[key].mean())

In [None]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Regression setup: same predictor columns, with the IAH column as target.
feat_cols = ["Age", "Height", "Weight", "BMI", "Cervical"]
X_reg = data.loc[:, feat_cols]
y_reg = data["IAH"]

cv_reg = KFold(n_splits=5, shuffle=True, random_state=42)

# The SVR is wrapped in a pipeline with a scaler; the random forest is
# used on the raw features.
svr_rbf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", SVR(kernel="rbf", C=10, gamma="scale")),
])
rf_reg = RandomForestRegressor(n_estimators=500, max_depth=None, random_state=42)
reg_models = {"SVR_RBF": svr_rbf, "RF": rf_reg}

reg_scoring = [
    "neg_mean_absolute_error",
    "neg_root_mean_squared_error",
    "r2",
]

print("=== NONLINEAR REGRESSION ===")
for name, model in reg_models.items():
    scores = cross_validate(model, X_reg, y_reg, cv=cv_reg, scoring=reg_scoring)

    # The error scorers are negated by scikit-learn naming ("neg_..."),
    # so flip the sign back before reporting.
    mae = -scores["test_neg_mean_absolute_error"].mean()
    rmse = -scores["test_neg_root_mean_squared_error"].mean()
    r2 = scores["test_r2"].mean()

    print(f"\n{name}")
    print(" MAE :", mae)
    print(" RMSE:", rmse)
    print(" R2  :", r2)


In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
# Imported here as well so this cell does not rely on names that only exist
# because an earlier cell happened to import them.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Defined locally (same columns as the other modeling cells) so the cell
# only depends on `data` being loaded.
feat_cols = ["Age", "Height", "Weight", "BMI", "Cervical"]

X_clf = data[feat_cols]
# Binary target: 1 where the OSA column equals "Severe", 0 otherwise.
y_clf = (data["OSA"] == "Severe").astype(int)

cv_clf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

clf_models = {
    # The SVM is wrapped in a pipeline with a scaler.
    # NOTE(review): probability=True is kept from the original; the metrics
    # requested below may not need predict_proba - confirm before removing.
    "SVM_RBF": Pipeline([
        ("scaler", StandardScaler()),
        ("model", SVC(kernel="rbf", C=10, probability=True))
    ]),
    "RF": RandomForestClassifier(
        n_estimators=500,
        max_depth=None,
        random_state=42
    )
}

print("\n=== NONLINEAR CLASSIFICATION ===")
for name, model in clf_models.items():
    scores = cross_validate(
        model, X_clf, y_clf, cv=cv_clf,
        scoring=["accuracy", "f1", "roc_auc"]
    )
    # Report the mean of each metric across the folds.
    print(f"\n{name}")
    print(" Acc:", scores["test_accuracy"].mean())
    print(" F1 :", scores["test_f1"].mean())
    print(" AUC:", scores["test_roc_auc"].mean())


In [None]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Compare the scaled logistic-regression classifier across nested feature
# subsets, using the same stratified 5-fold split for each subset.
y_clf = data["OSA"].eq("Severe").astype(int)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

feature_sets = dict(
    BMI_only=["BMI"],
    BMI_Cervical=["BMI", "Cervical"],
    All=["Age", "Height", "Weight", "BMI", "Cervical"],
)

for name, cols in feature_sets.items():
    X = data.loc[:, cols]
    # A fresh pipeline per subset so no fitted state is shared between runs.
    pipe = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=2000)),
    ])
    scores = cross_validate(
        pipe, X, y_clf,
        cv=cv,
        scoring=["accuracy", "f1", "roc_auc"],
    )

    print(f"\n{name} ({cols})")
    for label, metric in ((" Acc:", "accuracy"), (" F1 :", "f1"), (" AUC:", "roc_auc")):
        print(label, scores[f"test_{metric}"].mean())


In [None]:
!pip install seaborn


In [None]:
# Print the column labels of `data` for reference before the plotting cells.
print(data.columns)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Box plot of the IAH column grouped by the OSA column; the figure is also
# written to a PNG file in the working directory before being shown.
ax = sns.boxplot(data=data, x="OSA", y="IAH")
ax.set_xlabel("OSA Severity")
ax.set_ylabel("AHI")
plt.tight_layout()
plt.savefig("plot_iah_box_by_class.png")
plt.show()


In [None]:
# Imported here so the cell does not depend on the previous plotting cell
# having been run first (the other plotting cells import these as well).
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise scatter/distribution grid for BMI, Cervical and IAH; the figure
# is saved to a PNG file in the working directory before being shown.
sns.pairplot(data[["BMI", "Cervical", "IAH"]])
plt.savefig("plot_pairplot_bmi_cervical_iah.png")
plt.show()

In [None]:
from sklearn.model_selection import KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Predictors and target for the regression comparison. IAH is the target,
# so it must NOT appear among the feature columns.
X = data.loc[:, ["Age", "Height", "Weight", "BMI", "Cervical"]]
y = data["IAH"]

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Short metric names mapped to scikit-learn scorer strings; cross_validate
# reports each one under the key "test_<name>".
scoring = dict(
    MAE="neg_mean_absolute_error",
    RMSE="neg_root_mean_squared_error",
    R2="r2",
)

linear_pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", LinearRegression()),
])
svr_pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", SVR(kernel="rbf")),
])
forest = RandomForestRegressor(n_estimators=200, random_state=42)

models = {
    "Linear Regression": linear_pipe,
    "SVR (RBF)": svr_pipe,
    "Random Forest": forest,
}


def _mean_cv_metrics(estimator):
    """Cross-validate `estimator` on (X, y) and return the fold-averaged metrics.

    The error scorers are negated ("neg_..."), so their sign is flipped back.
    """
    folds = cross_validate(estimator, X, y, cv=cv, scoring=scoring)
    return {
        "MAE": -np.mean(folds["test_MAE"]),
        "RMSE": -np.mean(folds["test_RMSE"]),
        "R2": np.mean(folds["test_R2"]),
    }


results = {}
for name, model in models.items():
    results[name] = _mean_cv_metrics(model)

results


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter plot of IAH against Cervical; the figure is saved to a PNG file
# in the working directory before being shown.
sns.scatterplot(data=data, x="Cervical", y="IAH")
plt.tight_layout()
plt.savefig("plot_cervical_vs_iah.png")
plt.show()
