In [97]:
import pandas as pd

# Load the dataset into the module-level frame `df`; the path is relative,
# so the CSV is resolved against the current working directory.
df = pd.read_csv("final_named_and_cleaned_data_0_5.csv")

In [98]:
# Remove three groups of columns from `df` in place, one group per call.
# NOTE(review): the ekg*/c* triples look like day/month/year date parts — confirm.
for unused_columns in (
    ["id", "ccf"],
    ["ekgday", "ekgmo", "ekgyr"],
    ["cmo", "cday", "cyr"],
):
    df.drop(columns=unused_columns, inplace=True)

In [99]:
# Median imputation: every NaN in the columns listed below is replaced by
# the median of that same column.

num_cols = [
    "trestbps",
    "chol",
    "thaltime",
    "met",
    "thaldur",
    "thalach",
    "thalrest",
    "tpeakbps",
    "trestbpd",
    "tpeakbpd",
    "36",  # NOTE(review): column is literally named "36"; meaning not visible here
    "oldpeak",
]

# One median per listed column; fillna matches them to columns by label.
numeric_medians = df[num_cols].median()
df[num_cols] = df[num_cols].fillna(numeric_medians)

In [100]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


# Predictor columns shared by both model-based imputers below.
IMPUTATION_FEATURES = [
    "age",
    "sex",
    "chol",
    "trestbps",
    "dm",
    "thaldur",
    "thaltime",
    "met",
    "thalach",
    "thalrest",
    "tpeakbps",
    "trestbpd",
    "tpeakbpd",
    "36",
    "oldpeak",
]


def _impute_with_classifier(columns, build_model):
    """Fill the NaNs of each column in the global ``df`` with classifier predictions.

    For every column a fresh estimator obtained from ``build_model()`` is
    fitted on the rows where the column is present (predictors:
    ``IMPUTATION_FEATURES``), and its predictions are written into the rows
    where the column is NaN. Columns without NaNs are reported and skipped.

    Args:
        columns: iterable of column names of ``df`` to impute.
        build_model: zero-argument callable returning an unfitted estimator
            exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        None. ``df`` is modified in place.
    """
    for column in columns:
        # Compute the mask once so the training rows, the prediction rows and
        # the final assignment all refer to exactly the same split.
        missing = df[column].isna()
        if not missing.any():
            print(f"No missing values in {column}. Skipping.")
            continue

        # NOTE(review): predictor values are passed through unchecked; an
        # estimator that rejects NaN inputs will raise here — confirm the
        # predictors are complete before calling.
        model = build_model()
        model.fit(df.loc[~missing, IMPUTATION_FEATURES], df.loc[~missing, column])
        df.loc[missing, column] = model.predict(df.loc[missing, IMPUTATION_FEATURES])


def fill_data_rf(colmuns, random_state=0):
    """Impute missing values of the given columns with a random forest classifier.

    Args:
        colmuns: iterable of column names of the global ``df`` to impute
            (parameter name kept as-is for backward compatibility).
        random_state: seed forwarded to ``RandomForestClassifier`` so that
            repeated runs produce the same imputed values.

    Returns:
        None. ``df`` is modified in place.
    """
    _impute_with_classifier(
        colmuns,
        lambda: RandomForestClassifier(random_state=random_state),
    )


def fill_data_logistic(colmuns):
    """Impute missing values of the given columns with logistic regression.

    Args:
        colmuns: iterable of column names of the global ``df`` to impute
            (parameter name kept as-is for backward compatibility).

    Returns:
        None. ``df`` is modified in place.
    """
    _impute_with_classifier(
        colmuns,
        lambda: LogisticRegression(solver="newton-cg", max_iter=1000000),
    )

In [101]:
# Columns whose missing values are imputed by fill_data_logistic.
logistic_imputation_targets = [
    "painloc",
    "painexer",
    "relrest",
    "pncaden",
    "cp",
    "htn",
    "smoke",
    "fbs",
    "restecg",
    "dig",
    "prop",
    "nitr",
    "pro",
    "diuretic",
    "exang",
    "xhypo",
    "slope",
    "lmt",
    "ladprox",
    "laddist",
    "diag",
    "cxmain",
    "ramus",
    "om1",
    "om2",
    "rcaprox",
    "rcadist",
]

fill_data_logistic(logistic_imputation_targets)

In [102]:
# Rows with smoke == 1 and no cigarette count receive the median count
# observed among the smoke == 1 rows; all other rows are left untouched.
is_smoker = df["smoke"] == 1
cigs_missing = df["cigs"].isna()

median_cigs = df.loc[is_smoker & ~cigs_missing, "cigs"].median()
df.loc[is_smoker & cigs_missing, "cigs"] = median_cigs

In [103]:
# Each table is (brackets, top_points): brackets are (exclusive upper bound,
# points) pairs in ascending order; values at or above the last bound receive
# top_points. Point values are carried over unchanged.
# NOTE(review): verify the point values against the published table intended.
_AGE_POINTS_SEX_1 = (
    ((35, 0), (40, 2), (45, 5), (50, 6), (55, 8), (60, 10), (65, 11), (70, 12), (75, 14)),
    15,
)
_AGE_POINTS_SEX_OTHER = (
    ((35, 0), (40, 4), (45, 5), (50, 7), (55, 8), (60, 9), (65, 10), (70, 12), (75, 14)),
    16,
)
_CHOL_POINTS = (((160, 0), (200, 1), (240, 2)), 3)
_TRESTBPS_POINTS = (((120, 0), (130, 1), (140, 2)), 3)


def _bracket_points(value, table):
    """Return the points of the first bracket whose upper bound exceeds ``value``."""
    brackets, top_points = table
    for upper_bound, bracket_points in brackets:
        if value < upper_bound:
            return bracket_points
    # Also reached for NaN, which compares False against every bound.
    return top_points


def calculate_framingham_score(row):
    """Compute an additive risk-point score for one record.

    The score is the sum of age points (table selected by ``sex``),
    cholesterol points, resting blood pressure points, 2 points when
    ``smoke == 1`` and 3 points when ``dm == 1``.

    Age brackets are half-open intervals, so every age maps to a bracket:
    ages below 35 (including ages under 20) score 0 and fractional ages such
    as 34.5 stay in the lower bracket instead of falling through to the
    maximum age points.

    Args:
        row: mapping-like record (e.g. a DataFrame row or dict) providing
            ``sex``, ``age``, ``chol``, ``smoke``, ``trestbps`` and ``dm``.
            NOTE(review): ``sex == 1`` presumably encodes male — confirm
            against the dataset's coding.

    Returns:
        int: total number of points.
    """
    age_table = _AGE_POINTS_SEX_1 if row["sex"] == 1 else _AGE_POINTS_SEX_OTHER

    points = _bracket_points(row["age"], age_table)
    points += _bracket_points(row["chol"], _CHOL_POINTS)
    points += _bracket_points(row["trestbps"], _TRESTBPS_POINTS)

    if row["smoke"] == 1:
        points += 2
    if row["dm"] == 1:
        points += 3

    return points

In [104]:
# axis=1 hands each row of `df` to calculate_framingham_score; the returned
# points are stored in the new "framingham_score" column.
df["framingham_score"] = df.apply(calculate_framingham_score, axis=1)