# In [None]:
# Work 28: Logistic Regression Analysis on Mortality: Adjusting for BMI, CCI, HFRS, Birth Year, and Gender:
# [W28.LRM.1.Mortality.BMI.CCI.HFRS.Adjusted.V1.ipynb]

# "This notebook performs logistic regression analysis on mortality, adjusting for BMI, CCI,
#  HFRS, birth year, and gender, with separate models for men and women."

########################################################################################################
#  Sequence list
########################################################################################################
# 1: Load the Data: We load the three datasets (clinical data with CCI/HFRS, demographics, and BMI) using pd.read_csv().
# 2: Ensure 'Potilas_ID' is the same type in all dataframes.
# 3: Ensure Columns are Numeric: Convert the CCI and HFRS columns to numeric values, coercing errors to handle any non-numeric data.
# 4: Drop Missing Values: Remove any rows that have missing values in either the CCI or HFRS columns so that the regression is fitted on complete data.
# 5: Merge the demographic data (birth year, gender, and death year) and the BMI data into the main dataframe.
# 6: Create a new column indicating whether an individual is living or deceased.
# 7: Drop rows with missing values in BMI.
# 8: Define independent variables and the target variable.
# 9: Check if there is enough data to split.
# 10: Split the data into training and testing sets.
# 11: Fit a logistic regression model.
# 12: Make predictions on the test set.
# 13: Print classification report.
# 14: Perform logistic regression for men and women separately.

########################################################################################################
########################################################################################################

import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 1: Load data
# Three extracts are read: the clinical table carrying CCI and HFRS, the
# pipe-separated demographic table, and the BMI table.
data_path = "/home/work/pp_all_data_with_cci.csv"
demo_path = "/home/work/demographicd.csv"
bmi_path = "/home/work/BMI_combined.csv"

data = pd.read_csv(data_path, dtype=str)
# The join key is read as text here too. Letting pandas infer it would turn
# IDs into ints/floats (losing leading zeros, or yielding "123.0" when the
# column has gaps), and the later astype(str) could no longer match the
# string IDs of the other two tables. The remaining demographic columns keep
# their inferred dtypes.
demo_df = pd.read_csv(demo_path, sep="|", dtype={"Potilas_ID": str})
bmi_df = pd.read_csv(bmi_path, dtype=str)

print("1: Data loaded successfully.")

# 2: Ensure 'Potilas_ID' is the same type in all dataframes
data["Potilas_ID"] = data["Potilas_ID"].astype(str)
demo_df["Potilas_ID"] = demo_df["Potilas_ID"].astype(str)
bmi_df["Potilas_ID"] = bmi_df["Potilas_ID"].astype(str)

print("2: 'Potilas_ID' is now the same type in all dataframes.")

# 3: Ensure columns are numeric
# Unparseable entries become NaN and are removed by the dropna steps below.
data["CCI"] = pd.to_numeric(data["CCI"], errors="coerce")
data["HFRS"] = pd.to_numeric(data["HFRS"], errors="coerce")
bmi_df["BMI"] = pd.to_numeric(bmi_df["BMI"], errors="coerce")
# Birth year is used as a continuous covariate. If it stayed a text column,
# pd.get_dummies in step 8 would expand it into one indicator per year.
demo_df["Syntymävuosi"] = pd.to_numeric(demo_df["Syntymävuosi"], errors="coerce")

print("3: Columns are numeric.")

# 4: Drop rows with missing values in CCI or HFRS
data = data.dropna(subset=["CCI", "HFRS"])

print("4: Missing values dropped.")

# 5: Merge demo_df and bmi_df to add birth year, gender, death year and BMI
# NOTE(review): both merges are left joins on Potilas_ID. If either lookup
# table holds more than one row per patient, the patient's rows are repeated
# once per match - confirm the files have one row per Potilas_ID.
data = data.merge(
    demo_df[["Potilas_ID", "Syntymävuosi", "Sukupuoli", "Kuolinvuosi"]], on="Potilas_ID", how="left"
)
data = data.merge(bmi_df[["Potilas_ID", "BMI"]], on="Potilas_ID", how="left")

print("5: Gender and BMI information merged successfully.")

# 6: Create a new column indicating whether an individual is living or deceased
# A present death year means deceased (1); a missing one means living (0).
# Rows without any demographic match also have a missing death year; they are
# removed in step 7 so they are not counted as living.
data["Kuollut"] = data["Kuolinvuosi"].notna().astype(int)

print("6: Created mortality indicator.")

# 7: Drop rows with missing values in BMI
data = data.dropna(subset=["BMI"])
# Also drop rows lacking birth year or gender (e.g. no demographic match):
# the logistic model cannot be fitted with missing covariates, and a missing
# gender would otherwise be encoded as the reference category in step 8.
data = data.dropna(subset=["Syntymävuosi", "Sukupuoli"])

print("7: Missing BMI values dropped.")

# 8: Define independent variables and the target variable
X = data[["BMI", "CCI", "HFRS", "Syntymävuosi", "Sukupuoli"]]
# Convert the categorical gender column to indicator variables. dtype=float
# keeps the design matrix purely numeric; newer pandas versions default to
# boolean indicators, which statsmodels rejects when mixed with float columns.
X = pd.get_dummies(X, drop_first=True, dtype=float)
y = data["Kuollut"]

print("8: Defined independent variables and target variable.")

# 9: Check if there is enough data to split
# Only rows with every covariate present are modelled, because the logistic
# model cannot be fitted with missing values. The cast to float guards against
# boolean indicator columns, which statsmodels cannot combine with float
# columns in one design matrix.
complete_rows = X.notna().all(axis=1)
X_model = X.loc[complete_rows].astype(float)
y_model = y.loc[complete_rows]

if len(X_model) > 1:
    # 10: Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_model, y_model, test_size=0.2, random_state=42
    )

    print("9: Split the data into training and testing sets.")

    if y_train.nunique() < 2:
        # A logistic regression needs both outcomes in the training data.
        print("11: Training data contains a single outcome class; overall model skipped.")
    else:
        # 11: Fit a logistic regression model
        # has_constant="add" forces the intercept column onto both splits.
        # With the default ("skip"), a split that happens to contain a
        # non-zero constant column would get no intercept, leaving train and
        # test with different columns.
        model = sm.Logit(y_train, sm.add_constant(X_train, has_constant="add")).fit()

        print(model.summary())

        # 12: Make predictions on the test set
        y_pred = model.predict(sm.add_constant(X_test, has_constant="add"))
        # Predicted probabilities above 0.5 are classified as deceased.
        y_pred_class = (y_pred > 0.5).astype(int)

        # 13: Print classification report
        print(classification_report(y_test, y_pred_class))

        print("10-13: Logistic regression model fitted and evaluated.")

    # 14: Perform logistic regression for men and women separately
    for gender in ["Mies", "Nainen"]:
        data_gender = data[data["Sukupuoli"] == gender]
        # Gender is constant within each subset, so it is left out of the model.
        X_gender = data_gender[["BMI", "CCI", "HFRS", "Syntymävuosi"]].apply(
            pd.to_numeric, errors="coerce"
        )
        usable_rows = X_gender.notna().all(axis=1)
        X_gender = X_gender.loc[usable_rows]
        y_gender = data_gender.loc[usable_rows, "Kuollut"]

        if len(X_gender) <= 1:
            # Report the skip instead of passing over the subset silently.
            print(f"\nNot enough data to fit a model for {gender}; skipped.")
            continue

        # The intercept is added before splitting so that the training and
        # testing parts always share the same columns.
        X_gender = sm.add_constant(X_gender, has_constant="add")

        X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(
            X_gender, y_gender, test_size=0.2, random_state=42
        )

        if y_train_g.nunique() < 2:
            print(f"\nTraining data for {gender} contains a single outcome class; skipped.")
            continue

        model_gender = sm.Logit(y_train_g, X_train_g).fit()

        print(f"\nLogistic Regression Results for {gender}:")
        print(model_gender.summary())

        y_pred_g = model_gender.predict(X_test_g)
        y_pred_class_g = (y_pred_g > 0.5).astype(int)

        print(classification_report(y_test_g, y_pred_class_g))

    print("14: Logistic regression for men and women separately completed.")
else:
    # Labelled with step 9 because this is the data-sufficiency check failing.
    print("9: Not enough data to split into training and testing sets.")

########################################################################################################
########################################################################################################

# 1: Data loaded successfully.
#
# 2: 'Potilas_ID' is now the same type in all dataframes.
#
# 3: Columns are numeric.
#
# 4: Missing values dropped.
#
# 5: Gender and BMI information merged successfully.
#
# 6: Created mortality indicator.
#
# 7: Missing BMI values dropped.
#
# 8: Defined independent variables and target variable.
#
# Not enough data to split into training and testing sets.