In [None]:
# Work 29: Logistic Regression Analysis on Mortality Using CCI and HFRS, Adjusting for Demographics
# [29.LRM.2.Mortality.CCI.HFRS.Adjusted.V2.ipynb]

# "This notebook performs logistic regression analysis on mortality using CCI and HFRS, 
#  adjusting for demographics, and saves the processed data."

########################################################################################################
#  Sequence list
########################################################################################################
# 1: Load the Data: We load the dataset using pd.read_csv().
# 2: Ensure 'Potilas_ID' is the same type in all dataframes.
# 3: Ensure Columns are Numeric: Convert the CCI and HFRS columns to numeric values, coercing errors to handle any non-numeric data.
# 4: Drop Missing Values: Remove any rows that have missing values in either the CCI or HFRS columns to ensure the regression analysis is accurate.
# 5: Merge the demographic data to add gender, birth and death information.
# 6: Create a new column indicating whether an individual is living or deceased.
# 7: Define independent variables and the target variable.
# 8: Combine X and y to save as one dataframe.
# 9: Save the processed data for manual inspection.

########################################################################################################
########################################################################################################

import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 1: Load data
data_path = '/home/work/pp_all_data_with_cci.csv'
demo_path = '/home/work/demographicd.csv'
processed_data_path = '/home/work/processed_data_v6.csv'

# Every column of the clinical file is read as text.
data = pd.read_csv(data_path, dtype=str)
# Read the patient ID as text in the demographic file as well. Without an
# explicit dtype pandas infers a numeric type for the column, which drops
# leading zeros and turns IDs into floats ("123.0") when any value is missing,
# so casting to str afterwards no longer reproduces the keys used in `data`.
demo_df = pd.read_csv(demo_path, sep='|', dtype={'Potilas_ID': str})

print("1: Data loaded successfully.")

# 2: Ensure 'Potilas_ID' is the same type in all dataframes
# Strip surrounding whitespace so padded fields still compare equal as merge keys.
data['Potilas_ID'] = data['Potilas_ID'].astype(str).str.strip()
demo_df['Potilas_ID'] = demo_df['Potilas_ID'].astype(str).str.strip()

print("2: 'Potilas_ID' is now the same type in all dataframes.")

# 3: Ensure columns are numeric
# Values that cannot be parsed as numbers become NaN (errors='coerce').
for score_col in ('CCI', 'HFRS'):
    data[score_col] = pd.to_numeric(data[score_col], errors='coerce')
demo_df['Syntymävuosi'] = pd.to_numeric(demo_df['Syntymävuosi'], errors='coerce')

print("3: Columns are numeric.")

# 4: Drop rows with missing values in CCI or HFRS
# Keep only the rows where both scores are present.
has_both_scores = data[['CCI', 'HFRS']].notna().all(axis=1)
data = data[has_both_scores]

print("4: Missing values dropped.")

# 5: Merge demo_df to add gender, birth and death information
demo_cols = ['Potilas_ID', 'Syntymävuosi', 'Sukupuoli', 'Kuolinvuosi']

# Keep a single demographic record per patient; repeated IDs in demo_df would
# otherwise multiply the matching rows of `data` in the merge.
demo_unique = demo_df[demo_cols].drop_duplicates(subset='Potilas_ID')
n_duplicate_demo = len(demo_df) - len(demo_unique)
if n_duplicate_demo:
    print(f"5: Ignored {n_duplicate_demo} duplicate demographic records (kept the first per Potilas_ID).")

# The indicator column records which rows found a demographic match.
data = data.merge(demo_unique, on='Potilas_ID', how='left', indicator='_demo_match')

# Rows without a demographic record have no death year at all. Left in place
# they would be indistinguishable from living patients in step 6, so they are
# removed here and the count is reported.
no_demo_record = data['_demo_match'] == 'left_only'
if no_demo_record.any():
    print(f"5: Dropped {int(no_demo_record.sum())} rows without a matching demographic record.")
data = data.loc[~no_demo_record].drop(columns='_demo_match').reset_index(drop=True)

print("5: Gender and demographic information merged successfully.")

# 6: Create a new column indicating whether an individual is living or deceased
# 1 when a death year is recorded, 0 otherwise.
data['Kuollut'] = data['Kuolinvuosi'].notna().astype(int)

print("6: Created mortality indicator.")

# 7: Define independent variables and the target variable
predictor_cols = ['CCI', 'HFRS', 'Syntymävuosi', 'Sukupuoli']
X = data[predictor_cols]
# Convert the categorical gender column to indicator variables, dropping the
# first level as the reference category. dtype=int pins the indicators to 0/1:
# the default dtype differs between pandas versions (bool from 2.0), and bool
# columns are written as True/False to the CSV and make the frame non-numeric
# when it is converted to an array for model fitting.
X = pd.get_dummies(X, columns=['Sukupuoli'], drop_first=True, dtype=int)
y = data['Kuollut']

# 8: Combine X and y to save as one dataframe
processed_data = pd.concat([X, y], axis=1)

# 9: Save the processed data for manual inspection
processed_data.to_csv(processed_data_path, index=False)

print(f"9: Processed data saved to {processed_data_path}")


