# IMPORT LIBRARIES 


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
import pickle



# LOAD THE DATA 



In [7]:
file_path = "employee_daily_diagnose.csv.xlsx"  # make sure this is in your working directory
df = pd.read_excel(file_path)
df_clean = df.copy()

# Convert health parameters to float
numeric_columns = ['bmi', 'LDL', 'VLDL', 'HDlCholestrol', 'sugar_r', 'chol', 'tg']
df_clean[numeric_columns] = df_clean[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Extract systolic and diastolic from bp column
df_clean[['systolic_bp', 'diastolic_bp']] = df_clean['bp'].str.extract(r'(\d+)[^\d]+(\d+)', expand=True).astype(float)

df_clean.drop(columns=['bp'], inplace=True)

# STEP 4: CREATE TARGET LABELS
def classify_diabetes(sugar_r):
    if pd.isna(sugar_r):
        return np.nan
    elif sugar_r < 140:
        return 'Normal'
    elif 140 <= sugar_r < 200:
        return 'Pre-Diabetic'
    else:
        return 'Diabetic'

def classify_bp(sys, dia):
    if pd.isna(sys) or pd.isna(dia):
        return np.nan
    elif sys < 120 and dia < 80:
        return 'Normal'
    elif (120 <= sys < 140) or (80 <= dia < 90):
        return 'Pre-Hypertensive'
    else:
        return 'Hypertensive'

df_clean['diabetes_status'] = df_clean['sugar_r'].apply(classify_diabetes)
df_clean['bp_status'] = df_clean.apply(lambda row: classify_bp(row['systolic_bp'], row['diastolic_bp']), axis=1)

# STEP 5: DROP ROWS WITH MISSING TARGETS

df_model = df_clean.dropna(subset=['diabetes_status', 'bp_status'])

# STEP 6: DEFINE FEATURES AND LABELS

features = ['blood_group', 'year', 'plant', 'department', 'age', 'sex',
            'bmi', 'systolic_bp', 'diastolic_bp', 'sugar_r', 'chol', 'tg', 'LDL', 'VLDL']
X = df_model[features]
y_diabetes = df_model['diabetes_status']
y_bp = df_model['bp_status']

# STEP 7: ENCODING CATEGORICAL FEATURES
categorical_cols = ['blood_group', 'plant', 'department', 'sex']
encoders = {}

for col in categorical_cols:
    enc = LabelEncoder()
    X[col] = enc.fit_transform(X[col].astype(str))
    encoders[col] = enc

 #STEP 8: HANDLE MISSING VALUES
imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)


# STEP 9: SPLIT DATA
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_imputed, y_diabetes, test_size=0.2, random_state=42)
X_train_bp, X_test_bp, y_train_bp, y_test_bp = train_test_split(X_imputed, y_bp, test_size=0.2, random_state=42)

# STEP 10: TRAIN RANDOM FOREST MODELS
rf_diabetes = RandomForestClassifier(random_state=42)
rf_diabetes.fit(X_train_d, y_train_d)

rf_bp = RandomForestClassifier(random_state=42)

rf_bp.fit(X_train_bp, y_train_bp)
# STEP 11: EVALUATE
print(" Diabetes Model Accuracy:", rf_diabetes.score(X_test_d, y_test_d))
print(" BP Model Accuracy:", rf_bp.score(X_test_bp, y_test_bp))

# STEP 12: SAVE MODELS & PREPROCESSORS
with open("model_diabetes.pkl", "wb") as f:
    pickle.dump(rf_diabetes, f)

with open("model_bp.pkl", "wb") as f:
    pickle.dump(rf_bp, f)

with open("imputer.pkl", "wb") as f:
    pickle.dump(imputer, f)

with open("label_encoders.pkl", "wb") as f:
    pickle.dump(encoders, f)







A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = enc.fit_transform(X[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = enc.fit_transform(X[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = enc.fit_transform(X[col].astype(str))
A value is trying to be set on a copy of a slice from a DataFram

 Diabetes Model Accuracy: 0.9998075442648191
 BP Model Accuracy: 0.9984603541185527
