In [1]:
# Simplified Training Script for COPD and ALT Models
import pandas as pd
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
import lightgbm as lgb

print("Loading and preprocessing data...")

# Location of the raw patient table. The original absolute path remains the
# default so existing runs behave the same; set the PATIENT_DATA_CSV
# environment variable to run the notebook from another machine or checkout.
DATA_PATH = os.environ.get(
    'PATIENT_DATA_CSV',
    '/home/neo/Downloads/CODE_other_models/Loka/clinical-assistant/data/patient_data.csv',
)

# Named constants for the ALT rescaling rule applied below.
ALT_COLUMN = 'alanine_aminotransferase'
ALT_RESCALE_THRESHOLD = 50
ALT_RESCALE_DIVISOR = 1000

# Load data
df = pd.read_csv(DATA_PATH)

# Apply ALT transformation: readings above the threshold are treated as being
# recorded in different units and are divided by ALT_RESCALE_DIVISOR.
# NOTE(review): the threshold and divisor are assumptions carried over from
# the original script - confirm them against the data dictionary.
condition = df[ALT_COLUMN] > ALT_RESCALE_THRESHOLD
df.loc[condition, ALT_COLUMN] = df.loc[condition, ALT_COLUMN] / ALT_RESCALE_DIVISOR

# Report the number of missing values per column. Nothing is removed here;
# this is only the cell's displayed summary.
df.isna().sum()


Loading and preprocessing data...


patient_id                                  0
age                                         0
sex                                         0
bmi                                         0
smoker                                      0
diagnosis_code                              0
medication_count                            0
days_hospitalized                           0
readmitted                                  0
last_lab_glucose                            0
exercise_frequency                       2012
diet_quality                                0
income_bracket                              0
education_level                          1040
urban                                       0
albumin_globulin_ratio                      0
chronic_obstructive_pulmonary_disease       0
alanine_aminotransferase                    0
dtype: int64

In [2]:
# Fraction of missing values in each column (0.0 - 1.0).
nan_percentage = df.isnull().mean()

# Columns whose missing-value fraction exceeds this limit are dropped.
MAX_NAN_FRACTION = 0.15

# Drop the sparse columns in a single step. Only columns are removed, so the
# columns that remain may still contain NaN values in individual rows.
df_cleaned = df.drop(columns=df.columns[nan_percentage > MAX_NAN_FRACTION])

In [3]:
# Show the dtype of every remaining column; the object-typed feature columns
# are the ones selected for one-hot encoding in the training cell.
df_cleaned.dtypes

patient_id                                object
age                                        int64
sex                                       object
bmi                                      float64
smoker                                    object
diagnosis_code                            object
medication_count                           int64
days_hospitalized                          int64
readmitted                                 int64
last_lab_glucose                         float64
diet_quality                              object
income_bracket                            object
education_level                           object
urban                                      int64
albumin_globulin_ratio                   float64
chronic_obstructive_pulmonary_disease     object
alanine_aminotransferase                 float64
dtype: object

In [None]:

# Separate the model inputs from the identifier and the two prediction targets
features = df_cleaned.drop(
    columns=[
        'patient_id',
        'chronic_obstructive_pulmonary_disease',
        'alanine_aminotransferase',
    ]
)
y_copd = df_cleaned['chronic_obstructive_pulmonary_disease']
y_alt = df_cleaned['alanine_aminotransferase']

# One-hot encode every object-typed column, dropping the first level of each
categorical_cols = features.select_dtypes(include=['object']).columns
features_encoded = pd.get_dummies(
    features,
    columns=categorical_cols,
    drop_first=True,
)

# Map the COPD text labels to integer class ids; `le` keeps the mapping
le = LabelEncoder().fit(y_copd)
y_copd_encoded = le.transform(y_copd)

print(f"Dataset shape: {features_encoded.shape}")
print(f"Features: {features_encoded.shape[1]} total features")

# One shared 80/20 split so both targets are evaluated on the same rows
(
    X_train, X_test,
    y_copd_train, y_copd_test,
    y_alt_train, y_alt_test,
) = train_test_split(
    features_encoded,
    y_copd_encoded,
    y_alt,
    test_size=0.2,
    random_state=42,
)

print("\n=== Training COPD Classifier ===")

# A preliminary LightGBM fit on the training split is used only to rank features
prelim_copd_model = lgb.LGBMClassifier(random_state=42, verbose=-1)
prelim_copd_model.fit(X_train, y_copd_train)

# Keep the 10 features with the highest LightGBM importance
feature_importance = pd.Series(
    data=prelim_copd_model.feature_importances_,
    index=X_train.columns,
)
top_copd_features = list(feature_importance.nlargest(10).index)
print(f"Selected {len(top_copd_features)} features for COPD model")

# Restrict both splits to the selected columns
X_train_copd = X_train.loc[:, top_copd_features]
X_test_copd = X_test.loc[:, top_copd_features]

# Final model: random forest trained on the reduced feature set
copd_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
copd_classifier.fit(X_train_copd, y_copd_train)

# Score on the held-out split.
# NOTE(review): the recorded run scores about 0.27 on four roughly balanced
# classes, which is close to chance - the features may carry little signal.
y_pred_copd = copd_classifier.predict(X_test_copd)
copd_accuracy = accuracy_score(y_copd_test, y_pred_copd)
print(f"COPD Classifier Accuracy: {copd_accuracy:.4f}")
print("\nCOPD Classification Report:")
copd_report = classification_report(
    y_copd_test,
    y_pred_copd,
    target_names=le.classes_,
)
print(copd_report)

print("\n=== Training ALT Regressor ===")

# Rank features for ALT independently of the COPD selection
prelim_alt_model = lgb.LGBMRegressor(random_state=42, verbose=-1)
prelim_alt_model.fit(X_train, y_alt_train)

# Keep the 10 features with the highest LightGBM importance
feature_importance_alt = pd.Series(
    data=prelim_alt_model.feature_importances_,
    index=X_train.columns,
)
top_alt_features = list(feature_importance_alt.nlargest(10).index)
print(f"Selected {len(top_alt_features)} features for ALT model")

# Restrict both splits to the selected columns
X_train_alt = X_train.loc[:, top_alt_features]
X_test_alt = X_test.loc[:, top_alt_features]

# Final model: random forest regressor trained on the reduced feature set
alt_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
alt_regressor.fit(X_train_alt, y_alt_train)

# Score on the held-out split.
# NOTE(review): the recorded run reports R^2 of about 0.9995; a score this
# high is worth checking for target leakage or an artefact of the ALT rescaling.
y_pred_alt = alt_regressor.predict(X_test_alt)
alt_mse = mean_squared_error(y_alt_test, y_pred_alt)
alt_r2 = r2_score(y_alt_test, y_pred_alt)
print(f"ALT Regressor MSE: {alt_mse:.4f}")
print(f"ALT Regressor R²: {alt_r2:.4f}")

print("\n=== Saving Models ===")

# Single destination for every artefact written below.
MODEL_DIR = '/home/neo/Downloads/CODE_other_models/Loka/clinical-assistant/app/models/saved_models'

# Create the directory that is actually written to. The previous code created
# a relative 'saved_models' folder in the working directory instead, so the
# dumps failed whenever the real target directory did not already exist.
os.makedirs(MODEL_DIR, exist_ok=True)

# File name -> object. The first two entries are the trained models; the rest
# are the preprocessing objects needed to make predictions on new data
# (label encoder, selected feature lists, full encoded column order).
artifacts = {
    'copd_classifier.joblib': copd_classifier,
    'alt_regressor.joblib': alt_regressor,
    'copd_label_encoder.joblib': le,
    'top_copd_features.joblib': top_copd_features,
    'top_alt_features.joblib': top_alt_features,
    'encoded_columns.joblib': features_encoded.columns.tolist(),
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(MODEL_DIR, filename))

print("✓ COPD classifier saved")
print("✓ ALT regressor saved")
print("✓ Preprocessing objects saved")
print("\nTraining complete! Models saved to 'saved_models/' directory")

Dataset shape: (10000, 20)
Features: 20 total features

=== Training COPD Classifier ===
Selected 10 features for COPD model
COPD Classifier Accuracy: 0.2730

COPD Classification Report:
              precision    recall  f1-score   support

           A       0.27      0.32      0.30       490
           B       0.28      0.27      0.28       519
           C       0.28      0.28      0.28       503
           D       0.26      0.22      0.24       488

    accuracy                           0.27      2000
   macro avg       0.27      0.27      0.27      2000
weighted avg       0.27      0.27      0.27      2000


=== Training ALT Regressor ===
Selected 10 features for ALT model
ALT Regressor MSE: 0.0128
ALT Regressor R²: 0.9995

=== Saving Models ===
✓ COPD classifier saved
✓ ALT regressor saved
✓ Preprocessing objects saved

Training complete! Models saved to 'saved_models/' directory
