In [3]:
# Install necessary packages
#!pip install symspellpy xgboost imbalanced-learn nltk pandas scikit-learn optuna --quiet

# Import necessary libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from symspellpy import SymSpell
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as imPipeline
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import optuna

# Download required NLTK data
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('words')

# Initialize SymSpell object
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = nltk.data.find('corpora/words').path + '/en'
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1, separator='\t')

# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Define text preprocessing functions
def correct_spelling(text):
    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
    return suggestions[0].term if suggestions else text

def preprocess_text(text):
    if pd.isnull(text):
        return ""
    tokens = nltk.word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    processed_text = ' '.join(tokens)
    processed_text = correct_spelling(processed_text)
    return processed_text

# Load and preprocess dataset
df = pd.read_csv(r"C:\Users\HP\Downloads\targetfinaldataset1 (1).csv")
df = df.drop(df.columns[:3], axis=1)
df = df.drop(columns=['AdmissionStartDate', 'PatientDateOfBirth', 'PrimaryDiagnosisCode'])
df['PrimaryDiagnosisDescription'] = df['PrimaryDiagnosisDescription'].fillna('')

# Split the dataset into features and target
X = df.drop('RiskScore', axis=1)
y = df['RiskScore']

# Identify categorical, numerical, and text features
text_feature = 'PrimaryDiagnosisDescription'
categorical_features = X.select_dtypes(include=['object']).drop(columns=[text_feature]).columns.tolist()
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Define pipelines for different types of data
text_pipeline = imPipeline(steps=[
    ('preprocess', FunctionTransformer(lambda x: x.apply(preprocess_text))),
    ('tfidf', TfidfVectorizer(max_features=500))
])

numerical_pipeline = imPipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_pipeline = imPipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine all preprocessing steps
preprocessor = ColumnTransformer(transformers=[
    ('text', text_pipeline, text_feature),
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])

# Define the Optuna objective function
def objective(trial):
    xgb_params = {
        'n_estimators': trial.suggest_int('xgb_n_estimators', 50, 300),
        'max_depth': trial.suggest_int('xgb_max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 1e-4, 1e-1),
        'subsample': trial.suggest_float('xgb_subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('xgb_colsample_bytree', 0.5, 1.0)
    }

    xgb_model = XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        **xgb_params
    )

    pipeline = imPipeline(steps=[
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('undersample', RandomUnderSampler(random_state=42)),
        ('model', xgb_model)
    ])

    X_sub, _, y_sub, _ = train_test_split(X, y, test_size=0.8, random_state=42)
    X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(X_sub, y_sub, test_size=0.2, random_state=42)

    pipeline.fit(X_train_sub, y_train_sub)
    y_val_pred = pipeline.predict(X_val_sub)

    r2 = r2_score(y_val_sub, y_val_pred)
    return -r2

# Create an Optuna study and optimize it
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

best_trial = study.best_trial

print("Best R² score: ", -best_trial.value)
print("Best hyperparameters: ", best_trial.params)

xgb_best_params = {key.replace('xgb_', ''): value for key, value in best_trial.params.items()}

xgb_best = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    **xgb_best_params
)

pipeline_best = imPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('undersample', RandomUnderSampler(random_state=42)),
    ('model', xgb_best)
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline_best.fit(X_train, y_train)

y_test_pred_best = pipeline_best.predict(X_test)

mae_best = mean_absolute_error(y_test, y_test_pred_best)
mse_best = mean_squared_error(y_test, y_test_pred_best)
rmse_best = np.sqrt(mse_best)
r2_best = r2_score(y_test, y_test_pred_best)

print(f"Final MAE: {mae_best:.4f}")
print(f"Final MSE: {mse_best:.4f}")
print(f"Final RMSE: {rmse_best:.4f}")
print(f"Final R² Score: {r2_best:.4f}")



[nltk_data] Downloading package words to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[I 2024-08-18 14:38:11,684] A new study created in memory with name: no-name-ae19f9d8-dec5-440f-bc24-9283249e75d6
  'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 1e-4, 1e-1),
[I 2024-08-18 14:38:18,682] Trial 0 finished with value: -0.3315739035606384 and parameters: {'xgb_n_estimators': 113, 'xgb_max_depth': 5, 'xgb_learning_rate': 0.003333460688136266, 'xgb_subsample': 0.7544847871862848, 'xgb_colsample_bytree': 0.8466279221852107}. Best is trial 0 with value: -0.3315739035606384.
  'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 1e-4, 1e-1),
[I 2024-08-18 14:38:22,546] Trial 1 finished with value: -0.26663821935653687 and parameters: {'xgb_n_estimators': 131, 'xgb_max_depth': 4, 'xgb_learning_rate': 0.0023956413056890943, 'xgb_subsample': 0.8223411612282591, 'xgb_colsample_bytree': 0.5920150753118457}. Be

Best R² score:  0.7274439930915833
Best hyperparameters:  {'xgb_n_estimators': 271, 'xgb_max_depth': 5, 'xgb_learning_rate': 0.06722920125863578, 'xgb_subsample': 0.8345746700814414, 'xgb_colsample_bytree': 0.9369967529608694}
Final MAE: 0.2842
Final MSE: 0.1243
Final RMSE: 0.3526
Final R² Score: 0.7804

Sample Input:
      AdmissionID PatientGender  \
5852            2          Male   

                            PrimaryDiagnosisDescription  CBC_Lymphocytes  \
5852  Exanthema subitum [sixth disease] due to human...              1.3   

      CBC_Neutrophils  CBC_Basophils  CBC_Eosinophils  CBC_Hematocrit  \
5852              8.6            0.1              0.1            45.2   

      CBC_Hemoglobin  CBC_MCH  ...  METABOLIC_SODIUM  METABOLIC_TOTAL_PROTEIN  \
5852            18.1     37.8  ...             147.8                      9.4   

      URINALYSIS_PH  URINALYSIS_RED_BLOOD_CELLS  URINALYSIS_WHITE_BLOOD_CELLS  \
5852            7.4                         3.2                  

In [6]:
# Preprocess sample input and predict
def preprocess_sample(sample, preprocessor):
    # Transform sample using the preprocessor pipeline
    sample_preprocessed = preprocessor.transform(sample)
    return sample_preprocessed

sample_input = X_test.iloc[[0]]
sample_input_preprocessed = preprocess_sample(sample_input, preprocessor)
sample_prediction = xgb_best.predict(sample_input_preprocessed)

print("\nSample Input:")
print(sample_input)
print("\nSample Prediction:")
print(int(sample_prediction.round()))



Sample Input:
      AdmissionID PatientGender  \
5852            2          Male   

                            PrimaryDiagnosisDescription  CBC_Lymphocytes  \
5852  Exanthema subitum [sixth disease] due to human...              1.3   

      CBC_Neutrophils  CBC_Basophils  CBC_Eosinophils  CBC_Hematocrit  \
5852              8.6            0.1              0.1            45.2   

      CBC_Hemoglobin  CBC_MCH  ...  METABOLIC_SODIUM  METABOLIC_TOTAL_PROTEIN  \
5852            18.1     37.8  ...             147.8                      9.4   

      URINALYSIS_PH  URINALYSIS_RED_BLOOD_CELLS  URINALYSIS_WHITE_BLOOD_CELLS  \
5852            7.4                         3.2                           4.5   

      Smoke   BMI  Alcohol  Exercise  Age  
5852     No  28.3      Yes        No   30  

[1 rows x 36 columns]

Sample Prediction:
3


  print(int(sample_prediction.round()))


In [7]:
sample_input = X_test.iloc[[898]]
sample_input_preprocessed = preprocess_sample(sample_input, preprocessor)
sample_prediction = xgb_best.predict(sample_input_preprocessed)

print("\nSample Input:")
print(sample_input)
print("\nSample Prediction:")
print(int(sample_prediction.round()))


Sample Input:
       AdmissionID PatientGender           PrimaryDiagnosisDescription  \
15791            2        Female  Erythema infectiosum [fifth disease]   

       CBC_Lymphocytes  CBC_Neutrophils  CBC_Basophils  CBC_Eosinophils  \
15791              4.0              7.7            0.1              0.2   

       CBC_Hematocrit  CBC_Hemoglobin  CBC_MCH  ...  METABOLIC_SODIUM  \
15791            43.7            17.7     35.0  ...             145.8   

       METABOLIC_TOTAL_PROTEIN  URINALYSIS_PH  URINALYSIS_RED_BLOOD_CELLS  \
15791                      6.2            7.1                         1.2   

       URINALYSIS_WHITE_BLOOD_CELLS  Smoke   BMI  Alcohol  Exercise  Age  
15791                           3.9    Yes  18.7      Yes        No   36  

[1 rows x 36 columns]

Sample Prediction:
2


  print(int(sample_prediction.round()))


In [8]:
if int(sample_prediction.round())==y_test.iloc[898]:
    print("Prediction is correct")
else:
    print("Prediction is incorrect")
print(y_test.iloc[898])

Prediction is correct
2


  if int(sample_prediction.round())==y_test.iloc[898]:


In [11]:
def preprocess_text_column(column):
    return column.apply(preprocess_text)

# Update the text pipeline to use the named function
text_pipeline = imPipeline(steps=[
    ('preprocess', FunctionTransformer(preprocess_text_column)),
    ('tfidf', TfidfVectorizer(max_features=500))
])


In [12]:
# Combine all preprocessing steps
preprocessor = ColumnTransformer(transformers=[
    ('text', text_pipeline, text_feature),
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])

pipeline_best = imPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('undersample', RandomUnderSampler(random_state=42)),
    ('model', xgb_best)
])


In [18]:
import pickle

# Fit the pipeline on the training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline_best.fit(X_train, y_train)

# Save the fitted pipeline as a pickle file
with open('xgb_pipeline.pkl', 'wb') as f:
    pickle.dump(pipeline_best, f)

print("Fitted model and pipeline saved successfully.")


Fitted model and pipeline saved successfully.


In [19]:
# Load the pipeline and model from the pickle file
with open('xgb_pipeline.pkl', 'rb') as f:
    loaded_pipeline = pickle.load(f)

print("Fitted model and pipeline loaded successfully.")

Fitted model and pipeline loaded successfully.


In [22]:
# All columns used in training
required_columns = X_train.columns.tolist()
print(required_columns)

['AdmissionID', 'PatientGender', 'PrimaryDiagnosisDescription', 'CBC_Lymphocytes', 'CBC_Neutrophils', 'CBC_Basophils', 'CBC_Eosinophils', 'CBC_Hematocrit', 'CBC_Hemoglobin', 'CBC_MCH', 'CBC_MCHC', 'CBC_Monocytes', 'CBC_PLATELET_COUNT', 'CBC_RDW', 'CBC_RED_BLOOD_CELL_COUNT', 'CBC_WHITE_BLOOD_CELL_COUNT', 'METABOLIC_ALBUMIN', 'METABOLIC_ALT_SGPT', 'METABOLIC_BILI_TOTAL', 'METABOLIC_BUN', 'METABOLIC_CALCIUM', 'METABOLIC_CARBON_DIOXIDE', 'METABOLIC_CHLORIDE', 'METABOLIC_CREATININE', 'METABOLIC_GLUCOSE', 'METABOLIC_POTASSIUM', 'METABOLIC_SODIUM', 'METABOLIC_TOTAL_PROTEIN', 'URINALYSIS_PH', 'URINALYSIS_RED_BLOOD_CELLS', 'URINALYSIS_WHITE_BLOOD_CELLS', 'Smoke', 'BMI', 'Alcohol', 'Exercise', 'Age']


In [23]:
# Provided sample data (single row)
sample_data = {
    'AdmissionID': [5],
    'PatientGender': ['Female'],
    'PrimaryDiagnosisDescription': ['Type 2 diabetes mellitus with hypoglycemia'],
    'CBC_Lymphocytes': [2.7],
    'CBC_Neutrophils': [7.0],
    'CBC_Basophils': [0.2],
    'CBC_Eosinophils': [0.2],
    'CBC_Hematocrit': [53.0],
    'CBC_Hemoglobin': [10.1],
    'CBC_MCH': [31.7],
    'CBC_MCHC': [33.9],
    'CBC_Monocytes': [0.2],
    'CBC_PLATELET_COUNT': [167.0],
    'CBC_RDW': [13.9],
    'CBC_RED_BLOOD_CELL_COUNT': [6.3],
    'CBC_WHITE_BLOOD_CELL_COUNT': [7.8],
    'METABOLIC_ALBUMIN': [3.0],
    'METABOLIC_ALT_SGPT': [63.7],
    'METABOLIC_BILI_TOTAL': [0.8],
    'METABOLIC_BUN': [18.0],
    'METABOLIC_CALCIUM': [8.4],
    'METABOLIC_CARBON_DIOXIDE': [34.7],
    'METABOLIC_CHLORIDE': [109.7],
    'METABOLIC_CREATININE': [0.9],
    'METABOLIC_GLUCOSE': [84.6],
    'METABOLIC_POTASSIUM': [5.7],
    'METABOLIC_SODIUM': [134.5],
    'METABOLIC_TOTAL_PROTEIN': [8.3],
    'URINALYSIS_PH': [5.0],
    'URINALYSIS_RED_BLOOD_CELLS': [0.4],
    'URINALYSIS_WHITE_BLOOD_CELLS': [5.0],
    'Smoke': ['Yes'],
    'BMI': [21.1],
    'Alcohol': ['Yes'],
    'Exercise': ['Yes'],
    'Age': [48]
}

# Ensure all required columns are present in the input
required_columns = ['AdmissionID', 'PatientGender', 'PrimaryDiagnosisDescription', 'CBC_Lymphocytes', 'CBC_Neutrophils', 
                    'CBC_Basophils', 'CBC_Eosinophils', 'CBC_Hematocrit', 'CBC_Hemoglobin', 'CBC_MCH', 'CBC_MCHC', 
                    'CBC_Monocytes', 'CBC_PLATELET_COUNT', 'CBC_RDW', 'CBC_RED_BLOOD_CELL_COUNT', 
                    'CBC_WHITE_BLOOD_CELL_COUNT', 'METABOLIC_ALBUMIN', 'METABOLIC_ALT_SGPT', 'METABOLIC_BILI_TOTAL', 
                    'METABOLIC_BUN', 'METABOLIC_CALCIUM', 'METABOLIC_CARBON_DIOXIDE', 'METABOLIC_CHLORIDE', 
                    'METABOLIC_CREATININE', 'METABOLIC_GLUCOSE', 'METABOLIC_POTASSIUM', 'METABOLIC_SODIUM', 
                    'METABOLIC_TOTAL_PROTEIN', 'URINALYSIS_PH', 'URINALYSIS_RED_BLOOD_CELLS', 
                    'URINALYSIS_WHITE_BLOOD_CELLS', 'Smoke', 'BMI', 'Alcohol', 'Exercise', 'Age']

# Convert the sample data to a DataFrame
user_input_df = pd.DataFrame(sample_data)

# Add missing columns with NaN values
for col in required_columns:
    if col not in user_input_df.columns:
        user_input_df[col] = np.nan

# Ensure the columns are in the correct order
user_input_df = user_input_df[required_columns]

# Output the constructed user input DataFrame
print("User Input DataFrame:")
print(user_input_df)

User Input DataFrame:
   AdmissionID PatientGender                 PrimaryDiagnosisDescription  \
0            5        Female  Type 2 diabetes mellitus with hypoglycemia   

   CBC_Lymphocytes  CBC_Neutrophils  CBC_Basophils  CBC_Eosinophils  \
0              2.7              7.0            0.2              0.2   

   CBC_Hematocrit  CBC_Hemoglobin  CBC_MCH  ...  METABOLIC_SODIUM  \
0            53.0            10.1     31.7  ...             134.5   

   METABOLIC_TOTAL_PROTEIN  URINALYSIS_PH  URINALYSIS_RED_BLOOD_CELLS  \
0                      8.3            5.0                         0.4   

   URINALYSIS_WHITE_BLOOD_CELLS  Smoke   BMI  Alcohol  Exercise  Age  
0                           5.0    Yes  21.1      Yes       Yes   48  

[1 rows x 36 columns]


In [14]:
import os
os.getcwd()

'c:\\Users\\HP\\Downloads'

In [24]:
# Predict the risk score using the loaded and fitted pipeline
user_prediction = loaded_pipeline.predict(user_input_df)

# Output the prediction
print("\nPredicted Risk Score:")
print(int(user_prediction.round()))


Predicted Risk Score:
4


  print(int(user_prediction.round()))
