In [3]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [4]:
data = pd.read_csv("clean_ukr.csv")

if 'X' in data.columns:
    data = data.drop(columns=['X'])

# Create new variables based on conditions
data['hiv'] = data['takes_art'].notna().astype(int)
data['Cot'] = data['Cotrimoxazole.treatment'].notna().astype(int)
data['new_prev'] = data['new_prev'].fillna('New')
data['prev_treatment'] = (data['new_prev'] == 'Previously treated').astype(int)

# Convert columns to appropriate types
categorical_cols = [
    'Sex', 'Region', 'DST_R', 'Localization', 'hiv_def', 
    'hiv', 'Cot', 'Alcohol.abuse', 'Injecting.drug.user', 'Homeless', 
    'Unemployed', 'healthcare_worker', 'Prisoner', 'migrant_refugee', 
    'prev_treatment', 'Bactec', 'LJ', 'GeneXpert', 'DST_E', 'DST_Z', 'DST_S', 
    'DST_H', 'DST_Am', 'DST_Cm', 'DST_LFX', 'DST_MFX', 'DST_PAS', 'DST_Km', 
    'DST_Ofx', 'DST_Et', 'DST_Lzd', 'DST_Cs'
]
numerical_cols = ['imputed_weight', 'elapsed_time', 'Age']

# Convert categorical variables to category type
for col in categorical_cols:
    data[col] = data[col].astype('category')

# Convert numerical columns to numeric type
for col in numerical_cols:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Set the response variable
data['dropout'] = (data['final_outcome_group'] == 'Treatment discontinuation').astype(int)

# Check the transformation
print(data[categorical_cols].head())
print(data[numerical_cols].head())
print(data['dropout'].value_counts())


      Sex          Region DST_R Localization hiv_def hiv Cot Alcohol.abuse  \
0    Male          Odessa     1            1       0   0   0             0   
1    Male          Odessa     1            1       0   0   0             0   
2  Female         Poltava     2            1       0   0   0             0   
3  Female         Poltava     2            1       0   0   0             0   
4  Female  Dnipropetrovsk     1            1       0   0   0             0   

  Injecting.drug.user Homeless  ... DST_Am DST_Cm DST_LFX DST_MFX DST_PAS  \
0                   0        0  ...      0      2       0       0       2   
1                   0        0  ...      0      2       0       0       2   
2                   0        0  ...      0      0       0       0       0   
3                   0        0  ...      0      0       0       0       0   
4                   0        0  ...      0      0       0       0       2   

  DST_Km DST_Ofx DST_Et DST_Lzd DST_Cs  
0      2       2      0    

In [5]:
data = data[data['final_outcome_group'] != "Transfer"]

print(data['final_outcome_group'].unique())

['Cure or Treatment Completion' 'Treatment discontinuation'
 'Death or Palliative Care' 'Treatment in process' 'Failure']


## CART Analysis

In [22]:
# Define features and target variable
X = data[categorical_cols + numerical_cols]
y = data['dropout']

# Convert categorical variables to dummy/indicator variables
X = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Initialize and train the decision tree classifier
clf = DecisionTreeClassifier(random_state=123)
clf = clf.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = clf.predict(X_test)

# Evaluate the model
print(f"Accuracy: {metrics.accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix:\n {metrics.confusion_matrix(y_test, y_pred)}")
print(f"Classification Report:\n {metrics.classification_report(y_test, y_pred)}")


Accuracy: 0.9162820255535488
Confusion Matrix:
 [[27293   318]
 [ 2185   102]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.99      0.96     27611
           1       0.24      0.04      0.08      2287

    accuracy                           0.92     29898
   macro avg       0.58      0.52      0.52     29898
weighted avg       0.87      0.92      0.89     29898



### GridSearchCV Parameter Tuning for Decision Tree

In [23]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'max_depth': [3, 5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': [None, 'sqrt', 'log2']
}

# Initialize the Decision Tree model
clf = DecisionTreeClassifier(random_state=123)

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Best parameters and best score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

# Predict using the best model
best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)

# Evaluate the model
print(f"Accuracy: {metrics.accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix:\n {metrics.confusion_matrix(y_test, y_pred)}")
print(f"Classification Report:\n {metrics.classification_report(y_test, y_pred)}")


Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 2}
Best Score: 0.9241491763525378
Accuracy: 0.923506589069503
Confusion Matrix:
 [[27611     0]
 [ 2287     0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96     27611
           1       0.00      0.00      0.00      2287

    accuracy                           0.92     29898
   macro avg       0.46      0.50      0.48     29898
weighted avg       0.85      0.92      0.89     29898



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## HistGradientBoostingClassifier

In [24]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Define features and target variable
X = data[categorical_cols + numerical_cols]
y = data['dropout']

# Convert categorical variables to category codes, preserving the natural categorical nature
for col in categorical_cols:
    X[col] = X[col].cat.codes

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Initialize and train the HistGradientBoostingClassifier
hist_clf = HistGradientBoostingClassifier(random_state=123)

# Fit the model
hist_clf.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = hist_clf.predict(X_test)

# Evaluate the model
print(f"Accuracy: {metrics.accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix:\n {metrics.confusion_matrix(y_test, y_pred)}")
print(f"Classification Report:\n {metrics.classification_report(y_test, y_pred)}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].cat.codes


Accuracy: 0.9245100006689411
Confusion Matrix:
 [[27510   101]
 [ 2156   131]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      1.00      0.96     27611
           1       0.56      0.06      0.10      2287

    accuracy                           0.92     29898
   macro avg       0.75      0.53      0.53     29898
weighted avg       0.90      0.92      0.90     29898



### GridSearchCV Parameter Tuning for HistGradientBoostingClassifier

In [25]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_iter': [100, 200, 300],
    'max_leaf_nodes': [10, 20, 50], # From 30
    'min_samples_leaf': [1, 5, 30] # From 10
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=HistGradientBoostingClassifier(random_state=123),
                           param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Best parameters and best score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

# Predict using the best model
best_hist_clf = grid_search.best_estimator_
y_pred = best_hist_clf.predict(X_test)

# Evaluate the model
print(f"Accuracy: {metrics.accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix:\n {metrics.confusion_matrix(y_test, y_pred)}")
print(f"Classification Report:\n {metrics.classification_report(y_test, y_pred)}")


Best Parameters: {'learning_rate': 0.1, 'max_iter': 200, 'max_leaf_nodes': 20, 'min_samples_leaf': 30}
Best Score: 0.9251358809264989
Accuracy: 0.9251454946819185
Confusion Matrix:
 [[27544    67]
 [ 2171   116]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      1.00      0.96     27611
           1       0.63      0.05      0.09      2287

    accuracy                           0.93     29898
   macro avg       0.78      0.52      0.53     29898
weighted avg       0.90      0.93      0.89     29898



## Important Features from LASSO on Both Models

In [1]:
categorical_cols2 = [
    'Sex', 'Region', 'DST_R', 'Localization', 'hiv_def', 'hiv', 'Cot', 
    'Alcohol.abuse', 'Injecting.drug.user', 'Homeless', 'Unemployed', 
    'healthcare_worker', 'Prisoner', 'migrant_refugee', 'prev_treatment', 
    'Bactec', 'LJ', 'GeneXpert', 'DST_E', 'DST_Z', 'DST_S', 'DST_H', 
    'DST_Am', 'DST_Cm', 'DST_LFX', 'DST_MFX', 'DST_PAS', 'DST_Km', 
    'DST_Ofx', 'DST_Et', 'DST_Lzd', 'DST_Cs'
]

numerical_cols2 = ['imputed_weight', 'Age']


### CART LASSO Fit

In [27]:
# Define features and target variable
X = data[categorical_cols2 + numerical_cols]
y = data['dropout']

# Convert categorical variables to dummy/indicator variables
X = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Initialize the Decision Tree Classifier with the best parameters
clf = DecisionTreeClassifier(
    max_depth=None, 
    max_features='sqrt', 
    min_samples_leaf=10, 
    min_samples_split=2, 
    random_state=123
)

# Fit the model
clf.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

# Evaluate the model
print(f"Accuracy: {metrics.accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix:\n {metrics.confusion_matrix(y_test, y_pred)}")
print(f"Classification Report:\n {metrics.classification_report(y_test, y_pred)}")

Accuracy: 0.923506589069503
Confusion Matrix:
 [[27611     0]
 [ 2287     0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96     27611
           1       0.00      0.00      0.00      2287

    accuracy                           0.92     29898
   macro avg       0.46      0.50      0.48     29898
weighted avg       0.85      0.92      0.89     29898



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 

### HistGradientBoostingClassifier LASSO Fit

In [7]:
import numpy as np 
import pandas as pd 
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Define features and target variable
X = data[categorical_cols2 + numerical_cols2]
y = data['dropout']

# Convert categorical variables to dummy/indicator variables
X = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Initialize the HistGradientBoostingClassifier with the best parameters
clf = HistGradientBoostingClassifier(
    learning_rate=0.1, 
    max_iter=200, 
    max_leaf_nodes=20, 
    min_samples_leaf=30,
    random_state=123
)

# Fit the model
clf.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

# Evaluate the model
print(f"Accuracy: {metrics.accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix:\n {metrics.confusion_matrix(y_test, y_pred)}")
print(f"Classification Report:\n {metrics.classification_report(y_test, y_pred)}")

Accuracy: 0.9234731420161884
Confusion Matrix:
 [[27609     2]
 [ 2286     1]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96     27611
           1       0.33      0.00      0.00      2287

    accuracy                           0.92     29898
   macro avg       0.63      0.50      0.48     29898
weighted avg       0.88      0.92      0.89     29898

