In [None]:
def get_token(account_name, account_password, secret_key, timeout=30):
    """Sign in to the CareSmartz360 API and return the bearer token.

    Args:
        account_name: Value sent as ``accountName`` in the sign-in payload.
        account_password: Value sent as ``accountPassword``.
        secret_key: Value sent as ``secretKey``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive host.

    Returns:
        The token found at ``content.token`` in the JSON response.

    Raises:
        RuntimeError: On an HTTP error status (with a dedicated message for
            401), on any other request failure, or when the response body is
            not valid JSON or carries no token.
    """
    url = "https://api.caresmartz360.com/authorization/signin"
    payload = {
        "accountName": account_name,
        "accountPassword": account_password,
        "secretKey": secret_key
    }

    try:
        response = requests.post(url, json=payload, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.HTTPError as http_err:
        # Read the status from the exception itself so this handler never
        # depends on `response` having been bound.
        status_code = getattr(getattr(http_err, "response", None), "status_code", None)
        if status_code == 401:
            raise RuntimeError("Unauthorized: Invalid account credentials. Please verify your account details.") from http_err
        raise RuntimeError(f"HTTP error occurred while obtaining token: {http_err}") from http_err
    except requests.exceptions.RequestException as req_err:
        raise RuntimeError(f"Request error occurred while obtaining token: {req_err}") from req_err

    # Decoding is handled separately from the transport errors above: recent
    # requests versions raise a JSON error that is also a RequestException, so a
    # shared try block would report it as a generic request error.
    try:
        body = response.json()
        # "content" may be missing, null, or not an object; treat all of those
        # as "no token" instead of failing with an AttributeError.
        content = body.get("content") if isinstance(body, dict) else None
        token = content.get("token") if isinstance(content, dict) else None
        if not token:
            raise ValueError("Token not found in response")
    except ValueError as json_err:
        raise RuntimeError(f"JSON decode error or missing token: {json_err}") from json_err
    return token

In [None]:
def _post_last_modified_query(url, token, agency_id, timeout):
    """POST the ``lastModifiedDate`` query to ``url`` and return the decoded JSON body.

    Shared implementation behind ``fetch_churn_data`` and
    ``fetch_caregiver_data``, which differ only in the endpoint URL.

    Raises:
        RuntimeError: On an HTTP error status (dedicated messages for 401 and
            504), on any other request failure, or when the body is not JSON.
    """
    headers = {
        "agencyId": f"{agency_id}",
        "Authorization": f"Bearer {token}"
    }
    body = [
        {
            "Key": "lastModifiedDate",
            "Value": None
        }
    ]

    try:
        response = requests.post(url, headers=headers, json=body, timeout=timeout)
        response.raise_for_status()  # Raises HTTPError for bad responses (4xx and 5xx)
    except requests.exceptions.HTTPError as http_err:
        status_code = getattr(getattr(http_err, "response", None), "status_code", None)
        if status_code == 401:
            raise RuntimeError("Unauthorized: Invalid token or token expired. Please check your credentials.") from http_err
        if status_code == 504:
            raise RuntimeError("Gateway Timeout: The server did not respond in time. Please try again later.") from http_err
        raise RuntimeError(f"HTTP error occurred: {http_err}") from http_err
    except requests.exceptions.RequestException as req_err:
        raise RuntimeError(f"Request error occurred: {req_err}") from req_err

    # Decode outside the transport try-block: recent requests versions raise a
    # JSON error that is also a RequestException, which would otherwise be
    # reported as a generic request error.
    try:
        return response.json()
    except ValueError as json_err:
        raise RuntimeError(f"JSON decode error occurred: {json_err}") from json_err


def fetch_churn_data(token, agency_id, timeout=300):
    """Fetch caregiver churn records for one agency.

    Args:
        token: Bearer token sent in the ``Authorization`` header.
        agency_id: Agency identifier, sent as a string in the ``agencyId`` header.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON response from the ``caregiverchurn_v1`` endpoint.

    Raises:
        RuntimeError: On HTTP, network, or JSON-decoding failures.
    """
    return _post_last_modified_query(
        "https://api.caresmartz360.com/cs360/caregiverchurn_v1", token, agency_id, timeout
    )


def fetch_caregiver_data(token, agency_id, timeout=300):
    """Fetch caregiver records for one agency.

    Args:
        token: Bearer token sent in the ``Authorization`` header.
        agency_id: Agency identifier, sent as a string in the ``agencyId`` header.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON response from the ``caregivers_v1`` endpoint.

    Raises:
        RuntimeError: On HTTP, network, or JSON-decoding failures.
    """
    return _post_last_modified_query(
        "https://api.caresmartz360.com/cs360/caregivers_v1", token, agency_id, timeout
    )


In [None]:
def get_data(agency_id, account_name, account_password, secret_key):
    """Authenticate, then download the churn and caregiver payloads for one agency.

    Returns:
        A ``(churn_data, caregiver_data)`` tuple as returned by the two fetch helpers.

    Raises:
        ValueError: If the token obtained from ``get_token`` is empty.
        RuntimeError: If either payload is ``None`` or any exception is raised
            while fetching; the original exception is chained as the cause.
    """
    token = get_token(account_name, account_password, secret_key)
    if not token:
        raise ValueError("Failed to obtain token, Authorization Failed.")

    try:
        churn_payload = fetch_churn_data(token, agency_id)
        caregiver_payload = fetch_caregiver_data(token, agency_id)
        payloads = (churn_payload, caregiver_payload)
        if any(payload is None for payload in payloads):
            raise ValueError("Failed to fetch one or more data sets.")
        return payloads
    except ValueError as missing_payload:
        raise RuntimeError(f"Value error: {missing_payload}") from missing_payload
    except Exception as failure:
        raise RuntimeError(f"An unexpected error occurred: {failure}") from failure



In [None]:
import requests
def convert_to_df(data):
    """
    Converts the API response data into a Pandas DataFrame.

    The expected shape is a non-empty list whose first element is a dict with a
    'content' entry. 'content' is normally a JSON-encoded string; an
    already-decoded value is passed to ``pd.DataFrame`` as is.

    Returns an empty DataFrame (after printing a notice) when the response does
    not have the expected structure, when 'content' is None, or when 'content'
    is a string that is not valid JSON.
    """
    def _empty():
        print("Data does not have expected structure. Returning empty DataFrame.")
        return pd.DataFrame()

    has_content = (
        isinstance(data, list)
        and len(data) > 0
        and isinstance(data[0], dict)
        and 'content' in data[0]
    )
    if not has_content:
        return _empty()

    content = data[0]['content']
    if content is None:
        return _empty()

    if isinstance(content, (str, bytes, bytearray)):
        try:
            content = json.loads(content)
        except ValueError:
            # json.JSONDecodeError is a ValueError: malformed payloads fall
            # back to the documented empty-frame result instead of crashing.
            return _empty()

    return pd.DataFrame(content)


In [None]:
import json
import os

import numpy as np
import pandas as pd

# Credentials are read from the environment so they are never stored in the
# notebook. Set these three variables before running this cell.
CS360_ENV_VARS = ("CS360_ACCOUNT_NAME", "CS360_ACCOUNT_PASSWORD", "CS360_SECRET_KEY")
missing_env_vars = [name for name in CS360_ENV_VARS if not os.environ.get(name)]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_env_vars)
    )
account_name, account_password, secret_key = (os.environ[name] for name in CS360_ENV_VARS)

agency_ids = [141,188,110,126,148,203,130,230,297]
all_merged_dfs = []
for agency_id in agency_ids:
    churn_data, caregiver_data = get_data(agency_id, account_name, account_password, secret_key)

    churn_df_new = convert_to_df(churn_data)
    caregiver_df_new = convert_to_df(caregiver_data)

    # An agency with either payload empty contributes nothing to the merge.
    if churn_df_new.empty or caregiver_df_new.empty:
        print(f"Skipping merge for agency_id {agency_id} due to empty DataFrame(s).")
        continue

    # Inner join: keep only caregivers present in both payloads.
    merged_df = pd.merge(churn_df_new, caregiver_df_new, on='CaregiverId', how='inner')

    all_merged_dfs.append(merged_df)

Data does not have expected structure. Returning empty DataFrame.
Skipping merge for agency_id 110 due to empty DataFrame(s).
Data does not have expected structure. Returning empty DataFrame.
Skipping merge for agency_id 148 due to empty DataFrame(s).
Data does not have expected structure. Returning empty DataFrame.
Skipping merge for agency_id 203 due to empty DataFrame(s).


In [None]:
# Stack the per-agency merged frames into one DataFrame with a fresh index.
final_merged_df = pd.concat(all_merged_dfs, ignore_index=True)

In [None]:
# Save the combined dataset as final_df.csv (path is relative to the working directory).
final_merged_df.to_csv("final_df.csv")

In [None]:
# (rows, columns) of the combined dataset.
final_merged_df.shape

(299655, 58)

In [None]:
import sys
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline


class CustomTransformer(BaseEstimator, TransformerMixin):
    """Drop unused columns and fill missing ``Race`` values with the fitted mode.

    ``fit`` learns the most frequent ``Race`` value only when the column exists
    and fewer than 40% of its values are missing; otherwise no mode is learned
    and ``transform`` leaves ``Race`` untouched.
    """

    def __init__(self):
        # Columns removed by transform(); names absent from the input are ignored.
        self.unwanted_features = [
            'Termination Date', 'ScheduleSlotId', 'Address2', 'Caregiver Restriction',
            'Caregiver Attributes', 'Skille Type', 'Marital Status',
            'EducationalBackGround', 'Date Of Separation', 'SeparationReason',
            'SeparationNotes'
        ]
        self.race_mode = None

    def fit(self, X, y=None):
        """Learn the mode of ``Race`` from ``X`` (see class docstring for the conditions)."""
        # Reset first so a refit never reuses a mode learned from earlier data.
        self.race_mode = None
        if 'Race' in X.columns and X['Race'].isnull().mean() * 100 < 40:
            modes = X['Race'].mode()
            if not modes.empty:
                self.race_mode = modes.iloc[0]
        return self

    def transform(self, X, y=None):
        """Return a new frame without the unwanted columns and with ``Race`` imputed."""
        X = X.drop(self.unwanted_features, axis=1, errors='ignore')
        # fillna(None) raises in pandas, so only impute when a mode was learned.
        if 'Race' in X.columns and self.race_mode is not None:
            X['Race'] = X['Race'].fillna(self.race_mode)
        return X


class FillNaWithModeTransformer(BaseEstimator, TransformerMixin):
    """Fill missing values in object/category columns with the column mode.

    Only columns with at most 40% missing values are imputed. The modes are
    learned in ``fit`` and reused by ``transform``, so data transformed later is
    filled with the values learned from the fitting data. ``transform`` works on
    a copy and never modifies the frame it is given.
    """

    @staticmethod
    def _learn_modes(X):
        """Return ``{column: mode}`` for every eligible categorical column of ``X``."""
        modes = {}
        for column in X.select_dtypes(include=['object', 'category']).columns:
            if X[column].isnull().mean() * 100 <= 40:
                candidates = X[column].mode()
                if not candidates.empty:
                    modes[column] = candidates.iloc[0]
        return modes

    def fit(self, X, y=None):
        """Learn the fill value for each eligible categorical column."""
        self.fill_values_ = self._learn_modes(X)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the learned modes filled in."""
        fill_values = getattr(self, 'fill_values_', None)
        if fill_values is None:
            # fit() used to be a no-op, so transform() could be called on an
            # unfitted instance; keep that working by learning from X itself.
            fill_values = self._learn_modes(X)

        X = X.copy()
        for column, mode_value in fill_values.items():
            if column in X.columns:
                X[column] = X[column].fillna(mode_value)
        return X


class FillNumericalBasedOnSkew(BaseEstimator, TransformerMixin):
    """Impute missing numeric values with the median or mean, chosen by skewness.

    For each ``int64``/``float64`` column that has missing values but no more
    than 40% of them, ``fit`` stores the median when the absolute skewness
    exceeds ``skew_threshold`` and the mean otherwise.

    Args:
        skew_threshold: Absolute skewness above which the median is used.
    """

    def __init__(self, skew_threshold=0.5):
        self.skew_threshold = skew_threshold
        # Kept here so transform() on an unfitted instance is a no-op.
        self.fill_values_ = {}

    def fit(self, X, y=None):
        """Learn one fill value per eligible numeric column of ``X``."""
        # Build a fresh mapping so a refit drops columns learned from earlier data.
        fill_values = {}
        numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
        for col in numeric_cols:
            if X[col].isnull().mean() * 100 <= 40 and X[col].isnull().any():
                skewness = X[col].skew()
                fill_values[col] = X[col].median() if abs(skewness) > self.skew_threshold else X[col].mean()
        self.fill_values_ = fill_values
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the learned fill values applied."""
        X = X.copy()  # never modify the caller's frame
        for col, fill_value in self.fill_values_.items():
            # A column seen during fit may be absent at transform time.
            if col in X.columns:
                X[col] = X[col].fillna(fill_value)
        return X


class FeatureCalculator(BaseEstimator, TransformerMixin):
    """Derive model features and label-encode ``Gender`` and ``Race``.

    ``transform`` adds ``Age`` and ``Tenure_days``, converts
    ``IsCaregiverTerminated`` to 0/1, encodes ``Gender`` and ``Race`` with the
    fitted label encoders, and returns only the columns used for modelling.
    """

    def __init__(self):
        self.label_encoders = {
            'Gender': LabelEncoder(),
            'Race': LabelEncoder()
        }

    def fit(self, X, y=None):
        """Fit the label encoders on the ``Gender`` / ``Race`` columns present in ``X``."""
        if 'Gender' in X.columns:
            self.label_encoders['Gender'].fit(X['Gender'].dropna())
        if 'Race' in X.columns:
            # transform() replaces missing Race with 'Unknown' before encoding,
            # so the encoder must learn that label here as well; otherwise any
            # remaining missing value fails with an unseen-label error.
            self.label_encoders['Race'].fit(X['Race'].fillna('Unknown'))
        return self

    def transform(self, X, y=None):
        """Return a new frame holding the derived and encoded modelling columns.

        Raises:
            KeyError: If ``Date Of Birth``, ``Date Of Joining`` or any of the
                required output columns is missing.
        """
        df = X.copy()
        # Convert to datetime; unparseable values become NaT
        for col in ['Date Of Birth', 'Date Of Joining']:
            if col in df.columns:
                df[col] = pd.to_datetime(df[col], errors='coerce')

        # Evaluated on every call, so Age and Tenure_days depend on the run date.
        current_date = pd.to_datetime('today')
        # Age is the difference in calendar years only (month/day are ignored).
        df['Age'] = current_date.year - df['Date Of Birth'].dt.year
        df['Tenure_days'] = (current_date - df['Date Of Joining']).dt.days

        if 'IsCaregiverTerminated' in df.columns:
            # 'Yes' -> 1, anything else (including missing) -> 0
            df['IsCaregiverTerminated'] = np.where(df['IsCaregiverTerminated'] == 'Yes', 1, 0)

        if 'Gender' in df.columns:
            df['Gender'] = self.label_encoders['Gender'].transform(df['Gender'])

        if 'Race' in df.columns:
            df['Race'] = df['Race'].fillna('Unknown')
            df['Race'] = self.label_encoders['Race'].transform(df['Race'])

        # Final selected columns
        required_columns = [
            'CaregiverId', 'Age', 'Gender', 'Race',
            'Tenure_days', 'Pay Rate','Pay Unit','Payroll UnitsWithoutOT','Payroll OTUnits','Total Payroll Amount','IsCaregiverTerminated'
        ]

        return df[required_columns]


class RemoveOutliersTransformer(BaseEstimator, TransformerMixin):
    """Keep only rows whose ``column`` value lies within the fitted IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
    computed from the data passed to ``fit``.
    """

    def __init__(self, column='Tenure_days'):
        self.column = column
        self.lower_bound = None
        self.upper_bound = None

    def fit(self, X, y=None):
        """Compute the lower and upper fences from ``X[self.column]``."""
        values = X[self.column]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        whisker = 1.5 * (third_quartile - first_quartile)
        self.lower_bound = first_quartile - whisker
        self.upper_bound = third_quartile + whisker
        return self

    def transform(self, X, y=None):
        """Return the rows of ``X`` whose value falls inside the fitted fences."""
        values = X[self.column]
        not_too_low = values >= self.lower_bound
        not_too_high = values <= self.upper_bound
        return X[not_too_low & not_too_high]



class CaregiverIdEncoder(BaseEstimator, TransformerMixin):
    """Replace ``CaregiverId`` values with integer codes learned during ``fit``.

    Codes follow the order in which each distinct id first appears in the
    fitting data. Ids that were not seen during ``fit`` have no entry in the
    mapping.
    """

    def __init__(self):
        self.id_map = {}

    def fit(self, X, y=None):
        """Build the id -> code mapping when ``X`` has a ``CaregiverId`` column."""
        if 'CaregiverId' not in X.columns:
            return self
        distinct_ids = X['CaregiverId'].unique()
        self.id_map = dict(zip(distinct_ids, range(len(distinct_ids))))
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``CaregiverId`` mapped through ``id_map``."""
        encoded = X.copy()
        can_encode = bool(self.id_map) and 'CaregiverId' in encoded.columns
        if can_encode:
            encoded['CaregiverId'] = encoded['CaregiverId'].map(self.id_map)
        return encoded



In [None]:
from sklearn.pipeline import Pipeline

# Preprocessing stages, applied in the order listed: drop unused columns and
# impute Race, fill categorical then numeric gaps, derive features, remove
# Tenure_days outliers, and finally encode CaregiverId.
full_preprocessing_pipeline = Pipeline(
    steps=[
        ('custom_transformer', CustomTransformer()),
        ('fill_na_mode', FillNaWithModeTransformer()),
        ('fill_numerical', FillNumericalBasedOnSkew()),
        ('feature_calculator', FeatureCalculator()),
        ('remove_outliers', RemoveOutliersTransformer(column='Tenure_days')),
        ('caregiver_id_encoder', CaregiverIdEncoder()),
    ]
)




In [None]:
import traceback

def preprocess_dataframe(df):
    """Fit the module-level preprocessing pipeline on ``df`` and return its output.

    If the pipeline raises, a notice and the traceback are printed and the same
    ``df`` object that was passed in is returned instead.
    """
    try:
        return full_preprocessing_pipeline.fit_transform(df)
    except Exception:
        print("Pipeline execution failed.")
        traceback.print_exc()
    return df


In [None]:
# Run the full preprocessing pipeline on the combined dataset.
# NOTE: preprocess_dataframe returns its input unchanged when the pipeline
# raises, so check for a printed traceback before relying on processed_df.
processed_df = preprocess_dataframe(final_merged_df)

In [None]:
# (rows, columns) after preprocessing.
processed_df.shape

(285120, 11)

In [None]:
# Remove fully duplicated rows, modifying processed_df in place.
processed_df.drop_duplicates(inplace=True)

In [None]:
# (rows, columns) after de-duplication.
processed_df.shape

(65977, 11)

In [None]:
# Distribution of the pay-unit labels before they are encoded.
processed_df['Pay Unit'].value_counts()

Unnamed: 0_level_0,count
Pay Unit,Unnamed: 1_level_1
Hourly,65542
Visit,435


In [None]:
# Encode pay unit as 0 (Hourly) / 1 (Visit).
# NOTE: labels missing from the mapping become NaN, so re-running this cell on
# the already-encoded column turns every value into NaN.
processed_df['Pay Unit'] = processed_df['Pay Unit'].map({'Hourly': 0, 'Visit': 1})

In [None]:
# Cast Pay Rate to float, then to int.
# NOTE(review): the int cast discards any fractional part of the rate — confirm that is intended.
processed_df['Pay Rate'] = processed_df['Pay Rate'].astype(float).astype(int)

In [None]:
# Cast the pay-unit codes to int (via float).
processed_df['Pay Unit'] = processed_df['Pay Unit'].astype(float).astype(int)

In [None]:
# Cast overtime payroll units to float, then to int.
# NOTE(review): the int cast discards any fractional units — confirm that is intended.
processed_df['Payroll OTUnits'] = processed_df['Payroll OTUnits'].astype(float).astype(int)

In [None]:
# Cast non-overtime payroll units to float, then to int.
# NOTE(review): the int cast discards any fractional units — confirm that is intended.
processed_df['Payroll UnitsWithoutOT'] = processed_df['Payroll UnitsWithoutOT'].astype(float).astype(int)

In [None]:
# Cast the total payroll amount to float, then to int.
# NOTE(review): the int cast discards any fractional amount — confirm that is intended.
processed_df['Total Payroll Amount'] = processed_df['Total Payroll Amount'].astype(float).astype(int)

In [None]:
# Remove the CaregiverId column (in place) so it is not used as a model feature.
processed_df.drop(columns=['CaregiverId'], inplace=True)

In [None]:
# Check the column dtypes after the casts.
processed_df.dtypes

Unnamed: 0,0
Age,int32
Gender,int64
Race,int64
Tenure_days,int64
Pay Rate,int64
Pay Unit,int64
Payroll UnitsWithoutOT,int64
Payroll OTUnits,int64
Total Payroll Amount,int64
IsCaregiverTerminated,int64


In [None]:
# (rows, columns) of the table used for modelling.
processed_df.shape

(65977, 10)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import uniform, randint


# Feature and target split
# NOTE(review): CaregiverId was dropped before this point; if one caregiver
# contributes several rows, a random row-level split can place the same
# caregiver in both train and test — confirm the reported accuracy is not inflated.
X = processed_df.drop(columns=['IsCaregiverTerminated'])
y = processed_df['IsCaregiverTerminated']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define hyperparameter space
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 15),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': uniform(0, 5),
    'reg_alpha': uniform(0, 5),
    'reg_lambda': uniform(0, 5)
}

# XGBoost model
# `use_label_encoder` is no longer passed: the installed XGBoost reports it as
# an unused parameter on every fit.
xgb = XGBClassifier(random_state=42, eval_metric='logloss')

# Randomized search: 50 sampled configurations, 5-fold CV, scored by accuracy
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Train
random_search.fit(X_train, y_train)

# Best estimator
best_model = random_search.best_estimator_
print("Best Parameters:\n", random_search.best_params_)

# Evaluate on the held-out 20%
y_pred = best_model.predict(X_test)
print("Accuracy on test set:", accuracy_score(y_test, y_pred))

# Feature importance, most important first
feature_imp_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': best_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(feature_imp_df)


Fitting 5 folds for each of 50 candidates, totalling 250 fits


Parameters: { "use_label_encoder" } are not used.



Best Parameters:
 {'colsample_bytree': np.float64(0.9942601816442402), 'gamma': np.float64(1.2102763575575022), 'learning_rate': np.float64(0.21164066422176356), 'max_depth': 13, 'n_estimators': 172, 'reg_alpha': np.float64(1.2107996913871295), 'reg_lambda': np.float64(4.015698781899479), 'subsample': np.float64(0.7881202537784153)}
Accuracy on test set: 0.9781752046074568
                  Feature  Importance
4                Pay Rate    0.186403
3             Tenure_days    0.175462
2                    Race    0.168933
0                     Age    0.162263
1                  Gender    0.107792
8    Total Payroll Amount    0.060889
7         Payroll OTUnits    0.050717
6  Payroll UnitsWithoutOT    0.047338
5                Pay Unit    0.040204


In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from scipy.stats import randint, uniform


# Same feature/target split as the other model cells (same seed, same test size).
X = processed_df.drop(columns=['IsCaregiverTerminated'])
y = processed_df['IsCaregiverTerminated']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 15),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'num_leaves': randint(20, 150)
}

# subsample_freq=1 enables row bagging on every iteration. With LightGBM's
# default of 0, bagging is off and the sampled `subsample` values have no
# effect on the fitted models.
lgbm = LGBMClassifier(random_state=42, class_weight='balanced', subsample_freq=1)

# Randomized search: 50 sampled configurations, 5-fold CV, scored by accuracy
lgbm_search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

lgbm_search.fit(X_train, y_train)
best_lgbm = lgbm_search.best_estimator_
print("LightGBM Accuracy:", accuracy_score(y_test, best_lgbm.predict(X_test)))

# Feature importance, most important first
lgbm_feat = pd.DataFrame({'Feature': X.columns, 'Importance': best_lgbm.feature_importances_}).sort_values(by='Importance', ascending=False)
print(lgbm_feat)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[LightGBM] [Info] Number of positive: 30887, number of negative: 21894
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010300 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 767
[LightGBM] [Info] Number of data points in the train set: 52781, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
LightGBM Accuracy: 0.9912094574113368
                  Feature  Importance
3             Tenure_days        8432
0                     Age        6340
8    Total Payroll Amount        3243
4                Pay Rate        3127
2                    Race        1262
6  Payroll UnitsWithoutOT        1101
1                  Gender         938
7         Payroll OTUnits         276
5                Pay Unit          23


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV

# Same feature/target split as the other model cells (same seed, same test size).
X = processed_df.drop(columns=['IsCaregiverTerminated'])
y = processed_df['IsCaregiverTerminated']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Search space sampled by RandomizedSearchCV.
# NOTE(review): scipy's uniform takes (loc, scale), so uniform(1, 10) would span
# 1 to 11 rather than 1 to 10 — confirm the intended l2_leaf_reg range.
param_dist = {
    'iterations': randint(50, 200),
    'depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),
    'l2_leaf_reg': uniform(1, 10)
}

# verbose=0 turns off CatBoost's own training log.
cat_model = CatBoostClassifier(verbose=0, random_state=42, auto_class_weights='Balanced')

# Randomized search: 50 sampled configurations, 5-fold CV, scored by accuracy.
cat_search = RandomizedSearchCV(
    estimator=cat_model,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

cat_search.fit(X_train, y_train)
best_cat = cat_search.best_estimator_
# Accuracy of the best configuration on the held-out 20%.
print("CatBoost Accuracy:", accuracy_score(y_test, best_cat.predict(X_test)))

# Feature importance, most important first.
cat_feat = pd.DataFrame({'Feature': X.columns, 'Importance': best_cat.get_feature_importance()}).sort_values(by='Importance', ascending=False)
print(cat_feat)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
CatBoost Accuracy: 0.9529402849348287
                  Feature  Importance
3             Tenure_days   37.961371
0                     Age   30.383730
2                    Race   14.957932
4                Pay Rate   10.900931
1                  Gender    4.043811
8    Total Payroll Amount    0.870562
6  Payroll UnitsWithoutOT    0.809901
7         Payroll OTUnits    0.047650
5                Pay Unit    0.024113
