In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.metrics import make_scorer, accuracy_score,roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostClassifier, Pool

In [None]:
import pandas as pd
# Load the workbook 'bmc_cases.xlsb' into a DataFrame using the pyxlsb engine.
df = pd.read_excel('bmc_cases.xlsb', engine='pyxlsb')

In [None]:
# Number of non-missing entries in each column of the raw frame.
df.count()

In [None]:
# Columns kept for modelling. `.copy()` makes `data` an independent frame, so the
# per-column clean-up further down writes to it directly instead of to a slice of
# `df` (which pandas reports as SettingWithCopyWarning).
selected_columns = [
    'Ward',
    'Date',
    'Age',
    'Gender',
    'Symptomatic?',
    'Traceable?',
    'Patient Location',
    'Patient Status (Central)',
    'Occupation',
    'Patient Type (Index / Contact)',
]
data = df[selected_columns].copy()

In [None]:
# Frequency of each distinct value in the 'Symptomatic?' column of `data`.
data['Symptomatic?'].value_counts()

In [None]:
# Per-column whitelist for the categorical features.
# Each entry maps a column name to (accepted values, replacement for anything else).
allowed_params = {
    'Symptomatic?': (['Yes', 'No'], 'Unknown'),
    'Traceable?': (['Yes', 'No'], 'Unknown'),
    'Patient Location': (['Building', 'Slum / Chawl'], 'Unknown'),
    'Occupation': (
        [
            'At-home worker',
            'Health worker',
            'Other essential services',
            'Municipal Worker',
            'Police',
            'RETIRED',
            'House Wife',
        ],
        'Others',
    ),
    'Patient Type (Index / Contact)': (['Index', 'Contact'], 'Unknown'),
    'Gender': (['M', 'F'], 'Others'),
}
                    

In [None]:
def get_val(column_val, allowed):
    """Return `column_val` if it is whitelisted, otherwise the fallback value.

    Parameters
    ----------
    column_val : object
        The raw cell value to check.
    allowed : sequence
        Two-part specification: ``allowed[0]`` is the collection of accepted
        values and ``allowed[1]`` is the value returned for anything else.

    Returns
    -------
    object
        `column_val` unchanged when it is found in ``allowed[0]``; ``allowed[1]``
        otherwise.
    """
    accepted = allowed[0]
    return column_val if column_val in accepted else allowed[1]

In [None]:
# Replace every value outside a column's whitelist with that column's fallback.
# Each column is mapped element-wise through get_val (instead of a row-wise
# DataFrame.apply, which builds a full row object for every record and feature),
# and `assign` returns a new frame, so nothing is written into a slice of `df`.
cleaned_columns = {
    feature: data[feature].map(lambda value, allowed=allowed: get_val(value, allowed))
    for feature, allowed in allowed_params.items()
}
data = data.assign(**cleaned_columns)

In [None]:
# Display `data` as it stands after the categorical columns were normalised.
data

In [None]:
# Keep only the rows whose central status is one of the two outcomes of interest.
final_outcomes = ['Recovered', 'Dead']
df = data[data['Patient Status (Central)'].isin(final_outcomes)]

In [None]:
# Separate the feature matrix from the outcome column.
X = df.drop(columns=['Patient Status (Central)'])
y = df['Patient Status (Central)']

In [None]:
# Frequency of each 'Symptomatic?' value among the rows kept in X.
print(X['Symptomatic?'].value_counts())

In [None]:
# Positional indices of the object-dtype columns of X; these are later passed
# to CatBoost as its categorical features.
print(X.dtypes)
is_object_column = (X.dtypes == object).to_numpy()
categorical_features_indices = np.flatnonzero(is_object_column)
categorical_features_indices

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the string outcome as integer class ids; later cells index the two
# classes as 0 and 1.
# NOTE(review): LabelEncoder sorts its classes, so presumably 'Dead' -> 0 and
# 'Recovered' -> 1 -- confirm with le.classes_.
le = LabelEncoder()
y = le.fit_transform(y)

In [None]:
from collections import Counter
# Per-class weight: n_samples / (2 * n_samples_in_class), for classes 0 and 1.
y_count = Counter(y)
print(y_count)
W = {label: len(y) / (2 * y_count[label]) for label in (0, 1)}
print(W)

In [None]:
from sklearn.model_selection import train_test_split
# 60 / 20 / 20 train / validation / test split.
# The splits are stratified on the outcome so that each part keeps the same
# class ratio as the full data set; the class weights computed for training
# assume that ratio, and an unstratified split of an imbalanced target can
# leave a part with very few minority-class rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
# 0.25 of the remaining 80% gives a validation set of 20% of the original data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=1, stratify=y_train
)

In [None]:
# Training data wrapped as a CatBoost Pool, with the categorical columns marked.
X_train_pool = Pool(X_train, label=y_train, cat_features=categorical_features_indices)

# Base estimator; every setting is supplied through the grid below.
clf = CatBoostClassifier()

# Hyper-parameter grid for GridSearchCV. Single-element lists are fixed settings;
# only `depth` and `l2_leaf_reg` are actually searched.
params = dict(
    iterations=[500],
    depth=[6, 8, 10],
    cat_features=[categorical_features_indices],
    loss_function=['Logloss'],
    l2_leaf_reg=np.linspace(1, 6, 4),
    leaf_estimation_iterations=[10],
    verbose=[500],
    class_weights=[W],
)


In [None]:
# Score candidates with scikit-learn's built-in 'roc_auc' scorer.
# make_scorer(roc_auc_score) with default arguments feeds the hard 0/1 output of
# predict() to roc_auc_score, which collapses the ROC curve to a single operating
# point; the named scorer uses the classifier's continuous scores
# (decision_function / predict_proba) instead.
scorer = 'roc_auc'
clf_grid = GridSearchCV(estimator=clf, param_grid=params, scoring=scorer, cv=5)

In [None]:
# Run the 5-fold grid search on the training split (trailing ';' hides the repr).
clf_grid.fit(X_train, y_train);

In [None]:
# Parameter combination that achieved the best cross-validated score.
best_param = clf_grid.best_params_
best_param

In [None]:
# Final model, built from the grid-search winner.
# class_weights is passed through as well: the search selected depth and
# l2_leaf_reg under the class-weighted loss, so training the final model without
# the weights would optimise a different objective from the one that was tuned.
# leaf_estimation_iterations is read from best_param (the grid fixes it at 10)
# so it cannot drift from the searched configuration.
model = CatBoostClassifier(
    iterations=1000,
    loss_function=best_param['loss_function'],
    depth=best_param['depth'],
    l2_leaf_reg=best_param['l2_leaf_reg'],
    leaf_estimation_iterations=best_param['leaf_estimation_iterations'],
    class_weights=best_param['class_weights'],
    eval_metric='Accuracy',
    use_best_model=True,
    logging_level='Silent',
)

In [None]:
# Fit on the training Pool; the validation split is supplied as the eval set
# (the model above is configured with use_best_model=True).
model.fit(X_train_pool, eval_set=(X_val,y_val))

In [None]:
# Class predictions and class probabilities for the test split.
predictions = model.predict(X_test)
predictions_probs = model.predict_proba(X_test)

In [None]:
# Show the per-class weights computed earlier.
W

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,plot_roc_curve
# Text summary of the per-class metrics on the test split.
print(classification_report(y_test,predictions))

In [None]:
# Confusion matrix of true vs. predicted classes on the test split.
confusion_matrix(y_test,predictions)

In [None]:
# Feature importances of the fitted model, shown from most to least important.
importances = model.get_feature_importance(verbose=True)
df = pd.DataFrame({"Features": X.columns, "Importance": importances})
df.sort_values(by='Importance', ascending=False)

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
import matplotlib.pyplot as plt

In [None]:
import matplotlib as mpl
# Larger default font for all subsequent matplotlib figures.
mpl.rcParams['font.size'] = 22

In [None]:
import matplotlib.pyplot as plt
# ROC curve (top) and precision-recall curve (bottom) on the test split.
# NOTE(review): plot_roc_curve / plot_precision_recall_curve are not available in
# recent scikit-learn releases (replaced by RocCurveDisplay.from_estimator and
# PrecisionRecallDisplay.from_estimator) -- confirm the pinned version.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=[10, 20])
roc_ax, pr_ax = ax.flat
plot_roc_curve(model, X_test, y_test, ax=roc_ax)
plot_precision_recall_curve(model, X_test, y_test, ax=pr_ax)

In [None]:
# Render the tree at index 0 of the fitted model.
model.plot_tree(0)

In [None]:
# Break the test-set outcomes down by age band and symptom status.
# tree_dict maps (age_low, age_high) -> {symptom value -> counts of each label}.
tree_dict = {}

# X_test contains features only (the outcome column was dropped when X was
# built), so selecting a 'label' column from it raises KeyError. Attach the
# encoded test outcome explicitly instead.
tree_data = X_test[['Age', 'Symptomatic?']].assign(label=y_test)

# Hard-coded age cut points.
# NOTE(review): presumably taken from the splits of the plotted tree -- confirm.
age_bins = [0, 29.5, 46.5, 62.5, 150]
bins = list(zip(age_bins, age_bins[1:]))
sym_classes = list(tree_data['Symptomatic?'].value_counts().keys())

# observed=False keeps the grouping behaviour explicit for the categorical
# produced by pd.cut.
test_g = tree_data.groupby(pd.cut(tree_data['Age'], bins=age_bins), observed=False)
print(bins)
print(sym_classes)

for age_bin, df_group in test_g:
    by_symptom = tree_dict.setdefault((age_bin.left, age_bin.right), {})
    for sym_bin, df_group2 in df_group.groupby("Symptomatic?"):
        by_symptom[sym_bin] = df_group2['label'].value_counts()

# An age band may have no rows for some symptom value (or no rows at all);
# print an empty count for those instead of failing with KeyError.
no_rows = pd.Series(dtype='int64', name='label')
for b in bins:
    for s in sym_classes:
        print(b, s)
        print(tree_dict.get(b, {}).get(s, no_rows))
        print('-' * 10)