In [None]:
import numpy as np
import pandas as pd

In [None]:
import pandas as pd
# Load the raw case records from the binary Excel workbook (.xlsb),
# which pandas reads through the optional `pyxlsb` engine.
df = pd.read_excel('bmc_cases.xlsb', engine='pyxlsb')

In [None]:
# Preview the 'Date' column of the raw frame.
df['Date']

In [None]:
# Number of non-missing entries in every column of the raw frame.
df.notna().sum()

In [None]:
# Keep only the columns used for modelling. The explicit .copy() makes `data`
# an independent frame, so the per-column clean-up further down writes into it
# directly instead of into a view of `df` (avoids SettingWithCopyWarning).
data = df[[
    'Ward',
    'Date',
    'Age',
    'Gender',
    'Symptomatic?',
    'Traceable?',
    'Patient Location',
    'Patient Status (Central)',
    'Occupation',
    'Patient Type (Index / Contact)',
]].copy()

In [None]:
# Confirm which columns were kept in the modelling frame.
data.columns

In [None]:
# Accepted values per categorical column.
# Each entry maps a column name to a pair:
#   (list of values that are kept verbatim, label substituted for any other value)
allowed_params = {
    'Symptomatic?': (
        ['Yes', 'No'],
        'Unknown',
    ),
    'Traceable?': (
        ['Yes', 'No'],
        'Unknown',
    ),
    'Patient Location': (
        ['Building', 'Slum / Chawl'],
        'Unknown',
    ),
    'Occupation': (
        [
            'At-home worker',
            'Health worker',
            'Other essential services',
            'Municipal Worker',
            'Police',
            'RETIRED',
            'House Wife',
        ],
        'Others',
    ),
    'Patient Type (Index / Contact)': (
        ['Index', 'Contact'],
        'Unknown',
    ),
}
                    

In [None]:
def get_val(column_val, allowed):
    """Return `column_val` if it is whitelisted, otherwise the fallback label.

    Parameters
    ----------
    column_val
        The raw cell value to check.
    allowed
        A pair ``(accepted_values, fallback)``: ``accepted_values`` is the
        collection of values kept unchanged and ``fallback`` is returned for
        anything not contained in it.
    """
    accepted_values, fallback = allowed[0], allowed[1]
    return column_val if column_val in accepted_values else fallback

In [None]:
# Collapse every value outside a column's whitelist to that column's fallback label.
# Each column is mapped on its own (Series.map) rather than with a row-wise
# DataFrame.apply(axis=1), which materialises a full row object just to read
# one cell. DataFrame.assign returns a new frame, so nothing is written into a
# slice of the raw data and the cell is safe to re-run.
data = data.assign(**{
    feature: data[feature].map(lambda value: get_val(value, allowed_params[feature]))
    for feature in allowed_params
})

In [None]:
# Inspect the frame after the categorical clean-up.
data

In [None]:
# Keep only cases whose outcome is already known (recovered or dead);
# every other status is dropped from the modelling set.
has_final_outcome = data['Patient Status (Central)'].isin(['Recovered', 'Dead'])
df = data[has_final_outcome]

In [None]:
# Split the frame into predictors (X) and the outcome label (y).
X = df.drop(columns=['Patient Status (Central)'])
y = df['Patient Status (Central)']

In [None]:
# Show the dtype pandas inferred for every predictor column.
print(X.dtypes)

# Positional indices of every column whose dtype is not float; these are the
# columns handed to CatBoost as categorical features.
# `np.float` was only an alias of the builtin `float` and was removed in
# NumPy 1.24, so the builtin is used directly; the comparison is unchanged.
# NOTE(review): any integer-typed column would also be picked up as
# categorical by this test; confirm that is intended for the numeric inputs.
categorical_features_indices = np.where(X.dtypes != float)[0]

In [None]:
# Balanced class weights: weight = n_samples / (n_classes * class_count),
# with n_classes fixed at 2 ('Recovered' and 'Dead').
y_count = y.value_counts()
W = {
    label: len(y) / (2 * y_count[label])
    for label in ('Recovered', 'Dead')
}
print(W)

In [None]:
from sklearn.model_selection import train_test_split

# 60 / 20 / 20 train / validation / test split:
#   1) hold out 20% of all rows as the test set,
#   2) take 25% of the remaining 80% (= 20% overall) as the validation set.
# The outcome classes are imbalanced (class weights are computed for them), so
# both splits are stratified on the label to keep the class ratio the same in
# every subset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=1, stratify=y_train
)

In [None]:
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score

In [None]:
# Classifier configuration:
#   class_weights - the per-class weights in W, to counter class imbalance
#   custom_loss   - additionally track Accuracy during training
#   logging_level - 'Silent' suppresses the per-iteration training log
model = CatBoostClassifier(class_weights=W, custom_loss=['Accuracy'], logging_level='Silent')

In [None]:
# Train on the training split, treating the non-float columns as categorical
# and evaluating on the validation split; plot=True draws the interactive
# training chart. The trailing ';' hides the returned model's repr.
model.fit(X_train, y_train, cat_features=categorical_features_indices, eval_set=(X_val, y_val), plot=True);

In [None]:
# Predicted class labels for the held-out test set.
predictions = model.predict(X_test)
# Predicted class probabilities for the same rows.
predictions_probs = model.predict_proba(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Importing it together with the other metrics would make the whole import
# statement fail on current versions, so it is imported separately and, when
# missing, replaced by a thin wrapper around its successor.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay

    def plot_roc_curve(estimator, X, y, **kwargs):
        """Compatibility shim: forward to RocCurveDisplay.from_estimator."""
        return RocCurveDisplay.from_estimator(estimator, X, y, **kwargs)

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, predictions))

In [None]:
# Confusion matrix on the test set (rows: true labels, columns: predicted labels).
confusion_matrix(y_true=y_test, y_pred=predictions)

In [None]:
# Pair every predictor with the importance score reported by the trained model
# and show the table with the highest scores first.
# Note: this rebinds `df`, which held the filtered case records until now.
importance_scores = model.get_feature_importance(verbose=True)
df = pd.DataFrame({"Features": X.columns, "Importance": importance_scores})
df.sort_values(by='Importance', ascending=False)

In [None]:
# model.plot_tree(0, pool=None)

In [None]:
import matplotlib as mpl

# Use a larger default font for every figure drawn after this point.
mpl.rcParams['font.size'] = 22

In [None]:
import matplotlib.pyplot as plt
# `plot_roc_curve` was removed in scikit-learn 1.2; RocCurveDisplay.from_estimator
# (available since 1.0) is its documented replacement and takes the same
# estimator / X / y / ax arguments.
from sklearn.metrics import RocCurveDisplay

# Two stacked axes; the ROC curve on the test set is drawn on the first one.
fig, ax = plt.subplots(2, 1, figsize=[10, 20])
RocCurveDisplay.from_estimator(model, X_test, y_test, ax=ax.flat[0])