In [None]:
# Using SHAP to get each feature's contribution to the target variable

In [None]:
# Import libraries
import shap
import pandas as pd 
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
from constants import (village_info_col,
                        questionare_col,
                        other_ques_col,
                        option_ques,
                        categorical_ques,
                        regression_feat,
                        categorical_feat,
                        index_col,
                        target_col)

In [None]:
### Load data into dataframe

In [None]:
# Locations of the two survey workbooks (relative to the notebook's working directory).
fans_survey_data = 'covid19-data/Panchayat Survey-FaNS-MGSA(10.05.21).xlsx'
dmks_survey_Dat = 'covid19-data/Panchayat Survey_ DMKS (10.05.21).xlsx'

# Both workbooks are read from a sheet with the same name.
fans_sheet_name = 'Raw Data'
dmks_sheet_name = fans_sheet_name

In [None]:
# Read the FaNS survey responses from the workbook's raw-data sheet.
# To analyse the DMKS survey instead, pass dmks_survey_Dat / dmks_sheet_name here.
fans_df = pd.read_excel(fans_survey_data, sheet_name=fans_sheet_name)

# Quick sanity check: total number of rows, then number of distinct identifiers.
print(len(fans_df))
print(len(fans_df["Identifier"].unique()))

# Keep the column index around and show it as the cell output.
list_of_col = fans_df.columns
list_of_col

In [None]:
### Data Cleaning and Data Preparation

In [None]:
# Keep only rows with "Repeat no" == 1, since data is missing for the other repeats.
fans_df = fans_df[fans_df["Repeat no"] == 1]

# Keep only the index column plus the regression and categorical features.
# .copy() makes query_df an independent frame: assigning into a slice of
# fans_df would otherwise trigger pandas' SettingWithCopyWarning and may not
# behave consistently across pandas versions.
query_df = fans_df[[index_col] + regression_feat + categorical_feat].copy()

# Treat a missing answer in any categorical feature as 0.
query_df[categorical_feat] = query_df[categorical_feat].fillna(0)

# Show the distinct values of every column except the first one, to eyeball the cleaning.
for col in query_df.columns[1:]:
    print(col)
    print(query_df[col].unique())
    print('--------------------------')

In [None]:
### Data Transformation

In [None]:
# Encode the categorical answers as 0/1 so they can be fed to the regressor.
# Both the textual answers and values that are already numeric (1.0 / 0.0) are accepted.
answer_map = {'Yes': 1, 'No': 0, 1. : 1, 0.:0}

for col in categorical_feat:
    encoded = query_df[col].map(answer_map)
    # Series.map() returns NaN for every value that is not a key of answer_map.
    # Fail loudly with the offending values rather than passing silent NaNs
    # on to the model.
    unmapped = query_df.loc[encoded.isna(), col].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {col!r} contains answers missing from answer_map: {list(unmapped)}"
        )
    query_df[col] = encoded

In [None]:
# Re-inspect the distinct values of every column except the first one.
separator = '--------------------------'
for column_name in query_df.columns[1:]:
    print(column_name, query_df[column_name].unique(), separator, sep='\n')

In [None]:
# The index column must identify rows uniquely before it becomes the index.
assert query_df[index_col].is_unique

In [None]:
# Move the index column into the row index so it is not treated as a feature.
# The guard makes this cell safe to re-run: once the column has become the
# index, a second set_index() call would raise a KeyError.
if index_col in query_df.columns:
    query_df = query_df.set_index(index_col)

In [None]:
# Target vector, and feature matrix made of every remaining column.
y = query_df[target_col]
X = query_df.drop(columns=[target_col])

In [None]:
# Preview the first three rows of the feature matrix.
X.head(3)

In [None]:
# Preview the first three target values.
y.head(3)

In [None]:
# Split the data into train and test sets (scikit-learn's default split ratio).
# A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=42)

In [None]:
# Random Forest model: 100 trees, fitted on the training split only.
# A fixed random_state keeps the fitted forest, and therefore the scores and
# SHAP values derived from it, reproducible between runs.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(Xtrain, ytrain)

In [None]:
# Score the model three ways: on the data it was fitted on, with 5-fold
# cross-validation over the training split, and on the held-out test split.
rf_train = rf_reg.score(Xtrain, ytrain)
rf_cv = cross_val_score(rf_reg, Xtrain, ytrain, cv=5).mean()
rf_test = rf_reg.score(Xtest, ytest)
print('Evaluation of the Random Forest performance\n')
# Use the built-in round() rather than the .round() method: the method only
# exists on NumPy scalars, so it raises AttributeError whenever score()
# returns a plain Python float (this depends on the scikit-learn version).
print(f'Training score: {round(rf_train, 4)}')
print(f'Cross validation score: {round(rf_cv, 4)}')
print(f'Test score: {round(rf_test, 4)}')

In [None]:
### SHAP values

In [None]:
# Load SHAP's JavaScript visualization code into the notebook
# (presumably required for the interactive force plots to render).
shap.initjs()

In [None]:
# Create a tree-model SHAP explainer for the fitted random forest, then
# compute SHAP values for every row of X (the full feature matrix, i.e.
# both the training and the test rows).
explainer = shap.TreeExplainer(rf_reg)
shap_values = explainer.shap_values(X)

In [None]:
# Force plot explaining the prediction for the first row of X.
first_row_shap = shap_values[0]
first_row_features = X.iloc[0]
shap.force_plot(explainer.expected_value, first_row_shap, first_row_features)

In [None]:
# Force plot over the SHAP values of every row in X (all predictions at once).
shap.force_plot(explainer.expected_value, shap_values, X)

In [None]:
# Effect of a single feature on its SHAP value; SHAP automatically selects
# a second feature to show the dependence between the two.
shap.dependence_plot("Children (0-6 years)", shap_values, X)

In [None]:
# See how every feature contributes to the model output.
shap.summary_plot(shap_values, X)

In [None]:
# Same summary, drawn as a bar chart (plot_type="bar").
shap.summary_plot(shap_values, X, plot_type="bar")