# Import Library

In [2]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import shap
from scipy.stats import chi2_contingency

# Define Path/Feature-list

In [1]:
# Output locations for the SHAP artefacts and the chi-square table.
SHAP_IMAGE_PATH = './database/'
SHAP_VALUE_PATH = './database/mean_shap_table.csv'
CHI2_TABLE_PATH = './database/chi2_table.csv'

# Input locations: preprocessed feature data and the label file.
PREPROCESSED_DATA_PATH = './database/data.csv'
LABEL_DATA_PATH = './database/label.csv'

# Names of the categorical columns passed to the chi-square test.
CATEGORICAL_FEATURES = [
    'ChildrenInHH',
    'HandsetRefurbished',
    'HandsetWebCapable',
    'TruckOwner',
    'RVOwner',
    'BuysViaMailOrder',
    'RespondsToMailOffers',
    'OptOutMailings',
    'NonUSTravel',
    'OwnsComputer',
    'HasCreditCard',
    'NewCellphoneUser',
    'NotNewCellphoneUser',
    'OwnsMotorcycle',
    'MadeCallToRetentionTeam',
    "PrizmCode",
    "Occupation",
    "MaritalStatus",
    "Division",
    'CreditRating',
    'HandsetPrice',
    'Homeownership',
]

# Define Fetch Function

In [3]:
def calc_shap_values(data, model):
    """Compute SHAP values for ``data`` under ``model``.

    Args:
        data: The x data that was used to train the model.
        model: Model object handed to ``shap.Explainer``.

    Returns:
        The result of ``explainer.shap_values(data)``.
    """
    return shap.Explainer(model).shap_values(data)

def create_mean_shap_table(data, shap_values, file_path=SHAP_VALUE_PATH):
    """Write the mean absolute SHAP value of every feature to a CSV file.

    The table has one row per column of ``data`` (the feature name is the
    index), a single column named ``mean(|SHAP value|)``, and is sorted from
    the largest mean to the smallest.

    Args:
        data: DataFrame whose columns name the features; only ``data.columns``
            is read.
        shap_values: 2-D array-like of SHAP values with one column per
            feature, in the same order as ``data.columns``.
        file_path: Destination CSV path.
    """
    # Averaging |SHAP| column-wise over all rows at once gives the same table
    # as slicing into fixed 5000-row chunks and re-joining them, without
    # depending on how many rows there are.
    abs_shap = pd.DataFrame(shap_values, columns=data.columns).abs()
    mean_abs_shap = abs_shap.mean(axis=0).sort_values(ascending=False)

    shap_df = mean_abs_shap.to_frame(name='mean(|SHAP value|)')
    shap_df.to_csv(file_path)

def create_shap_images(data, shap_values, feature_names):
    """Render the SHAP summary plots and save them as PNG files.

    Two images are written under ``SHAP_IMAGE_PATH``: the default summary
    plot (``shap_summary_plot.png``) and its bar variant
    (``shap_summary_plot_bar.png``).

    Args:
        data: Feature data passed to ``shap.summary_plot``.
        shap_values: SHAP values passed to ``shap.summary_plot``.
        feature_names: Feature labels passed to ``shap.summary_plot``.
    """
    # (output file name, extra keyword arguments for shap.summary_plot)
    plot_specs = (
        ('shap_summary_plot.png', {}),
        ('shap_summary_plot_bar.png', {'plot_type': "bar"}),
    )

    for image_name, extra_kwargs in plot_specs:
        shap.summary_plot(
            shap_values,
            data,
            feature_names=feature_names,
            max_display=None,
            show=False,
            **extra_kwargs,
        )
        # Save under the chosen file name, then close the current figure.
        plt.savefig(SHAP_IMAGE_PATH + image_name, bbox_inches='tight', dpi=300)
        plt.close()

def fetch_shap_value(data, model):
    """Run the SHAP workflow: compute values, save the table, save the plots.

    Args:
        data: Feature DataFrame forwarded to every step; its column names are
            used as the plot feature names.
        model: Model forwarded to ``calc_shap_values``.
    """
    computed_values = calc_shap_values(data=data, model=model)

    # Tabular summary, written to SHAP_VALUE_PATH.
    create_mean_shap_table(data, computed_values, SHAP_VALUE_PATH)

    # Summary plots, labelled with the data's column names.
    create_shap_images(data, computed_values, list(data.columns))

def fetch_chi_square(data, label, feature_names, file_path=CHI2_TABLE_PATH):
    """Rank categorical features by a chi-square test against the label.

    For each feature a contingency table of ``data[feature]`` versus ``label``
    is built and passed to ``chi2_contingency``. The chi-square statistic and
    p-value of every tested feature are written to ``file_path`` as CSV,
    sorted by ascending p-value.

    Args:
        data: Data used for training; must contain every tested feature as a
            column.
        label: Target values used for training, cross-tabulated against each
            feature.
        feature_names: Names of the categorical features to test.
            ``'ServiceArea'`` and ``'Churn'`` are skipped.
        file_path: Destination CSV path.
    """
    excluded_features = {'ServiceArea', 'Churn'}

    chi2_stats = {}
    for feature in feature_names:
        if feature in excluded_features:
            continue

        # Contingency table between this feature and the label.
        contingency_table = pd.crosstab(data[feature], label)

        # Only the statistic and the p-value are kept.
        chi2, p_value, _dof, _expected = chi2_contingency(contingency_table)
        chi2_stats[feature] = (chi2, p_value)

    # Naming the columns explicitly keeps 'chi2' and 'p_value' present even
    # when no feature was tested, so the sort below cannot raise KeyError.
    chi2_stats_df = pd.DataFrame.from_dict(
        chi2_stats, orient='index', columns=['chi2', 'p_value']
    )
    chi2_stats_df = chi2_stats_df.sort_values(by=['p_value'])

    chi2_stats_df.to_csv(file_path)

# Data Load

In [6]:
# Load the preprocessed dataset and the label file through the shared path
# constants so these reads cannot drift from the paths used elsewhere.
total_data = pd.read_csv(PREPROCESSED_DATA_PATH)

label = pd.read_csv(LABEL_DATA_PATH)

# Fetch Feature Importance

In [None]:
from xgboost import XGBClassifier

# Split the previously loaded frame into features (everything except 'Churn')
# and the 'Churn' label column.
# NOTE(review): both `data` and `label` are reassigned further down before
# they are used, so this split currently has no effect — confirm intent.
data = total_data.drop(columns=['Churn'])
label = total_data['Churn']

# Load the trained model from disk.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load model files from a trusted source.
with open('./model/model.pkl', 'rb') as f:
        model = pickle.load(f)

# model = XGBClassifier()

# model.fit()


# Feature importance from SHAP values.
# NOTE(review): `data` is re-read here without dropping 'Churn', although the
# split above implies the file contains that column — confirm the model
# expects these exact columns.
data = pd.read_csv(PREPROCESSED_DATA_PATH)
fetch_shap_value(data=data, model=model)

# Feature importance from the chi-square statistic / p-value.
# The feature data and the label file are re-read and joined column-wise; the
# label passed to the test is taken from data['Churn'].
# NOTE(review): the `label` frame read here is only used in the concat, not as
# the test label — verify this is intended.
data = pd.read_csv(PREPROCESSED_DATA_PATH)
label = pd.read_csv(LABEL_DATA_PATH)
total_data = pd.concat([data, label], axis=1)
fetch_chi_square(data=total_data, label=data['Churn'], feature_names=CATEGORICAL_FEATURES)

# Reference — scikit-learn notes on the security and maintainability limitations of persisted (pickled) models:
# https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
