In [None]:
!pip install feature-engine
!pip install category_encoders
!pip install shap
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
from category_encoders import *
from sklearn.preprocessing import OneHotEncoder
from feature_engine import categorical_encoders as ce
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import check_cv, KFold
from category_encoders import CatBoostEncoder
import shap

In [None]:
# Colab-only: upload the Kaggle API token and install it where the kaggle CLI looks for it.
from google.colab import files
files.upload() # opens an upload prompt; select your kaggle.json API token

# Install the CLI, copy the uploaded token into ~/.kaggle and list the directory as a check.
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
# NOTE(review): uses /root/.kaggle while the lines above use ~/.kaggle — same place only when running as root.
!chmod 600 /root/.kaggle/kaggle.json  # set permission

# Functions

## models_test
  Строятся модели Logreg and RandomForest
  
  model_type - тип модели. По стандарту Logreg
    
    1 - Logreg
    2 - RandomForest
  en_type - тип энкодинга (используется только для вывода)

 Пример:

  models_test(X_train, X_test, y_train, y_test, en_type='catboost')

  -> catboost LogisticRegression ROC_AUC= 0.70414

In [None]:
def models_test(X_train, X_test, y_train, y_test, model_type=1, en_type='TEST', _plots=False):
  """Fit a baseline model on the encoded features and print its test ROC AUC.

  Args:
    X_train, X_test: encoded feature DataFrames (column names label the plots).
    y_train, y_test: targets of the train / test splits.
    model_type: 1 - LogisticRegression (default), 2 - RandomForestRegressor.
    en_type: name of the encoding under test; only used in printed/plotted text.
    _plots: when True, also draw the feature-importance plots
      (and a SHAP beeswarm plot for the logistic regression).

  Returns:
    None; the score is printed.

  Raises:
    ValueError: if `model_type` is neither 1 nor 2.
  """
  if model_type == 1:
    clf = LogisticRegression(C=1, solver="lbfgs", max_iter=5000)
    clf.fit(X_train, y_train)
    print(en_type, 'LogisticRegression ROC_AUC=',
          metrics.roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
    if _plots:
      make_feature_plot(X_train, 'LogisticRegression', en_type, clf.coef_[0])
      # Feature names come from the training frame itself; the previous code asked
      # an unfitted TfidfVectorizer for them and misspelled the explainer variable.
      explainer = shap.Explainer(clf, X_train, feature_names=list(X_train.columns))
      shap_values = explainer(X_test)
      shap.plots.beeswarm(shap_values)
  elif model_type == 2:
    rfr = RandomForestRegressor(n_estimators=100, random_state=0)
    rfr.fit(X_train, y_train)
    # The regressor's raw predictions are used as ranking scores for ROC AUC.
    print(en_type, 'RandomForest ROC_AUC=',
          metrics.roc_auc_score(y_test, rfr.predict(X_test)))
    if _plots:
      make_feature_plot(X_train, 'RandomForestRegressor', en_type, rfr.feature_importances_)
  else:
    raise ValueError(f"model_type must be 1 (LogisticRegression) or 2 (RandomForest), got {model_type!r}")

## make_feature_plot
Делает график по фичам

In [None]:
def make_feature_plot(df, model_type, en_type, f_coefs):
    """Draw a horizontal bar chart of relative feature importances.

    The absolute values of `f_coefs` are rescaled so that the largest equals 100
    and plotted in ascending order, labelled with the columns of `df`.

    Args:
      df: frame whose column names label the bars (same order as `f_coefs`).
      model_type: model name shown in the x-axis label.
      en_type: encoding name shown in the x-axis label.
      f_coefs: array of coefficients / importances, one per column of `df`.
    """
    magnitudes = abs(f_coefs)
    relative = 100.0 * (magnitudes / magnitudes.max())
    order = np.argsort(relative)
    # Bars are centred on half-integers so that they line up with the tick labels.
    bar_centers = np.arange(order.shape[0]) + .5

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.barh(bar_centers, relative[order], align='center')
    ax.set_yticks(bar_centers)
    ax.set_yticklabels(np.array(df.columns)[order], fontsize=8)
    ax.set_xlabel(f'{model_type} {en_type} Relative Feature Importance')
    plt.tight_layout()
    plt.show()

## encode_ohe 
меняет данные с помощью OneHotEncoder - разбивает категориальные переменные на бинарные переменные принадлежности к категории

X_train, X_test - обучающий и тестовый DataFrame

cols колонки, к которым применить

In [None]:
def encode_ohe(X_train, X_test, cols):
  """One-hot encode `cols`: fit on the train frame, apply the same mapping to test.

  Args:
    X_train, X_test: DataFrames that both contain every column in `cols`.
    cols: names of the categorical columns to replace with 0/1 indicator columns.

  Returns:
    (X_train, X_test) with `cols` dropped and the indicator columns
    `<col>_<category>` appended. Categories seen only in the test frame become
    all-zero rows (`handle_unknown='ignore'`).
  """
  cols = list(cols)
  if not cols:
    # Nothing to encode; OneHotEncoder cannot be fitted on zero columns.
    return X_train, X_test

  # `sparse` was renamed to `sparse_output` in newer scikit-learn releases; support both.
  try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
  except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

  train_encoded = OH_encoder.fit_transform(X_train[cols])
  test_encoded = OH_encoder.transform(X_test[cols])

  # Label the columns from the fitted encoder: it orders categories by sorted value,
  # not by order of appearance, so names built from `unique()` would be attached
  # to the wrong indicator columns.
  col_names = [f"{col}_{cat}"
               for col, cats in zip(cols, OH_encoder.categories_)
               for cat in cats]

  # The encoder returns bare arrays, so restore the original row index for the concat.
  OH_train = pd.DataFrame(train_encoded, columns=col_names, index=X_train.index)
  OH_test = pd.DataFrame(test_encoded, columns=col_names, index=X_test.index)

  # Swap the raw categorical columns for their one-hot expansion.
  X_train = pd.concat([X_train.drop(cols, axis=1), OH_train], axis=1)
  X_test = pd.concat([X_test.drop(cols, axis=1), OH_test], axis=1)
  return X_train, X_test

## encode_rare
Group rare categories
X_train, X_test - обучающий и тестовый DataFrame

cols - колонки, к которым применить

min_val - минимальная частота категории; категории с меньшей частотой считаются незначимыми и группируются

n_cat минимальное число категорий. если меньше - группировки не произойдет и будет варнинг
заменяем редкие значения на 'Rare'

In [None]:
def encode_rare(X_train, X_test, cols, min_val=0.05, n_cat=1):
  """Group infrequent categories of `cols` under the label 'Rare'.

  Args:
    X_train, X_test: DataFrames containing `cols`.
    cols: categorical columns to process.
    min_val: passed to the encoder as `tol` (frequency threshold).
    n_cat: passed to the encoder as `n_categories`.

  Returns:
    (X_train, X_test) with rare labels replaced.
  """
  encoder = ce.RareLabelCategoricalEncoder(tol=min_val, variables=cols, n_categories=n_cat, replace_with='Rare')
  # Learn the frequent categories from the training data only ...
  X_train = encoder.fit_transform(X_train)
  # ... and reuse them for the test data. Re-fitting on the test frame would give it
  # its own set of "frequent" labels, inconsistent with what the model was trained on.
  X_test = encoder.transform(X_test)
  return X_train, X_test

## encode_tgmean
Перекодирует категориальные переменные по mean таргета в категориях

cols - колонки, к которым применить

Пример:
X_train, X_test, y_train = encode_tgmean(X_train, X_test, y_train, ["room", "temp"])

In [None]:
def encode_tgmean(X_train, X_test, y_train, cols=None):
  """Replace categories with the mean of the target in that category.

  Args:
    X_train, X_test: DataFrames containing `cols`.
    y_train: target of the training split, used to learn the category means.
    cols: columns to encode (passed to the encoder as `variables`).

  Returns:
    (X_train, X_test, y_train); `y_train` is returned unchanged.
  """
  encoder = ce.MeanCategoricalEncoder(variables=cols)

  # Learn the per-category target means on the training split only.
  encoder.fit(X_train, y_train)

  X_test = encoder.transform(X_test)
  X_train = encoder.transform(X_train)

  # Fill gaps left in the test frame with the training-column mean.
  # Only numeric columns present in both frames are touched: taking `.mean()` of a
  # remaining text column, or indexing a column absent from X_test, would raise.
  numeric_cols = X_train.select_dtypes(include='number').columns.intersection(X_test.columns)
  X_test[numeric_cols] = X_test[numeric_cols].fillna(X_train[numeric_cols].mean())
  return X_train, X_test, y_train

## encode_woe
Перекодирование на основе Weight of evidence:

log(частота события в классе / частота не-события в классе)

cols - колонки, к которым применить

Пример:
X_train, X_test, y_train = encode_woe(X_train, X_test, y_train, ["room", "temp"])

In [None]:
def encode_woe(X_train, X_test, y_train, cols=None):
  """Encode `cols` with the weight of evidence learned on the training split.

  Args:
    X_train, X_test: DataFrames containing `cols`.
    y_train: training target used to fit the encoder.
    cols: columns to encode (passed to the encoder as `variables`).

  Returns:
    (X_train, X_test, y_train); `y_train` is returned unchanged.
  """
  encoder = ce.WoERatioCategoricalEncoder(encoding_method='woe', variables=cols)
  encoder.fit(X_train, y_train)

  # Same fitted mapping for both splits, train first and then test.
  train_encoded, test_encoded = (encoder.transform(part) for part in (X_train, X_test))
  return train_encoded, test_encoded, y_train

## encode_dtree
Категориальная переменная сначала кодируется в ordinal, затем значения заменяются предсказаниями decision tree

cols - колонки, к которым применить

Пример:
X_train, X_test, y_train = encode_dtree(X_train, X_test, y_train, ["room", "temp"])

In [None]:
def encode_dtree(X_train, X_test, y_train, cols=None):
  """Replace the categories of `cols` with the predictions of a decision tree.

  Args:
    X_train, X_test: DataFrames containing `cols`.
    y_train: training target used to fit the encoder.
    cols: columns to encode (passed to the encoder as `variables`).

  Returns:
    (X_train, X_test, y_train); `y_train` is returned unchanged.
  """
  tree_encoder = ce.DecisionTreeCategoricalEncoder(random_state=0, variables=cols)
  tree_encoder.fit(X_train, y_train)

  encoded_train = tree_encoder.transform(X_train)
  encoded_test = tree_encoder.transform(X_test)
  return encoded_train, encoded_test, y_train

## encode_cb
(countInclass + y.mean)/ (totalCount + a)

mean encoding с Additive smoothing, коэф a = 1, менять его нет необходимости


cols - колонки, к которым применить

Пример:
X_train, X_test, y_train = encode_cb(X_train, X_test, y_train, ["room", "temp"])

In [None]:
def encode_cb(X_train, X_test, y_train, col=None):
  """Encode `col` with category_encoders' CatBoostEncoder fitted on the training split.

  Args:
    X_train, X_test: DataFrames containing the columns in `col`.
    y_train: training target used to fit the encoder.
    col: columns to encode (passed to the encoder as `cols`).

  Returns:
    (X_train, X_test, y_train); `y_train` is returned unchanged.
  """
  cb_encoder = CatBoostEncoder(cols=col)
  cb_encoder.fit(X_train, y_train)

  # Both splits go through `transform` without the target.
  # NOTE(review): for the training split this presumably yields the plain smoothed
  # category mean rather than CatBoost's ordered statistic — confirm this is intended.
  encoded = [cb_encoder.transform(part) for part in (X_train, X_test)]
  return encoded[0], encoded[1], y_train

## encode_cbfold
Разбивает train на 5 частей, для каждой части находит mean с помощью CatBoost, исходя из оставшихся частей.


cols - колонки, к которым применить

Пример:
X_train, X_test, y_train, y_test = encode_cbfold(X_train, X_test, y_train, y_test, ["room", "temp"])

In [None]:
class TargetEncoderCV(BaseEstimator, TransformerMixin):
    """Out-of-fold CatBoost target encoder.

    `fit_transform` encodes every training row with an encoder fitted on the other
    CV folds; `transform` encodes new data with the average of all fold encoders.

    Args:
      colns: list of column names to encode.
      cv: anything accepted by `sklearn.model_selection.check_cv`.
      **cbe_params: extra keyword arguments forwarded to `CatBoostEncoder`.
    """

    def __init__(self, colns, cv, **cbe_params):
        self.cv = cv
        self.colns = colns
        self.cbe_params = cbe_params

    @property
    def _n_splits(self):
        return check_cv(self.cv).n_splits

    def _with_encoded(self, X: pd.DataFrame, encoded: np.ndarray) -> pd.DataFrame:
        """Return `X` with the raw `colns` replaced (at the end) by `encoded`."""
        # Keep X's own index so the concat aligns row by row whatever index the caller uses.
        encoded_df = pd.DataFrame(encoded, columns=self.colns, index=X.index)
        return pd.concat([X.drop(self.colns, axis=1), encoded_df], axis=1)

    def fit(self, X: pd.DataFrame, y):
        """Fit the per-fold encoders; provided for scikit-learn API completeness."""
        self.fit_transform(X, y)
        return self

    def fit_transform(self, X: pd.DataFrame, y) -> pd.DataFrame:
        """Fit one encoder per CV fold and return the out-of-fold encoding of `X`."""
        self.cbe_ = []
        cv = check_cv(self.cv)

        cbe = CatBoostEncoder(
            cols=self.colns,
            return_df=False,
            **self.cbe_params
        )
        X_cat = X[self.colns]
        # Give the target the same index as X so each fold's X and y slices stay aligned.
        y_aligned = pd.Series(np.ravel(np.asarray(y)), index=X.index)

        X_transformed = np.zeros(X_cat.shape, dtype=np.float64)
        for train_idx, valid_idx in cv.split(X_cat, y_aligned):
            # cv.split yields positions, hence `.iloc` (label-based `.loc` only
            # worked when the index happened to be 0..n-1).
            fold_encoder = clone(cbe).fit(X_cat.iloc[train_idx], y_aligned.iloc[train_idx])
            self.cbe_.append(fold_encoder)
            X_transformed[valid_idx] = fold_encoder.transform(X_cat.iloc[valid_idx])

        return self._with_encoded(X, X_transformed)

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Encode `X` with the mean of the fold encoders fitted by `fit_transform`."""
        X_cat = X[self.colns]
        X_transformed = np.zeros(X_cat.shape, dtype=np.float64)
        for cbe in self.cbe_:
            # Average over the encoders actually fitted.
            X_transformed += cbe.transform(X_cat) / len(self.cbe_)
        return self._with_encoded(X, X_transformed)

def encode_cbfold(X_train, X_test, y_train, y_test, cols=None):
  """Encode `cols` with the 5-fold out-of-fold CatBoost encoder (`TargetEncoderCV`).

  All inputs are re-indexed to 0..n-1 so that features and targets of the same
  split keep matching indices.

  Args:
    X_train, X_test: DataFrames containing `cols`.
    y_train: training target (pandas Series).
    y_test: test target, or None when there are no test labels.
    cols: columns to encode.

  Returns:
    (X_train, X_test, y_train, y_test) with `cols` encoded in both frames.
  """
  te_cv = TargetEncoderCV(cols, KFold(n_splits=5))
  X_train = X_train.reset_index(drop=True)
  y_train = y_train.reset_index(drop=True)
  X_test = X_test.reset_index(drop=True)
  # Keep y_test aligned with the re-indexed X_test, tolerating a missing test target.
  if y_test is not None:
    y_test = y_test.reset_index(drop=True)

  X_train = te_cv.fit_transform(X_train, y_train)
  X_test = te_cv.transform(X_test)

  return X_train, X_test, y_train, y_test

## encode_helm
https://contrib.scikit-learn.org/category_encoders/_modules/category_encoders/helmert.html#HelmertEncoder

Категориальная сначала переводится в Ordinal, а потом высчитывается mean на основе сравнения с mean "оставшихся категорий"

Например mean категории А, сравнивается с B, C, D
B с C,D  и т.д. На основе этого строится матрица хелмерта и по ней создаются новые категориальные переменные (как в OHE) https://www.ibm.com/support/knowledgecenter/en/SSLVMB_24.0.0/spss/common/catvar_coding_helmert.html

Наиболее полезен, когда категории упорядочены от меньшего к большему или наоборот


cols - колонки, к которым применить

Пример:
X_train, X_test, y_train = encode_helm(X_train, X_test, y_train, ["room", "temp"])

In [None]:
def encode_helm(X_train, X_test, y_train, cols=None):
  """Encode `cols` with category_encoders' HelmertEncoder fitted on the training split.

  Args:
    X_train, X_test: DataFrames containing `cols`.
    y_train: training target, passed through to `fit`.
    cols: columns to encode.

  Returns:
    (X_train, X_test, y_train); `y_train` is returned unchanged.
  """
  helmert = HelmertEncoder(cols=cols, handle_unknown='value', handle_missing='value')
  helmert.fit(X_train, y_train)
  return helmert.transform(X_train), helmert.transform(X_test), y_train

# Основной раздел

## Загрузка данных
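для примера используются данные соревнования House Prices (именно они скачиваются в ячейке ниже):
https://www.kaggle.com/c/house-prices-advanced-regression-techniques/overview

(исходная версия ноутбука работала с категориальными данными https://www.kaggle.com/c/cat-in-the-dat/overview)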

In [None]:
# Download the House Prices competition files into /drive/kaggle/houseprices.
# NOTE(review): Google Drive is mounted at /content/drive in the first cell, so this
# target lies outside the mounted Drive — confirm the path is intended.
!kaggle competitions download -c house-prices-advanced-regression-techniques -p /drive/kaggle/houseprices

In [None]:
# df  - competition train.csv (later cells read its 'SalePrice' column as the target)
# df1 - competition test.csv
df=pd.read_csv("/drive/kaggle/houseprices/train.csv")
df1=pd.read_csv("/drive/kaggle/houseprices/test.csv")

In [None]:
# `drop` returns a new frame, so the result has to be assigned back — otherwise
# the 'Id' column silently stays in both frames and ends up as a model feature.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=['Id'], errors='ignore')
df1 = df1.drop(columns=['Id'], errors='ignore')

In [None]:

# Separate the target from the predictors of the training frame.
target = df["SalePrice"]
features = df.drop(['SalePrice'], axis=1)
# Train and test predictors stacked together, only for the overview below.
# (The previous version referenced `X_train` and `TEST`, which are not defined at
# this point on a fresh kernel.)
dfall = pd.concat([features, df1])
dfall.info()

In [None]:
# Preview the first five rows of the training frame.
df.head(5)


In [None]:
# Correlations are only defined for numeric columns; selecting them explicitly keeps
# this working on pandas versions where `corr()` raises on text columns.
corrmat = df.select_dtypes(include='number').corr()
saleprice_corr = corrmat['SalePrice'].sort_values()

# One bar per numeric column: its correlation with SalePrice, ascending.
plt.figure(figsize=(10, 17))
sns.barplot(y=saleprice_corr.index, x=saleprice_corr.values)
plt.xlabel('correlation between SalePrice')
plt.show()

In [None]:
# Names of the object-dtype (categorical) columns of the training frame,
# and the number of distinct values in each of them.
cat_col = df.select_dtypes(include='object').columns.tolist()
n_levels = df[cat_col].nunique()
print("cardinality of categorical columns:\n", n_levels)

In [None]:
def fill_missing_data(df: pd.DataFrame):
    """Fill missing values of `df` in place.

    Text (object/string) columns get the placeholder category 'Rare';
    every other column gets its own median.

    Args:
      df: frame to modify in place.

    Returns:
      None.
    """
    for col_ in df.columns:
        column = df[col_]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            # Missing categories become their own 'Rare' label.
            filled = column.fillna('Rare')
        else:
            filled = column.fillna(column.median())
        # Assign the column back instead of `df[col_].fillna(..., inplace=True)`:
        # the chained in-place form may act on a temporary copy and leave `df` unchanged.
        df[col_] = filled
        
fill_missing_data(df)
fill_missing_data(df1)
# `features` was created from `df` with `drop(...)` before this cell, i.e. it is a
# separate copy and does not see the fill above — fill it too, since it is what
# becomes X_train further down.
fill_missing_data(features)

In [None]:
# Hold-out variant: 60/40 split of the labelled data with a fixed seed.
# NOTE(review): the next cell overwrites all four variables, so when the notebook is
# run top to bottom this split has no effect.
X_train, X_test, y_train, y_test = train_test_split(features,target,test_size=0.4,random_state=0)


In [None]:
# Submission variant: train on all labelled rows, predict the competition test frame.
X_train = features
X_test = df1
y_train = target
# No labels for the test frame. NOTE(review): later cells that score against y_test
# (np.log(y_test), roc_auc_score(y_test, ...)) cannot work with None.
y_test = None

## Пример работы с функциями

In [None]:
# Columns with more than 50 distinct values count as high-cardinality.
high_cardinal = [c for c in cat_col if features[c].nunique() > 50]
# Keep the original column order: `list(set(...) - set(...))` has no defined order,
# so the layout of the encoded frame could change from run to run.
low_cardinal = [c for c in cat_col if c not in high_cardinal]

# One-hot encode the low-cardinality columns; the remaining ones stay in cat_col
# for the target encoders below.
X_train, X_test = encode_ohe(X_train, X_test, low_cardinal)
cat_col = [c for c in cat_col if c not in low_cardinal]

In [None]:
# Columns that were one-hot encoded in the previous cell.
print(low_cardinal) 

In [None]:
def models_test(X_train, X_test, y_train, y_test, model_type=1, en_type='TEST', _plots=False, C_par=1.0):
  """Fit an XGBRegressor, print its training R^2 and write a submission file.

  NOTE(review): this definition shadows the `models_test` defined earlier in the
  notebook. `y_test`, `model_type`, `_plots` and `C_par` are accepted but not used.

  Args:
    X_train, y_train: training features and target.
    X_test: features to predict; the predictions fill the 'SalePrice' column of the
      sample submission.
    en_type: prefix of the output file, `<en_type>_submission.csv`.

  Returns:
    None.
  """
  regressor = XGBRegressor()
  regressor.fit(X_train, y_train)
  # Score on the training data itself (no hold-out in this setup).
  print('R^2 =', regressor.score(X_train, y_train))

  predictions = regressor.predict(X_test)

  submission = pd.read_csv('/drive/kaggle/houseprices/sample_submission.csv')
  submission['SalePrice'] = predictions
  submission.to_csv(f'{en_type}_submission.csv', index=False)

In [None]:
# Out-of-fold CatBoost encoding of the categorical columns still listed in cat_col.
X_train, X_test, y_train, y_test = encode_cbfold(X_train, X_test, y_train, y_test, cols=cat_col)

In [None]:
# Fit and write 'cbfold_logreg_hp_submission.csv'.
# NOTE(review): the models_test defined just above does not use _plots or C_par.
models_test(X_train, X_test, y_train, y_test, en_type='cbfold_logreg_hp', _plots=True, C_par=4.690615956503463)

In [None]:
xgb = XGBRegressor()

# test_size = 0
xgb.fit(X_train, y_train)
print('R^2 =', xgb.score(X_train, y_train))

ypred = xgb.predict(X_test)
if y_test is not None:
    # RMSE of the log-prices. The square root is taken explicitly because
    # `squared=False` is not accepted by recent scikit-learn releases.
    rmsle = np.sqrt(metrics.mean_squared_error(np.log(y_test), np.log(ypred)))
    print('Score =', rmsle)
else:
    # Submission setup: there are no test labels to score against.
    print('Score = n/a (y_test is None)')

In [None]:
# Fit and write 'cb_submission.csv'.
models_test(X_train, X_test, y_train, y_test, en_type='cb', _plots=True)
# Recorded results for different encoder combinations.
# NOTE(review): these are LogisticRegression ROC_AUC values, presumably from the
# earlier classification version of models_test — confirm they are still relevant.
#cb_fold LogisticRegression ROC_AUC= 0.793521519143426  helmert+ohe+cb_fold
#cb_fold LogisticRegression ROC_AUC= 0.7932658732871862 ohe+cb_fold(onlycat)
#cb_fold LogisticRegression ROC_AUC= 0.7963710225797453 cb_fold
#cb_fold LogisticRegression ROC_AUC= 0.7960390000124247 ohe+cb_fold(all)
#cb LogisticRegression ROC_AUC= 0.7743017494958631 ohe+cb only use
#cb LogisticRegression ROC_AUC= 0.774775144144356 helmert+ohe+cb

In [None]:
# The positional 2 is model_type (RandomForest in the first models_test).
# NOTE(review): the later models_test definition ignores model_type and _plots.
models_test(X_train, X_test, y_train, y_test, 2, en_type='cb_fold', _plots=True)

In [None]:
# Logistic-regression baseline; ROC AUC is only meaningful for a binary target with a
# labelled test split. With the regression target / y_test = None used above, the
# unguarded version raised, so the cell now reports why it is skipped instead.
if y_test is not None and pd.Series(y_train).nunique() == 2:
    clf = LogisticRegression(C=1, solver="lbfgs", max_iter=5000)
    clf.fit(X_train, y_train)
    print('LogisticRegression ROC_AUC=',
          metrics.roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
else:
    print('LogisticRegression ROC_AUC= n/a (needs a binary target and a labelled test split)')