### Correlation HeatMap
Beautiful HeatMap for showing feature correlation

In [2]:
def make_btf_heatmap(df_corrs, title):
    fig, ax = plt.subplots(figsize=(12, 10))
    mask = np.triu(np.ones_like(df_corrs, dtype=np.bool))
    mask = mask[1:, :-1]
    corr = df_corrs.iloc[1:,:-1].copy()
    cmap = sns.diverging_palette(0, 230, 90, 60, as_cmap=True)
    sns.heatmap(corr,
                mask=mask, 
                annot=True,
                fmt=".2f",
                linewidths=5,
                cmap=cmap,
                vmin=-1,
                vmax=1, 
                cbar_kws={"shrink": .8},
                square=True)
    
    yticks = [i.upper() for i in corr.index]
    xticks = [i.upper() for i in corr.columns]
    plt.yticks(plt.yticks()[0], labels=yticks, rotation=0)
    plt.xticks(plt.xticks()[0], labels=xticks)
    plt.title(title, loc='left', fontsize=18)
    plt.show()

### Progress Bar
Shows progress bar when iterating over objects

In [4]:
from tqdm.notebook import tqdm as log_progress
for i in log_progress(range(1)):
    print(i)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

0



### NaN Ratio Hist
Plots a histogram of NaN values for features

In [1]:
def plot_nan_ration(df, figsize=(10,5), title='Numerical Features'):
    """
    Plots a histogram with NaN ration and its features 
    It only takes into account the fact that the target features is located in the end of a DataFrame!
    
    """
    fig = plt.figure(figsize=figsize)
    plt.hist(df.iloc[:,:-1].isna().sum()/df.shape[0])
    plt.xlabel('NaN Fraction')
    plt.ylabel('Number of Features')
    plt.title(title)
    plt.grid();

### Multiple Plot (Bar, Count...)

In [1]:
# fig, axes = plt.subplots(7, 3, figsize=(25, 25))
# for indx, feature in enumerate(cat_features[columns].iloc[:, :-1]):
#     sns.countplot(data=cat_features[columns], x=feature, hue='is_churn', ax=axes[indx // 3, indx % 3])

### Multidimensional Data Plotting 


In [2]:
from category_encoders.cat_boost import CatBoostEncoder
from sklearn.decomposition import PCA

# Функция для визуализации многомерной выборки
def plot_multidim_data(x_train,
                       y_train, 
                       cat_bin_imputer=SimpleImputer(strategy='most_frequent'),
                       cat_bin_encoder=CatBoostEncoder(random_state=SEED),
                       num_imputer=SimpleImputer(strategy='mean'),
                       num_scaler=StandardScaler(),
                       dim_red_method=PCA(n_components=2),
                       sampling_method=None,
                       fig_size=(12,6),
                       title='2D Feature Space (PCA)'):
    """
    x_train - matrix of features that includes nominal features (categorical and binary features) and continious features
    y_train - target feature
    
    x_train is then splitted according features types. Each feature is then processed separately
    
    Continuous Features must have int64 or float64 data type;
    Nominal Features must have data type Object;
    
    """
    
    # Features Selection
    num_features = x_train.select_dtypes(include=['float64', 'int64'])
    cat_bin_features = x_train.select_dtypes(include=['object'])
    
    # Categorical/Binary Features Processing 
    cat_bin_features = cat_bin_imputer.fit_transform(cat_bin_features)
    cat_bin_features = cat_bin_encoder.fit_transform(cat_bin_features, y_train)
    
    # Numerical Features Processing
    num_features = num_imputer.fit_transform(num_features)
    num_features = num_scaler.fit_transform(num_features)

    # Final Matrix of Features
    final_matrix = np.hstack((num_features, cat_bin_features))
    
    target_name = y_train.name
    
    # If there is a sampling method (over/down)
    if sampling_method:
        final_matrix, y_train = sampling_method.fit_resample(final_matrix, y_train.values)
        
    final_matrix_2d = pd.DataFrame(dim_red_method.fit_transform(final_matrix))
    final_matrix_2d[target_name] = y_train
    
    print('Number of Samples (Negative Class): ', final_matrix_2d[final_matrix_2d[target_name] == np.unique(y_train).min()].shape[0])
    print('Number of Samples (Positive Class): ', final_matrix_2d[final_matrix_2d[target_name] == np.unique(y_train).max()].shape[0])
    
    # Plotting
    plt.figure(figsize=fig_size)
    plt.scatter(final_matrix_2d[final_matrix_2d[target_name] == np.unique(y_train)[0]][0], 
                final_matrix_2d[final_matrix_2d[target_name] == np.unique(y_train)[0]][1],
                label=np.unique(y_train)[0])
    
    plt.scatter(final_matrix_2d[final_matrix_2d[target_name] == np.unique(y_train)[1]][0], 
                final_matrix_2d[final_matrix_2d[target_name] == np.unique(y_train)[1]][1],
                label=np.unique(y_train)[1])
    plt.xlabel('First Component')
    plt.ylabel('Second Component')
    plt.title(title)
    plt.grid()
    plt.legend()
    
# %%time
# plot_multidim_data(x_train=x_train,
#                    y_train=y_train, 
#                    cat_bin_imputer=SimpleImputer(strategy='constant', fill_value='unknown'),
#                    cat_bin_encoder=CatBoostEncoder(random_state=SEED),
#                    num_imputer=SimpleImputer(strategy='mean'),
#                    num_scaler=StandardScaler(),
#                    dim_red_method=PCA(n_components=2, random_state=SEED),
#                    fig_size=(25,15), 
#                    title='2D Feature Space (PCA)')

NameError: name 'SimpleImputer' is not defined