In [0]:
%run ./utils

In [0]:
import numpy as np
import pandas as pd
from dython import nominal

from pprint import pformat

from sklearn.feature_selection import f_regression
from scipy import stats
from phik import phik_from_array

[0;31m---------------------------------------------------------------------------[0m
[0;31mModuleNotFoundError[0m                       Traceback (most recent call last)
File [0;32m<command-1220897062489237>:3[0m
[1;32m      1[0m [38;5;28;01mimport[39;00m [38;5;21;01mnumpy[39;00m [38;5;28;01mas[39;00m [38;5;21;01mnp[39;00m
[1;32m      2[0m [38;5;28;01mimport[39;00m [38;5;21;01mpandas[39;00m [38;5;28;01mas[39;00m [38;5;21;01mpd[39;00m
[0;32m----> 3[0m [38;5;28;01mfrom[39;00m [38;5;21;01mdython[39;00m [38;5;28;01mimport[39;00m nominal
[1;32m      4[0m [38;5;66;03m# from typing import List[39;00m
[1;32m      5[0m [38;5;28;01mfrom[39;00m [38;5;21;01mpprint[39;00m [38;5;28;01mimport[39;00m pformat

File [0;32m/databricks/python_shell/dbruntime/PythonPackageImportsInstrumentation/__init__.py:171[0m, in [0;36m_create_import_patch.<locals>.import_patch[0;34m(name, globals, locals, fromlist, level)[0m
[1;32m    166[0m thread_local[38;5;241m.

In [0]:
# Show the installed NumPy version (bare last expression, so the cell displays it).
np.__version__



In [0]:
# Name of the target column; the selection helpers in this notebook read it as a global.
TARGET_COL = "ConvertedCompYearly"



In [0]:
def filter_cols_with_one_value(*, df: DataFrame) -> DataFrame:
    """Drop the columns that hold at most one distinct value.

    One aggregation computes the distinct-value count of every column; columns
    whose count is greater than 1 are kept and every other column is dropped.

    Args:
        df: Frame to filter. It must support ``agg``, ``collect``, ``columns``
            and ``drop`` (this notebook passes a Spark DataFrame).

    Returns:
        ``df`` without the dropped columns.
    """
    print("==== Drop columns with only one value ====\n")
    # apply countDistinct on each column
    distinct_counts = (
        df.agg(*(f.countDistinct(f.col(c)).alias(c) for c in df.columns))
        .collect()[0]
        .asDict()
    )
    # keep the cols with count > 1
    cols_to_keep = [c for c in df.columns if distinct_counts[c] > 1]
    # Drop the exact complement of cols_to_keep. A count of 0 is possible
    # (countDistinct does not count nulls, so an all-null column yields 0);
    # testing `== 1` left such columns in the frame while the summary below
    # reported them as removed.
    cols_to_drop = [c for c in df.columns if distinct_counts[c] <= 1]
    print(
        f"=== Number of features in the dataset: {len(df.columns) -1} .   Number of remaining features: {len(cols_to_keep) -1} ===\n"
    )
    msg = pformat(cols_to_drop)
    print(f"Columns to drop:\n{msg}\n")
    return df.drop(*cols_to_drop)



In [0]:
# The target is not normally distributed, so binary features are screened with
# the Mann-Whitney U test: a feature is kept when its p-value is below `alpha`.
def filter_binary_cols(*, pdf: pd.DataFrame, alpha: float = 0.05) -> pd.DataFrame:
    """Keep the binary features that are significantly related to the target.

    For every column other than ``TARGET_COL`` the target values of the rows
    where the column equals 1 are compared with those where it equals 0 using
    ``scipy.stats.mannwhitneyu``.

    Args:
        pdf: pandas frame holding the binary feature columns and ``TARGET_COL``.
        alpha: Significance level; a feature is kept when its p-value is
            strictly below it.

    Returns:
        ``pdf`` restricted to the kept features (ordered by ascending p-value)
        followed by ``TARGET_COL``.
    """
    print("==== Mannwhitneyu U test for binary features to Target ====\n")
    feature_cols = [c for c in pdf.columns.to_list() if c != TARGET_COL]
    results = []
    for column in feature_cols:
        # Missing targets carry no rank information; drop them so that a single
        # NaN does not turn every p-value into NaN.
        yes = pdf.loc[pdf[column] == 1, TARGET_COL].dropna()
        no = pdf.loc[pdf[column] == 0, TARGET_COL].dropna()
        if yes.empty or no.empty:
            # The test is undefined when one group has no observations; such a
            # feature cannot be shown to be significant, so it is not recorded.
            continue
        _, p_val = stats.mannwhitneyu(yes, no)
        results.append((column, p_val))

    u_df = pd.DataFrame(results, columns=["feature", "p-value"])
    binary_stored_features = u_df[u_df["p-value"] < alpha].sort_values(by="p-value")
    print(binary_stored_features)
    cols_to_keep = binary_stored_features["feature"].to_list()
    # Only features are reported as dropped; the target is always retained.
    cols_to_drop = [c for c in feature_cols if c not in cols_to_keep]
    print(
        f"\n=== Number of features in the binary dataset: {len(feature_cols)} .   Number of remaining features: {len(cols_to_keep)} ===\n"
    )
    msg = pformat(cols_to_drop)
    print(f"Binary columns to drop: \n{msg}\n")
    return pdf[cols_to_keep + [TARGET_COL]]



In [0]:
# Correlation-to-target filter (Pearson correlation for continuous features).
def filter_cols_corr_with_target(
    *,
    pdf: pd.DataFrame,
    min_threshold: float = 0.1,
    max_threshold: float = 0.9,
    typ: str = "numeric",
) -> List[str]:
    """Select the features whose absolute correlation with the target is in range.

    Args:
        pdf: For ``typ == "categorical"`` an already computed association
            matrix whose ``TARGET_COL`` column is read directly; otherwise a
            frame of raw values that is correlated with its ``TARGET_COL``
            column through ``DataFrame.corrwith``.
        min_threshold: Exclusive lower bound on the absolute correlation.
        max_threshold: Exclusive upper bound on the absolute correlation.
        typ: Kind of data in ``pdf``; also used in the printed summary.

    Returns:
        Names of the kept features ordered by decreasing absolute correlation.
        ``TARGET_COL`` itself is never part of the result.
    """
    print("==== Pearson correlation to Target ====\n")
    if typ == "categorical":
        corr_to_target_series = pdf[TARGET_COL].abs().sort_values(ascending=False)
    else:
        corr_to_target_series = (
            pdf.corrwith(pdf[TARGET_COL]).abs().sort_values(ascending=False)
        )
    in_range = (corr_to_target_series > min_threshold) & (
        corr_to_target_series < max_threshold
    )
    # The target correlates perfectly with itself. It used to be excluded only
    # because 1.0 is not below the default max_threshold; exclude it explicitly
    # so the result does not depend on the chosen threshold.
    cols_to_keep = [
        c for c in corr_to_target_series[in_range].index.to_list() if c != TARGET_COL
    ]
    feature_cols = [c for c in pdf.columns.to_list() if c != TARGET_COL]
    cols_to_drop = [c for c in feature_cols if c not in cols_to_keep]
    # cols_to_keep holds features only, so no extra "- 1" for the target here.
    print(
        f"=== Number of features in the {typ} dataset: {len(feature_cols)} .   Number of remaining features: {len(cols_to_keep)} ===\n"
    )
    msg = pformat(cols_to_drop)
    print(f"Columns to drop:\n{msg}\n")
    return cols_to_keep



In [0]:
def filter_multicullinearity_cols(
    *, pdf: pd.DataFrame, multi_threshold: float = 0.5, typ: str = "numeric"
) -> List[str]:
    """Drop features that are strongly correlated with an earlier feature.

    A column is dropped when its absolute correlation with any column that
    precedes it in ``pdf`` exceeds ``multi_threshold``; column order therefore
    decides which member of a correlated pair survives.

    Args:
        pdf: For ``typ == "categorical"`` an already computed association
            matrix; otherwise a frame of raw values from which ``pdf.corr()``
            is computed.
        multi_threshold: Exclusive upper bound on the absolute pairwise
            correlation.
        typ: Kind of data in ``pdf``; also used in the printed summary.

    Returns:
        The columns of ``pdf`` that were not dropped, in their original order.
    """
    print("==== Multicullinearity with other features ====\n")
    if typ == "categorical":
        corr_matrix = pdf.abs()
    else:
        corr_matrix = pdf.corr().abs()
    # Select upper triangle of correlation matrix, so that each pair is looked
    # at once and a column is only compared with the columns before it.
    upper_triangle = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    )
    cols_to_drop = [
        column
        for column in upper_triangle.columns
        if (upper_triangle[column] > multi_threshold).any()
    ]
    cols_to_keep = [c for c in pdf.columns.to_list() if c not in cols_to_drop]
    # Count features only: subtract the target when it is actually present
    # instead of always subtracting 1 (the callers in this notebook pass frames
    # without the target, which made both numbers one too small).
    n_features = len(pdf.columns.to_list()) - int(TARGET_COL in pdf.columns)
    n_remaining = len(cols_to_keep) - int(TARGET_COL in cols_to_keep)
    print(
        f"=== Number of features in the {typ} dataset: {n_features} .   Number of remaining features: {n_remaining} ===\n"
    )
    msg = pformat(cols_to_drop)
    print(
        f"Features to drop because of multicullinearity with other features: \n{msg}\n"
    )
    return cols_to_keep



In [0]:
def select_num_features(
    *,
    pdf: pd.DataFrame,
    min_threshold: float = 0.1,
    max_threshold: float = 0.9,
    multi_threshold: float = 0.5,
    typ: str = "numeric",
):
    """Run the two-stage selection on a frame of raw feature values.

    Stage one keeps the columns chosen by ``filter_cols_corr_with_target``;
    stage two passes only those columns to ``filter_multicullinearity_cols``.

    Args:
        pdf: pandas frame with the candidate columns.
        min_threshold: Forwarded to the target-correlation filter.
        max_threshold: Forwarded to the target-correlation filter.
        multi_threshold: Forwarded to the multicollinearity filter.
        typ: Label forwarded to both filters and used in the header line.

    Returns:
        Whatever the multicollinearity filter returns for the reduced frame.
    """
    print(f"==== Selecting {typ} featuers ====\n")
    # Stage 1: columns related to the target.
    target_related = filter_cols_corr_with_target(
        pdf=pdf,
        typ=typ,
        max_threshold=max_threshold,
        min_threshold=min_threshold,
    )
    # Stage 2: remove redundancy among the survivors.
    return filter_multicullinearity_cols(
        pdf=pdf[target_related],
        typ=typ,
        multi_threshold=multi_threshold,
    )



In [0]:
def setup_cat_corr(*, pdf: pd.DataFrame) -> pd.DataFrame:
    """Return the association matrix of ``pdf`` with cleaned row/column labels.

    The matrix is a copy of the ``"corr"`` entry returned by
    ``nominal.associations`` (called with plotting disabled). Each label is cut
    at the first ``" ("`` and the same cleaned labels are used for both axes.
    """
    # NOTE(review): the " (" suffix is presumably a type marker appended by
    # dython; a column whose own name contains " (" would be cut as well.
    association_matrix = nominal.associations(dataset=pdf, plot=False)["corr"].copy()
    clean_labels = [
        label.partition(" (")[0] for label in association_matrix.columns.to_list()
    ]
    association_matrix.index = clean_labels
    association_matrix.columns = clean_labels
    return association_matrix



In [0]:
def select_cat_features(
    *,
    pdf: pd.DataFrame,
    min_threshold: float = 0.1,
    max_threshold: float = 0.9,
    multi_threshold: float = 0.5
):
    """Run the two-stage selection on categorical features.

    The association matrix of ``pdf`` is filtered by association with the
    target; a second matrix, computed on the surviving columns only, is then
    filtered for multicollinearity.

    Args:
        pdf: pandas frame with the categorical columns and the target column.
        min_threshold: Forwarded to the target-association filter.
        max_threshold: Forwarded to the target-association filter.
        multi_threshold: Forwarded to the multicollinearity filter.

    Returns:
        List of the selected column names; empty when no column passes the
        target-association filter.
    """
    print("==== Selecting categorical featuers ====\n")
    corr_pdf = setup_cat_corr(pdf=pdf)
    cols_to_keep = filter_cols_corr_with_target(
        pdf=corr_pdf,
        min_threshold=min_threshold,
        max_threshold=max_threshold,
        typ="categorical",
    )
    if not cols_to_keep:
        # Nothing survived the first stage: do not request an association
        # matrix for a frame without columns, there is nothing left to compare.
        return []
    keep_corr_pdf = setup_cat_corr(pdf=pdf[cols_to_keep])
    return filter_multicullinearity_cols(
        pdf=keep_corr_pdf, multi_threshold=multi_threshold, typ="categorical"
    )



In [0]:
def select_features(
    *,
    df: DataFrame,
    id_col: str = "ResponseId",
    min_threshold: float = 0.1,
    max_threshold: float = 0.9,
    multi_threshold: float = 0.5,
) -> DataFrame:
    """The main function for running features selection on all columns data types.

    Steps: drop ``id_col`` and the columns rejected by
    ``filter_cols_with_one_value``; split the remaining columns with
    ``get_cols_by_dtypes``; select binary, numeric and categorical features
    separately; save ``df`` restricted to the id column, the selected features
    and the target through ``save_table``.

    Args:
        df: Input frame (must support ``drop``, ``select`` and ``toPandas``).
        id_col: Identifier column; excluded from the selection and re-attached
            as the first column of the result.
        min_threshold: Forwarded to the numeric/binary/categorical selectors.
        max_threshold: Forwarded to the numeric/binary/categorical selectors.
        multi_threshold: Forwarded to the numeric/binary/categorical selectors.

    Returns:
        The frame that was saved (``df`` restricted to the selected columns).
    """
    filtered_df = filter_cols_with_one_value(df=df.drop(id_col))
    cols_types: Dict = get_cols_by_dtypes(df=filtered_df)

    def _to_pandas_with_target(cols):
        # Every selector reads TARGET_COL, so each pandas frame must contain it
        # exactly once, whether or not the dtype split already lists it.
        features = [c for c in cols if c != TARGET_COL]
        return filtered_df.select(*features, TARGET_COL).toPandas()

    bin_pdf = _to_pandas_with_target(cols_types["bin_cols"])
    num_pdf = _to_pandas_with_target(cols_types["num_cols"])
    cat_pdf = _to_pandas_with_target(cols_types["cat_cols"])

    # Forward the caller's thresholds; they used to be accepted but ignored,
    # so the selectors always ran with their own defaults.
    thresholds = {
        "min_threshold": min_threshold,
        "max_threshold": max_threshold,
        "multi_threshold": multi_threshold,
    }
    keep_bin_pdf = filter_binary_cols(pdf=bin_pdf)
    selected_bin_features = select_num_features(
        pdf=keep_bin_pdf, typ="binary", **thresholds
    )
    selected_num_features = select_num_features(pdf=num_pdf, **thresholds)
    selected_cat_features = select_cat_features(pdf=cat_pdf, **thresholds)
    selected_cols = [
        id_col,
        *selected_cat_features,
        *selected_num_features,
        *selected_bin_features,
        TARGET_COL,
    ]
    print("==== Selecting all featuers ====\n")
    print(
        f"=== Number of features in the original dataset: {len(df.columns) -1} .   Number of remaining features: {len(selected_cols) -1} ===\n"
    )
    msg = pformat(selected_cols)
    print(f"The final selected features are: \n{msg}")
    selected_df = df.select(*selected_cols)
    save_table(
        df=selected_df, file_path=f"s3a://{S3_PROCESS_PATH}selected_features.parquet"
    )
    # Return the saved frame, as the declared return type promises.
    return selected_df

