In [24]:
# Feature Selection
from pandas import DataFrame, Index, read_csv
from dslabs_functions import (
    select_low_variance_variables,
    study_variance_for_feature_selection,
    apply_feature_selection,
    select_redundant_variables,
    study_redundancy_for_feature_selection,
)


def select_low_variance_variables(
    data: DataFrame, max_threshold: float, target: str = "class"
) -> list:
    summary5: DataFrame = data.describe()
    vars2drop: Index[str] = summary5.columns[
        summary5.loc["std"] * summary5.loc["std"] < max_threshold
    ]
    vars2drop = vars2drop.drop(target) if target in vars2drop else vars2drop
    return list(vars2drop.values)


target = "Churn"
file_tag = "churn_feature_selection"
#filename = "C:\\Users\\LENOVO\\Documents\\Projetos\\Data_science\\data_science\\data\\churn_truncate_outliers.csv"
filename = "C:\\Users\\LENOVO\\Documents\\Projetos\\Data_science\\data_science\\data\\churn_scaled_zscore.csv"
train: DataFrame = read_csv(filename, index_col = "Unnamed: 0")
#print(train.tail(10))
print(train.describe())

print("Original variables", train.columns.to_list())
vars2drop: list[str] = select_low_variance_variables(train, 3, target=target)
print("Variables to drop", vars2drop)

             tenure  InternetService  PaymentMethod  MonthlyCharges  \
count  6.649000e+03     6.649000e+03   6.649000e+03    6.649000e+03   
mean   4.942488e-17    -7.427090e-17  -1.047273e-16    3.633396e-17   
std    1.000075e+00     1.000075e+00   1.000075e+00    1.000075e+00   
min   -1.284217e+00    -1.513609e+00  -1.301934e+00   -1.517684e+00   
25%   -9.437527e-01    -2.276409e-01  -3.623313e-01   -1.103083e+00   
50%   -1.777081e-01    -2.276409e-01  -3.623313e-01    2.409174e-01   
75%    8.862427e-01     1.058327e+00   5.772712e-01    8.230870e-01   
max    1.779961e+00     1.058327e+00   1.516874e+00    1.909688e+00   

       TotalCharges        gender  SeniorCitizen       Partner    Dependents  \
count  6.649000e+03  6.649000e+03   6.649000e+03  6.649000e+03  6.649000e+03   
mean  -1.196884e-16  1.025900e-16  -4.595178e-17 -1.976995e-17  1.282375e-17   
std    1.000075e+00  1.000075e+00   1.000075e+00  1.000075e+00  1.000075e+00   
min   -1.017047e+00 -9.902714e-01  -4.33

In [22]:
from pandas import Series


def select_redundant_variables(
    data: DataFrame, min_threshold: float = 0.90, target: str = "class"
) -> list:
    df: DataFrame = data.drop(target, axis=1, inplace=False)
    corr_matrix: DataFrame = abs(df.corr())
    variables: Index[str] = corr_matrix.columns
    vars2drop: list = []
    for v1 in variables:
        vars_corr: Series = (corr_matrix[v1]).loc[corr_matrix[v1] >= min_threshold]
        vars_corr.drop(v1, inplace=True)
        if len(vars_corr) > 1:
            lst_corr = list(vars_corr.index)
            for v2 in lst_corr:
                if v2 not in vars2drop:
                    vars2drop.append(v2)
    return vars2drop


print("Original variables", train.columns.values)
vars2drop: list[str] = select_redundant_variables(
    train, target=target, min_threshold=0.5
)
print("Variables to drop", vars2drop)

Original variables ['tenure' 'InternetService' 'PaymentMethod' 'MonthlyCharges'
 'TotalCharges' 'gender' 'SeniorCitizen' 'Partner' 'Dependents'
 'PhoneService' 'MultipleLines' 'OnlineSecurity' 'OnlineBackup'
 'DeviceProtection' 'TechSupport' 'StreamingTV' 'StreamingMovies'
 'Contract_Month-to-month' 'Contract_One year' 'Contract_Two year' 'Churn']
Variables to drop ['TotalCharges', 'Contract_Month-to-month', 'Contract_Two year', 'InternetService', 'StreamingTV', 'StreamingMovies', 'tenure', 'MonthlyCharges', 'Contract_One year']
