In [None]:
import pandas as pd
import numpy as np
import random
import gc
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
# Set the seed of Python's built-in `random` module
random.seed(13)

In [None]:
import numpy             as np
import matplotlib.pyplot as plt
import pandas            as pd
import seaborn           as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, accuracy_score, precision_score, recall_score, auc
from sklearn.linear_model import LogisticRegression

In [None]:
# Build a per-column metadata summary
def pod_academy_generate_metadata(dataframe):
    """Describe every column of ``dataframe`` with one row of metadata.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``dataframe`` with the fields
        ``nome_variavel`` (column name), ``tipo`` (dtype), ``qt_nulos``
        (null count), ``percent_nulos`` (null percentage, rounded to two
        decimals) and ``cardinalidade`` (number of distinct non-null values).
        Rows are ordered by ``percent_nulos`` descending and carry a fresh
        0..n-1 index.
    """
    # Count nulls once and reuse the result for both the count and the share.
    null_counts = dataframe.isnull().sum()
    null_share = (null_counts / len(dataframe)) * 100

    summary = pd.DataFrame({
        'nome_variavel': dataframe.columns,
        'tipo': dataframe.dtypes,
        'qt_nulos': null_counts,
        'percent_nulos': null_share.round(2),
        'cardinalidade': dataframe.nunique(),
    })
    return (
        summary
        .sort_values(by='percent_nulos', ascending=False)
        .reset_index(drop=True)
    )

# Drop columns whose percentage of nulls is too high
def filter_high_null_columns(df, missing_cutoff):
    """Return ``df`` without the columns that are mostly missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    missing_cutoff : float
        Null percentage (0-100 scale, as produced by
        ``pod_academy_generate_metadata``). Columns whose null percentage is
        greater than or equal to this value are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the remaining columns.
    """
    column_stats = pod_academy_generate_metadata(df)
    too_sparse = column_stats['percent_nulos'] >= missing_cutoff
    columns_to_remove = column_stats.loc[too_sparse, 'nome_variavel'].tolist()
    return df.drop(columns=columns_to_remove)

# Cast specific columns to int
def convert_columns_to_int(df, columns_to_convert):
    """Return a copy of ``df`` with the given columns cast to ``int``.

    The conversion is done through a single ``DataFrame.astype`` call with a
    column-to-dtype mapping, so the caller's frame is left untouched (the
    previous implementation assigned into ``df`` column by column, silently
    modifying the object that was passed in).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns_to_convert : iterable of str
        Names of the columns to cast. Every name must exist in ``df``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, the listed ones having an integer
        dtype.

    Raises
    ------
    KeyError
        If a listed column is not present in ``df``.
    ValueError
        If a listed column holds values that cannot be cast to ``int``
        (for example NaN).
    """
    dtype_by_column = {col: int for col in columns_to_convert}
    return df.astype(dtype_by_column)

# Remove highly correlated features
def remove_highly_correlated_features(df, id_column, target_column, threshold):
    """Drop features that are strongly correlated with an earlier feature.

    Object-dtype columns, the id column and the target column are set aside
    and never considered for removal. Among the remaining columns, the
    absolute correlation matrix is computed and only its strict upper
    triangle is inspected: a column is dropped when its absolute correlation
    with any column positioned before it exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    id_column : str
        Name of the identifier column.
    target_column : str
        Name of the target column.
    threshold : float
        Absolute-correlation limit; strictly greater values trigger a drop.

    Returns
    -------
    tuple
        ``(df_reduced, to_drop)`` where ``df_reduced`` has the columns ordered
        as id, target, surviving non-object features, object features, and
        ``to_drop`` is the list of removed column names.
    """
    text_cols = df.select_dtypes(include=['object']).columns
    key_cols = [id_column, target_column]
    numeric_part = df.drop(columns=text_cols.tolist() + key_cols)

    abs_corr = numeric_part.corr().abs()
    # Keep only the cells strictly above the diagonal so each pair is looked
    # at once and a column is never compared with itself.
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    upper = abs_corr.where(above_diagonal)

    to_drop = [
        name
        for name, column_corr in upper.items()
        if (column_corr > threshold).any()
    ]

    pieces = [df[key_cols], numeric_part.drop(columns=to_drop), df[text_cols]]
    df_reduced = pd.concat(pieces, axis=1)
    return df_reduced, to_drop

# Impute missing values and label-encode categorical columns
def handle_missing_and_encode(X):
    """Impute missing values and turn object columns into integer codes.

    Non-object columns are imputed with the column mean; object columns are
    imputed with the most frequent value and then label-encoded (each column
    gets its own freshly fitted ``LabelEncoder``).

    Compared with the previous version this function:

    * skips a column group when it is empty instead of fitting an imputer on
      a zero-column frame, so frames that are all-numeric or all-categorical
      are handled;
    * keeps ``X``'s index on the result, so the output still lines up by
      label with a target series taken from the same frame.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame indexed like ``X`` with the non-object columns first (imputed)
        followed by the object columns (imputed and encoded as integers).
    """
    cat_attributes = X.select_dtypes(include='object')
    num_attributes = X.select_dtypes(exclude='object')

    parts = []

    if num_attributes.shape[1] > 0:
        imputer_num = SimpleImputer(strategy='mean')
        num_imputed = imputer_num.fit_transform(num_attributes)
        parts.append(
            pd.DataFrame(num_imputed, columns=num_attributes.columns, index=X.index)
        )

    if cat_attributes.shape[1] > 0:
        imputer_cat = SimpleImputer(strategy='most_frequent')
        cat_imputed = imputer_cat.fit_transform(cat_attributes)
        df_cat = pd.DataFrame(cat_imputed, columns=cat_attributes.columns, index=X.index)
        for obj in cat_attributes.columns:
            # Cast to str so mixed-type object columns can be sorted/encoded.
            df_cat[obj] = LabelEncoder().fit_transform(df_cat[obj].astype(str))
        parts.append(df_cat)

    if not parts:
        # X has no columns at all: nothing to impute or encode.
        return pd.DataFrame(index=X.index)
    return pd.concat(parts, axis=1)

# Compute feature importances
def compute_feature_importance(X, y):
    """Fit a gradient-boosting classifier and report its feature importances.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names label the result.
    y : array-like
        Target values passed to the classifier's ``fit``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``Feature`` (the column names of ``X``) and
        ``Importance`` (the fitted model's ``feature_importances_``).
    """
    # Fixed random_state keeps the fitted model reproducible between runs.
    model = GradientBoostingClassifier(random_state=0).fit(X, y)
    return pd.DataFrame({
        'Feature': X.columns,
        'Importance': model.feature_importances_,
    })

# Select features based on their importance
def select_important_features(features, cutoff_maximp):
    """Keep the features whose importance clears a relative threshold.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame with an ``Importance`` column (as returned by
        ``compute_feature_importance``).
    cutoff_maximp : float
        Fraction of the largest importance used as the threshold.

    Returns
    -------
    pandas.DataFrame
        Rows of ``features`` whose ``Importance`` is strictly greater than
        ``cutoff_maximp * max(Importance)``; the original index is kept.
    """
    importance = features['Importance']
    minimum = cutoff_maximp * importance.max()
    return features.loc[importance > minimum]

# Main entry point for variable selection
def vars_selection(df, missing_cutoff, corr_threshold, cutoff_maximp, sample_size=100000):
    """Run the full feature-selection pipeline on a sample of ``df``.

    Steps, in order: draw a reproducible sample, drop columns with too many
    nulls, cast the flag/count columns to int, drop highly correlated
    features, impute and encode what is left, fit a gradient-boosting model
    and keep the features whose importance clears the relative cutoff.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Must contain the ``SK_ID_CURR`` and ``TARGET`` columns.
    missing_cutoff : float
        Null percentage (0-100) at or above which a column is discarded.
    corr_threshold : float
        Absolute correlation above which a feature is discarded.
    cutoff_maximp : float
        Fraction of the largest importance a feature must exceed to be kept.
    sample_size : int, default 100000
        Number of rows to sample. Capped at ``len(df)`` so smaller frames are
        used in full instead of making ``DataFrame.sample`` raise.

    Returns
    -------
    pandas.DataFrame
        ``Feature`` / ``Importance`` rows of the selected features.
    """
    if sample_size is not None:
        # Sampling without replacement cannot return more rows than exist.
        sample_size = min(sample_size, len(df))
    amostra = df.sample(n=sample_size, random_state=13)
    amostra = filter_high_null_columns(amostra, missing_cutoff)

    candidate_columns = ['CNT_CHILDREN', 'TARGET', 'FLAG_MOBIL', 'FLAG_WORK_PHONE',
                         'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL'] + \
                        [col for col in amostra.columns if col.startswith('FLAG_DOCUMENT_')]
    # A candidate may be absent from the input or may have just been removed
    # by the null filter; only cast the columns that are still present.
    columns_to_convert = [col for col in candidate_columns if col in amostra.columns]
    amostra = convert_columns_to_int(amostra, columns_to_convert)

    # The list of columns dropped for correlation is not needed here.
    amostra, _ = remove_highly_correlated_features(amostra, 'SK_ID_CURR', 'TARGET', corr_threshold)

    X = amostra.drop(columns=['SK_ID_CURR', 'TARGET'])
    y = amostra['TARGET']
    X = handle_missing_and_encode(X)
    features = compute_feature_importance(X, y)
    return select_important_features(features, cutoff_maximp)

# Plot feature importances
def plot_feature_importance(selected_features):
    """Draw a horizontal bar chart of the selected features' importances.

    Parameters
    ----------
    selected_features : pandas.DataFrame
        Frame with ``Feature`` and ``Importance`` columns. It is sorted by
        importance ascending before plotting; the caller's frame is not
        modified.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    ordered = selected_features.sort_values(by='Importance', ascending=True)

    # Figure height grows with the number of bars (0.4 inch per feature).
    fig, ax = plt.subplots(figsize=(10, len(ordered)*0.4))
    ax.barh(ordered['Feature'], ordered['Importance'], color=(0.25, 0.5, 1))
    ax.set_xlabel("Feature Importance")
    ax.set_title("Variáveis Selecionadas - Gradient Boosting")
    ax.grid(axis='x', linestyle='--', alpha=0.7)
    fig.tight_layout()
    plt.show()

In [None]:
# Load the train, test and validation tables, in that order.
df_treino_full, df_teste_full, df_validacao_full = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'PoD Bank/Tabelas - Feature_Engineering/ABT/train.parquet',
        'PoD Bank/Tabelas - Feature_Engineering/ABT/test.parquet',
        'PoD Bank/Tabelas - Feature_Engineering/ABT/validation.parquet',
    )
)

In [None]:

# Select features on the training table only, then project every table onto
# the selected columns.
selected_features_df = vars_selection(
    df_treino_full,
    missing_cutoff=75,
    corr_threshold=0.85,
    cutoff_maximp=0,
)
feature_cols = list(selected_features_df.Feature)
labelled_cols = feature_cols + ['TARGET']

df_train = df_treino_full[labelled_cols]
df_test = df_teste_full[labelled_cols]
# The validation table is subset without the TARGET column.
df_valid = df_validacao_full[feature_cols]

In [None]:
# Show (rows, columns) of the three feature-selected tables.
df_train.shape, df_test.shape, df_valid.shape

((150580, 130), (64677, 130), (92254, 129))

In [None]:
# Persist the feature-selected tables in the working directory.
for _frame, _destination in (
    (df_train, 'abt_train_fselect.parquet'),
    (df_test, 'abt_test_fselect.parquet'),
    (df_valid, 'abt_valid_fselect.parquet'),
):
    _frame.to_parquet(_destination)