# Imports

In [1]:
# File and system libraries
import gzip
import os
import shutil

# Iteration helpers
from itertools import combinations

# Data manipulation libraries
import numpy as np
import pandas as pd

print("Libraries imported successfully.")

Libraries imported successfully.


# Unzip

In [2]:
# Decompress the gzipped dataset into a local CSV file.
#
# The archive location can be overridden with the ANTIFRAUDE_GZ_PATH environment
# variable so the notebook is not tied to one machine; the default remains the
# original local path. The path is a raw string, so it uses single backslashes
# (doubling them inside r'...' puts two literal backslashes into the path).
gzipped_file_path = os.environ.get(
    'ANTIFRAUDE_GZ_PATH',
    r'C:\Users\renat\Documents\Neurotech\Project-4\base_antifraude.gz',
)
output_file_path = 'base_antifraude.csv'

# Check if the compressed file exists before proceeding
if not os.path.exists(gzipped_file_path):
    print(f"Error: The file '{gzipped_file_path}' was not found.")
else:
    # Decompress into a temporary sibling file and rename it only on success,
    # so a failed or interrupted run never leaves a truncated CSV (or clobbers
    # a good one from an earlier run) at `output_file_path`.
    partial_output_path = output_file_path + '.part'
    try:
        print(f"Decompressing '{gzipped_file_path}'...")

        with gzip.open(gzipped_file_path, 'rb') as f_in, open(partial_output_path, 'wb') as f_out:
            # Stream the decompressed bytes from the archive to the output file
            shutil.copyfileobj(f_in, f_out)

        os.replace(partial_output_path, output_file_path)

        print(f"Success! File decompressed to '{output_file_path}'")

    except Exception as e:
        # Deliberately broad: the notebook reports the problem and keeps going.
        print(f"An error occurred during decompression: {e}")
        if os.path.exists(partial_output_path):
            os.remove(partial_output_path)

Decompressing 'C:\\Users\\renat\\Documents\\Neurotech\\Project-4\\base_antifraude.gz'...
Success! File decompressed to 'base_antifraude.csv'


# Leitura e Teste de Sanidade

In [3]:
# Load the decompressed, tab-separated dataset into `data_frame` and preview it.
csv_file_path = output_file_path

try:
    # low_memory=False makes pandas infer each column's dtype from the whole
    # column instead of chunk by chunk. NOTE(review): the original run emitted
    # a warning on this line whose text was not preserved; it is presumably the
    # mixed-dtype DtypeWarning that this option addresses — confirm on re-run.
    data_frame = pd.read_csv(csv_file_path, sep='\t', low_memory=False)

    print(f"Success! File '{csv_file_path}' loaded into a DataFrame.")

    display(data_frame.head())

except FileNotFoundError:
    print(f"Error: The file '{csv_file_path}' was not found. Please ensure the previous cell ran successfully.")
except Exception as e:
    # Deliberately broad: report the failure; later cells check for `data_frame`.
    print(f"An error occurred while reading the file: {e}")

Success! File 'base_antifraude.csv' loaded into a DataFrame.


  data_frame = pd.read_csv(csv_file_path, sep='\t')


Unnamed: 0,id,documento,mes_ref,alvo,vlr_financiado,VAR1,VAR2,VAR3,VAR4,VAR5,...,VAR191,VAR192,VAR193,VAR194,VAR195,VAR196,VAR197,VAR198,VAR199,VAR200
0,4db9c75f6a31c73414ad84fdd101b5d7,d92fa549809f55b4ea13caa1b108896f,202307,0.0,17820.6,0.166667,0.0,0.0,1.0,,...,0.609314,0.0,0.09579,,0.066331,,0.666667,0.135065,SUDESTE,0.0
1,9b07d4001632189d19de964894501e34,1af133c98723189f24bbfcb75abfcf92,202307,0.0,18513.56,0.166667,0.0,0.0,0.027211,,...,,,,,,,0.666667,,NORTE,0.0
2,d55b6a81203f5a5ae6f362c483f36ff1,9e95be945b56ff10dc43df22381c9002,202307,0.0,22943.8,0.0,0.0,0.0,0.013605,,...,,,,,,,0.333333,,SUL,0.0
3,f6bb36049c3b42a62dfa88c46e0f79d2,50502098d83c37b5fa60241bc4bd6641,202307,0.0,19177.58,0.083333,0.0,0.0,0.020408,,...,,,,,,,0.833333,,NORTE,0.0
4,2755ed6330b4980ea24faa1081183dc7,aad24a7552d24b46188103f4249f17e7,202307,0.0,16644.85,0.0,0.0,0.0,0.0,,...,0.045513,,0.532964,,,,0.5,,NORDESTE,0.0


In [4]:
# Sanity-check the loaded frame: its dimensions must match the expected
# dataset size and the key columns must be present.
if 'data_frame' in locals():
    print("--- Shape Validation ---")

    expected_rows, expected_cols = 47732, 205
    actual_rows, actual_cols = data_frame.shape

    print(f"Actual shape: {data_frame.shape}")
    print(f"Expected shape: ({expected_rows}, {expected_cols})")

    shape_matches = (actual_rows, actual_cols) == (expected_rows, expected_cols)
    if shape_matches:
        print("✅ Shape validation passed!\n")
    else:
        print("⚠️ Shape validation failed!\n")

    print("--- Column Presence Validation ---")

    required_columns = ['id', 'documento', 'mes_ref', 'vlr_financiado', 'alvo']
    print(f"Checking for required columns: {required_columns}")

    # Anything required but absent from the frame's columns
    missing_columns = set(required_columns) - set(data_frame.columns)

    if missing_columns:
        print(f"⚠️ Column validation failed! Missing columns: {list(missing_columns)}")
    else:
        print("✅ Column validation passed! All required columns are present.")
else:
    print("Error: The 'data_frame' variable was not found. Please run the previous cell to load the data.")

--- Shape Validation ---
Actual shape: (47732, 205)
Expected shape: (47732, 205)
✅ Shape validation passed!

--- Column Presence Validation ---
Checking for required columns: ['id', 'documento', 'mes_ref', 'vlr_financiado', 'alvo']
✅ Column validation passed! All required columns are present.


# Limpeza Inicial

In [5]:
# Print the frame's structural summary (index range, column count, dtypes, memory).
if 'data_frame' in locals():
    print("--- DataFrame Info ---")
    data_frame.info()
else:
    print("Error: The 'data_frame' variable was not found. Please run the previous cells first.")

--- DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47732 entries, 0 to 47731
Columns: 205 entries, id to VAR200
dtypes: float64(190), int64(1), object(14)
memory usage: 74.7+ MB


In [6]:
# Show summary statistics (count, mean, std, quartiles, extremes) of the numeric columns.
if 'data_frame' in locals():
    print("--- Descriptive Statistics for Numerical Columns ---")
    numeric_summary = data_frame.describe()
    display(numeric_summary)
else:
    print("Error: The 'data_frame' variable was not found. Please run the previous cells first.")

--- Descriptive Statistics for Numerical Columns ---


Unnamed: 0,mes_ref,alvo,vlr_financiado,VAR1,VAR2,VAR3,VAR4,VAR5,VAR6,VAR7,...,VAR190,VAR191,VAR192,VAR193,VAR194,VAR195,VAR196,VAR197,VAR198,VAR200
count,47732.0,47732.0,47594.0,26947.0,26947.0,40926.0,42871.0,2991.0,47389.0,6047.0,...,8815.0,18139.0,1894.0,22829.0,2991.0,2238.0,7704.0,35629.0,3838.0,44930.0
mean,202309.288653,0.041607,19183.781266,0.201139,0.156529,0.031447,0.501378,0.3063411,0.0,0.275528,...,0.2464883,0.188585,0.05755,0.2751056,0.393977,0.2934698,0.308971,0.489437,0.259947,0.054422
std,1.64636,0.199692,5535.485493,0.253286,0.293852,0.174525,0.483991,0.2127711,0.0,0.203745,...,0.2260249,0.186897,0.178369,0.2133877,0.224401,0.2226813,0.20197,0.180096,0.189631,0.165324
min,202307.0,0.0,2427.88,0.0,0.0,0.0,0.0,5.601228e-07,0.0,6.3e-05,...,1.414208e-07,2.5e-05,0.0,2.200547e-07,1e-06,3.03874e-07,1e-06,0.0,6.2e-05,0.0
25%,202308.0,0.0,15742.62,0.0,0.0,0.0,0.0,0.142932,0.0,0.125406,...,0.08405769,0.059845,0.0,0.1235316,0.218558,0.1078872,0.163283,0.333333,0.13012,0.0
50%,202309.0,0.0,18836.81,0.083333,0.0,0.0,0.47619,0.2626918,0.0,0.229391,...,0.1814668,0.128681,0.0,0.2230746,0.37724,0.2409756,0.277927,0.5,0.211768,0.0
75%,202311.0,0.0,22276.5,0.333333,0.0,0.0,1.0,0.420874,0.0,0.352966,...,0.3314579,0.259011,0.0,0.3699415,0.561068,0.4509163,0.40929,0.5,0.336232,0.0
max,202312.0,1.0,91283.65,1.0,1.0,1.0,1.0,1.000001,0.0,1.000063,...,1.0,1.000025,1.0,1.0,1.000001,1.0,1.000001,1.0,1.000062,1.0


In [7]:
# Count missing values per column: first every column that has any, then the
# 20 columns with the most.
if 'data_frame' in locals():
    print("--- Total Null Values per Column ---")

    null_counts = data_frame.isna().sum()
    has_nulls = null_counts > 0
    display(null_counts[has_nulls])

    # Visual separator between the two listings
    print("\n" + "=" * 50 + "\n")

    print("--- Top 20 Columns with the Most Null Values ---")
    most_null_first = null_counts.sort_values(ascending=False)
    display(most_null_first.head(20))
else:
    print("Error: The 'data_frame' variable was not found. Please run the previous cells first.")

--- Total Null Values per Column ---


vlr_financiado      138
VAR1              20785
VAR2              20785
VAR3               6806
VAR4               4861
                  ...  
VAR196            40028
VAR197            12103
VAR198            43894
VAR199             6806
VAR200             2802
Length: 201, dtype: int64



--- Top 20 Columns with the Most Null Values ---


VAR87     47676
VAR24     47406
VAR157    47263
VAR121    47258
VAR148    46926
VAR60     46891
VAR21     46855
VAR178    46855
VAR72     46628
VAR27     46508
VAR74     46503
VAR169    46499
VAR118    46410
VAR137    46296
VAR47     46244
VAR53     46244
VAR143    46231
VAR55     46089
VAR162    46089
VAR77     46032
dtype: int64

In [8]:
# Report how many values of the target column are missing, in absolute terms
# and as a share of all rows.
if 'data_frame' in locals():
    column_to_check = 'alvo'

    if column_to_check in data_frame.columns:
        nan_count = data_frame[column_to_check].isna().sum()

        if nan_count > 0:
            total_rows = len(data_frame)
            nan_percentage = (nan_count / total_rows) * 100

            for report_line in (
                f"Analysis for column: '{column_to_check}'",
                f"Number of NaN values: {nan_count}",
                f"Percentage of total rows: {nan_percentage:.2f}%",
            ):
                print(report_line)
        else:
            print(f"✅ Column '{column_to_check}' has no NaN values.")
    else:
        print(f"Error: Column '{column_to_check}' does not exist in the DataFrame.")
        print(f"Available columns are: {data_frame.columns.tolist()}")
else:
    print("Error: The 'data_frame' variable was not found. Please run the previous cells first.")

✅ Column 'alvo' has no NaN values.


In [9]:
# Drop every column in which more than half of the rows are missing, then
# report which columns still contain nulls.
if 'data_frame' in locals():
    total_rows = len(data_frame)
    threshold = total_rows * 0.5

    null_counts = data_frame.isna().sum()
    columns_to_drop = [name for name, n_missing in null_counts.items() if n_missing > threshold]

    if not columns_to_drop:
        print("No columns found with more than 50% null values.")
    else:
        print(f"Threshold for dropping is > {int(threshold)} null values.")
        print(f"Found {len(columns_to_drop)} columns to remove: {columns_to_drop}")

        # The shared frame itself is modified here; no copy is made
        data_frame.drop(columns=columns_to_drop, inplace=True)

        print("\nColumns removed successfully.")
        print(f"New DataFrame shape: {data_frame.shape}")

    print("\n" + "=" * 50 + "\n")
    print("--- Re-evaluating Null Values After Removal ---")

    remaining_null_counts = data_frame.isna().sum()
    columns_with_remaining_nulls = remaining_null_counts[remaining_null_counts > 0]
    num_cols_with_nulls = len(columns_with_remaining_nulls)

    if num_cols_with_nulls == 0:
        print("✅ No more columns with null values remain in the dataset.")
    else:
        print(f"There are now {num_cols_with_nulls} columns with remaining null values.")
        display(columns_with_remaining_nulls.sort_values(ascending=False))
else:
    print("Error: The 'data_frame' variable was not found. Please run the previous cells first.")

Threshold for dropping is > 23866 null values.
Found 139 columns to remove: ['VAR5', 'VAR7', 'VAR8', 'VAR9', 'VAR12', 'VAR13', 'VAR14', 'VAR15', 'VAR16', 'VAR17', 'VAR19', 'VAR20', 'VAR21', 'VAR23', 'VAR24', 'VAR25', 'VAR26', 'VAR27', 'VAR28', 'VAR29', 'VAR30', 'VAR34', 'VAR35', 'VAR37', 'VAR38', 'VAR39', 'VAR40', 'VAR42', 'VAR43', 'VAR45', 'VAR46', 'VAR47', 'VAR48', 'VAR49', 'VAR50', 'VAR51', 'VAR53', 'VAR54', 'VAR55', 'VAR56', 'VAR57', 'VAR59', 'VAR60', 'VAR61', 'VAR64', 'VAR65', 'VAR67', 'VAR68', 'VAR71', 'VAR72', 'VAR73', 'VAR74', 'VAR75', 'VAR77', 'VAR79', 'VAR80', 'VAR81', 'VAR82', 'VAR83', 'VAR84', 'VAR85', 'VAR86', 'VAR87', 'VAR88', 'VAR89', 'VAR90', 'VAR91', 'VAR92', 'VAR94', 'VAR95', 'VAR96', 'VAR97', 'VAR98', 'VAR102', 'VAR105', 'VAR108', 'VAR110', 'VAR111', 'VAR113', 'VAR115', 'VAR116', 'VAR118', 'VAR120', 'VAR121', 'VAR123', 'VAR125', 'VAR126', 'VAR129', 'VAR130', 'VAR131', 'VAR133', 'VAR134', 'VAR135', 'VAR136', 'VAR137', 'VAR138', 'VAR139', 'VAR142', 'VAR143', 'VAR145', 

VAR31             23734
VAR127            23150
VAR119            22819
VAR33             20785
VAR36             20785
                  ...  
VAR62               343
VAR114              343
VAR128              343
VAR155              343
vlr_financiado      138
Length: 62, dtype: int64

In [10]:
# Tally how many columns there are of each dtype.
if 'data_frame' in locals():
    print("--- Distribution of Data Types in the DataFrame ---")
    dtype_counts = data_frame.dtypes.value_counts()
    display(dtype_counts)
else:
    print("Error: The 'data_frame' variable was not found. Please run the previous cells first.")

--- Distribution of Data Types in the DataFrame ---


float64    51
object     14
int64       1
Name: count, dtype: int64

In [11]:
# Preview the columns stored with dtype 'object'.
if 'data_frame' in locals():
    object_columns = data_frame.select_dtypes(include=['object']).columns
    n_object_columns = len(object_columns)

    print(f"--- Showing first 5 rows for the {n_object_columns} 'object' columns ---")
    object_preview = data_frame[object_columns].head()
    display(object_preview)
else:
    print("Error: The 'data_frame' variable was not found. Please run the previous cells first.")

--- Showing first 5 rows for the 14 'object' columns ---


Unnamed: 0,id,documento,VAR31,VAR41,VAR52,VAR100,VAR103,VAR104,VAR112,VAR119,VAR122,VAR175,VAR185,VAR199
0,4db9c75f6a31c73414ad84fdd101b5d7,d92fa549809f55b4ea13caa1b108896f,N,ALTA,F,LONGE,MEDIA,S,ALTA,C,N,A,ALTA,SUDESTE
1,9b07d4001632189d19de964894501e34,1af133c98723189f24bbfcb75abfcf92,,ALTISSIMA,,PROXIMO,SEM PASSAGEM,,ALTISSIMA,,N,,ALTISSIMA,NORTE
2,d55b6a81203f5a5ae6f362c483f36ff1,9e95be945b56ff10dc43df22381c9002,N,ALTA,A,LONGE,SEM PASSAGEM,S,ALTA,L,N,A,ALTA,SUL
3,f6bb36049c3b42a62dfa88c46e0f79d2,50502098d83c37b5fa60241bc4bd6641,N,ALTA,A,PROXIMO,SEM PASSAGEM,S,ALTISSIMA,,N,A,ALTISSIMA,NORTE
4,2755ed6330b4980ea24faa1081183dc7,aad24a7552d24b46188103f4249f17e7,N,MEDIA,,PROXIMO,SEM PASSAGEM,S,MEDIA,M,N,,MEDIA,NORDESTE


In [12]:
def classify_columns_by_cardinality(df, id_uniqueness_threshold=0.95, low_cardinality_threshold=50):
    """Bucket the columns of ``df`` by their number of distinct non-null values.

    Each column lands in at most one bucket, checked in this order:

    1. identifier: distinct values / rows >= ``id_uniqueness_threshold``
    2. binary: exactly 2 distinct values
    3. low cardinality: at most ``low_cardinality_threshold`` distinct values
       (this includes constant and all-null columns)

    Args:
        df: DataFrame whose columns are classified.
        id_uniqueness_threshold: minimum unique-to-rows ratio for an identifier.
        low_cardinality_threshold: maximum distinct count for low cardinality.

    Returns:
        Tuple ``(identifier_cols, binary_cols, low_cardinality)`` where the
        first two are lists of column names and the third maps column name to
        its distinct-value count, all in the frame's column order.
    """
    n_rows = len(df)
    identifiers, binaries, low_cardinality = [], [], {}

    for col in df.columns:
        n_unique = df[col].nunique()
        # An empty frame has no meaningful ratio; avoid dividing by zero.
        unique_ratio = n_unique / n_rows if n_rows else 0.0

        if unique_ratio >= id_uniqueness_threshold:
            identifiers.append(col)
        elif n_unique == 2:
            binaries.append(col)
        elif n_unique <= low_cardinality_threshold:
            low_cardinality[col] = n_unique

    return identifiers, binaries, low_cardinality


if 'data_frame' not in locals():
    print("Error: The 'data_frame' variable was not found. Please run the previous cells first.")
else:
    # Threshold for a column to be considered an identifier (e.g., 95% unique values)
    ID_UNIQUENESS_THRESHOLD = 0.95
    # Threshold for a column to be considered low cardinality categorical
    LOW_CARDINALITY_THRESHOLD = 50

    total_rows = len(data_frame)

    identifier_cols, binary_cols, low_cardinality_categorical = classify_columns_by_cardinality(
        data_frame,
        id_uniqueness_threshold=ID_UNIQUENESS_THRESHOLD,
        low_cardinality_threshold=LOW_CARDINALITY_THRESHOLD,
    )

    print("--- Identifier Columns ---")
    print(f"Found {len(identifier_cols)} columns with >= {ID_UNIQUENESS_THRESHOLD*100}% unique values.")
    print(identifier_cols)

    print("\n--- Binary Columns ---")
    print(f"Found {len(binary_cols)} columns with 2 unique values.")
    print(binary_cols)

    # This header was a plain string before, so the threshold placeholder was
    # printed literally instead of being interpolated.
    print(f"\n--- Low Cardinality Categorical Columns (<= {LOW_CARDINALITY_THRESHOLD} unique values) ---")
    print(f"Found {len(low_cardinality_categorical)} columns for inspection.")

    # Show the lowest-cardinality columns first
    sorted_low_cardinality = dict(sorted(low_cardinality_categorical.items(), key=lambda item: item[1]))
    print(sorted_low_cardinality)

--- Identifier Columns ---
Found 2 columns with >= 95.0% unique values.
['id', 'documento']

--- Binary Columns ---
Found 11 columns with 2 unique values.
['alvo', 'VAR3', 'VAR31', 'VAR63', 'VAR78', 'VAR104', 'VAR114', 'VAR122', 'VAR124', 'VAR147', 'VAR188']

--- Low Cardinality Categorical Columns (<= {LOW_CARDINALITY_THRESHOLD} unique values) ---
Found 42 columns for inspection.
{'VAR6': 1, 'VAR76': 1, 'VAR117': 1, 'VAR127': 1, 'VAR140': 1, 'VAR153': 1, 'VAR163': 1, 'VAR2': 3, 'VAR41': 4, 'VAR167': 4, 'VAR180': 4, 'VAR185': 4, 'VAR62': 5, 'VAR93': 5, 'VAR100': 5, 'VAR103': 5, 'VAR112': 5, 'VAR199': 5, 'mes_ref': 6, 'VAR36': 6, 'VAR66': 6, 'VAR69': 6, 'VAR99': 6, 'VAR155': 6, 'VAR197': 7, 'VAR101': 8, 'VAR128': 8, 'VAR33': 9, 'VAR132': 9, 'VAR52': 11, 'VAR141': 11, 'VAR175': 11, 'VAR200': 12, 'VAR1': 13, 'VAR154': 13, 'VAR58': 15, 'VAR109': 18, 'VAR119': 19, 'VAR22': 28, 'VAR172': 31, 'VAR11': 44, 'VAR70': 45}


In [13]:
# Remove columns that carry no predictive signal: the identifier columns found
# earlier and any column with a single distinct value.
if 'data_frame' not in locals() or 'identifier_cols' not in locals():
    print("Error: Required variables ('data_frame', 'identifier_cols') not found. Please run the previous cells first.")
else:
    print(f"Original DataFrame shape: {data_frame.shape}")

    # Drop only the identifier columns still present, so re-running this cell
    # is a no-op instead of raising KeyError on already-removed columns.
    identifier_cols_present = [col for col in identifier_cols if col in data_frame.columns]

    print(f"\nRemoving {len(identifier_cols_present)} identifier columns...")
    data_frame.drop(columns=identifier_cols_present, inplace=True)
    print(f"Columns removed: {identifier_cols_present}")

    # nunique() ignores NaN by default, so a column holding one value plus
    # missing entries also counts as single-valued and is removed here.
    nunique_counts = data_frame.nunique()
    cols_with_one_value = nunique_counts[nunique_counts == 1].index.tolist()

    if cols_with_one_value:
        print(f"\nFound and removing {len(cols_with_one_value)} columns with only one unique value...")
        data_frame.drop(columns=cols_with_one_value, inplace=True)
        print(f"Columns removed: {cols_with_one_value}")
    else:
        print("\nNo columns with a single unique value were found.")

    print("\n" + "="*50)
    print(f"Final DataFrame shape after removals: {data_frame.shape}")


Original DataFrame shape: (47732, 66)

Removing 2 identifier columns...
Columns removed: ['id', 'documento']

Found and removing 7 columns with only one unique value...
Columns removed: ['VAR6', 'VAR76', 'VAR117', 'VAR127', 'VAR140', 'VAR153', 'VAR163']

Final DataFrame shape after removals: (47732, 57)


In [14]:

# Show the class balance of the target column 'alvo' as counts and percentages.
if 'data_frame' in locals() and 'alvo' in data_frame.columns:
    target_series = data_frame['alvo']

    # Absolute frequency of each target value
    print("--- Target Variable Distribution (Counts) ---")
    target_counts = target_series.value_counts()
    display(target_counts)

    # Relative frequency, rendered with two decimals and a percent sign
    print("\n--- Target Variable Distribution (Percentage) ---")
    target_percentages = target_series.value_counts(normalize=True) * 100
    display(target_percentages.map(lambda pct: '{:.2f}%'.format(pct)))
elif 'data_frame' not in locals():
    print("Error: The 'data_frame' variable was not found. Please run the previous cells first.")
else:
    print("Error: The target column 'alvo' was not found in the DataFrame.")

--- Target Variable Distribution (Counts) ---


alvo
0.0    45746
1.0     1986
Name: count, dtype: int64


--- Target Variable Distribution (Percentage) ---


alvo
0.0    95.84%
1.0     4.16%
Name: proportion, dtype: object

# Análise de variáveis

In [15]:
# Plotting libraries used by the analysis cells below.
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np  # NOTE: numpy is already imported as `np` in the first cell; this re-import is redundant but harmless

# Apply seaborn's "whitegrid" theme to the plots created after this point
sns.set_theme(style="whitegrid")

# IPython magic (not plain Python): render matplotlib figures inline in the notebook output
%matplotlib inline

print("Visualization libraries imported and theme set.")

Visualization libraries imported and theme set.


## Binárias

In [16]:
# For each binary feature, show the row-normalized distribution of the target
# 'alvo' (what share of each feature level falls in each target class).
if 'data_frame' in locals() and 'binary_cols' in locals() and 'alvo' in data_frame.columns:
    print("--- Analyzing relationship between binary features and the 'alvo' target ---")

    separator = "=" * 50

    # The target is itself binary; skip it so it is not cross-tabulated with itself
    for col in (name for name in binary_cols if name != 'alvo'):
        print("\n" + separator)
        print(f"Analysis for Binary Column: '{col}'")
        print(separator)

        # Each row of the crosstab sums to 100%
        print("\n[Percentage by Row]")
        crosstab_perc = pd.crosstab(data_frame[col], data_frame['alvo'], normalize='index') * 100
        display(crosstab_perc.map(lambda pct: '{:.2f}%'.format(pct)))
elif 'data_frame' not in locals() or 'binary_cols' not in locals():
    print("Error: Required variables ('data_frame', 'binary_cols') not found. Please run the previous cells first.")
else:
    print("Error: The target column 'alvo' was not found in the DataFrame.")

--- Analyzing relationship between binary features and the 'alvo' target ---

Analysis for Binary Column: 'VAR3'

[Percentage by Row]


alvo,0.0,1.0
VAR3,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,96.57%,3.43%
1.0,60.92%,39.08%



Analysis for Binary Column: 'VAR31'

[Percentage by Row]


alvo,0.0,1.0
VAR31,Unnamed: 1_level_1,Unnamed: 2_level_1
N,96.09%,3.91%
S,97.50%,2.50%



Analysis for Binary Column: 'VAR63'

[Percentage by Row]


alvo,0.0,1.0
VAR63,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,95.17%,4.83%
1.0,95.46%,4.54%



Analysis for Binary Column: 'VAR78'

[Percentage by Row]


alvo,0.0,1.0
VAR78,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,96.35%,3.65%
1.0,52.56%,47.44%



Analysis for Binary Column: 'VAR104'

[Percentage by Row]


alvo,0.0,1.0
VAR104,Unnamed: 1_level_1,Unnamed: 2_level_1
N,95.85%,4.15%
S,96.07%,3.93%



Analysis for Binary Column: 'VAR114'

[Percentage by Row]


alvo,0.0,1.0
VAR114,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,95.88%,4.12%
1.0,95.90%,4.10%



Analysis for Binary Column: 'VAR122'

[Percentage by Row]


alvo,0.0,1.0
VAR122,Unnamed: 1_level_1,Unnamed: 2_level_1
N,95.79%,4.21%
S,95.99%,4.01%



Analysis for Binary Column: 'VAR124'

[Percentage by Row]


alvo,0.0,1.0
VAR124,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,96.64%,3.36%
1.0,64.57%,35.43%



Analysis for Binary Column: 'VAR147'

[Percentage by Row]


alvo,0.0,1.0
VAR147,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,95.61%,4.39%
1.0,66.76%,33.24%



Analysis for Binary Column: 'VAR188'

[Percentage by Row]


alvo,0.0,1.0
VAR188,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,95.90%,4.10%
1.0,94.49%,5.51%


In [17]:
# Manual split of the binary features. Presumably based on the crosstabs above:
# the priority flags show a much higher 'alvo' rate at their 1.0 level, while
# the low-priority ones barely differ between levels — confirm the criterion.
binary_cols_prio = [
    'VAR3',
    'VAR78',
    'VAR124',
    'VAR147',
]
binary_cols_low_prio = [
    'VAR31',
    'VAR63',
    'VAR104',
    'VAR114',
    'VAR122',
    'VAR188',
]

In [18]:
# For each priority binary flag, report how often it is set and the average
# of the target 'alvo' at each flag level.

if 'data_frame' in locals():
    banner = "=" * 50

    for col in binary_cols_prio:
        if col in data_frame.columns:
            print("\n" + banner)
            print(f"Analysis for Priority Column: '{col}'")
            print(banner)

            # Mean of the flag, as a percentage. pandas skips NaN by default,
            # so this is relative to the rows where the flag is present.
            flag_coverage = data_frame[col].mean() * 100
            print(f"\nFlag Coverage (% of 1s): {flag_coverage:.2f}%")

            # Average of the target within each level of the flag
            print("\nConditional Default Rate (Average of 'alvo'):")
            conditional_rate = data_frame.groupby(col)['alvo'].mean()
            display(conditional_rate.map(lambda rate: '{:.2%}'.format(rate)))
        else:
            print(f"\nWarning: Column '{col}' not found. Skipping.")
else:
    print("Error: 'data_frame' not found. Please run previous cells.")


Analysis for Priority Column: 'VAR3'

Flag Coverage (% of 1s): 3.14%

Conditional Default Rate (Average of 'alvo'):


VAR3
0.0     3.43%
1.0    39.08%
Name: alvo, dtype: object


Analysis for Priority Column: 'VAR78'

Flag Coverage (% of 1s): 2.05%

Conditional Default Rate (Average of 'alvo'):


VAR78
0.0     3.65%
1.0    47.44%
Name: alvo, dtype: object


Analysis for Priority Column: 'VAR124'

Flag Coverage (% of 1s): 3.70%

Conditional Default Rate (Average of 'alvo'):


VAR124
0.0     3.36%
1.0    35.43%
Name: alvo, dtype: object


Analysis for Priority Column: 'VAR147'

Flag Coverage (% of 1s): 1.28%

Conditional Default Rate (Average of 'alvo'):


VAR147
0.0     4.39%
1.0    33.24%
Name: alvo, dtype: object

In [None]:
# Analyze flag impact on subgroups: the average of 'alvo' per flag level,
# broken down by financed-amount quartile and by reference month.

if 'data_frame' not in locals():
    print("Error: 'data_frame' not found. Please run previous cells.")
else:
    # Choose one of the priority flags to analyze within subgroups
    flag_to_analyze = 'VAR3'

    if flag_to_analyze not in data_frame.columns:
        print(f"Error: Chosen flag '{flag_to_analyze}' not in DataFrame.")
    else:
        # --- Subgroup by 'vlr_financiado' ---
        print(f"--- Analyzing '{flag_to_analyze}' across 'vlr_financiado' quantiles ---")

        # Discretize 'vlr_financiado' into 4 quantiles (quartiles).
        # NOTE: this adds a helper column to the shared frame; exclude
        # 'vlr_financiado_quantile' from any later feature selection.
        data_frame['vlr_financiado_quantile'] = pd.qcut(data_frame['vlr_financiado'], q=4, duplicates='drop')

        # The index is categorical, so `observed` is passed explicitly.
        # NOTE(review): the original run warned on this call (text not
        # preserved), presumably about the deprecated implicit default of
        # `observed`; False keeps the behavior the cell had before — confirm.
        pivot_vlr = pd.pivot_table(data_frame,
                                   values='alvo',
                                   index='vlr_financiado_quantile',
                                   columns=flag_to_analyze,
                                   aggfunc='mean',
                                   observed=False)

        display(pivot_vlr.style.format('{:.2%}').background_gradient(cmap='YlOrRd'))


        # --- Subgroup by 'mes_ref' ---
        print(f"\n\n--- Analyzing '{flag_to_analyze}' across 'mes_ref' ---")

        pivot_mes = pd.pivot_table(data_frame,
                                   values='alvo',
                                   index='mes_ref',
                                   columns=flag_to_analyze,
                                   aggfunc='mean')

        display(pivot_mes.style.format('{:.2%}').background_gradient(cmap='YlOrRd'))

--- Analyzing 'VAR3' across 'vlr_financiado' quantiles ---


  pivot_vlr = pd.pivot_table(data_frame,


VAR3,0.000000,1.000000
vlr_financiado_quantile,Unnamed: 1_level_1,Unnamed: 2_level_1
"(2427.879, 15742.62]",1.65%,17.83%
"(15742.62, 18836.81]",2.37%,17.86%
"(18836.81, 22276.5]",3.62%,38.33%
"(22276.5, 91283.65]",5.84%,53.99%




--- Analyzing 'VAR3' across 'mes_ref' ---


VAR3,0.000000,1.000000
mes_ref,Unnamed: 1_level_1,Unnamed: 2_level_1
202307,3.39%,40.35%
202308,3.88%,41.46%
202309,3.21%,36.05%
202310,3.23%,36.70%
202311,3.25%,47.22%
202312,3.57%,34.34%


In [None]:
# Analyze similarity between binary variables

def binary_similarity_matrix(df, cols):
    """Pairwise simple matching coefficient between binary columns of ``df``.

    Each column is first encoded as integer codes by ordering its distinct
    values (0.0/1.0 -> 0/1, 'N'/'S' -> 0/1), so numeric and string flags can
    be compared with each other. This assumes the last-sorted value is the
    "positive" level of every flag — confirm for columns encoded differently.

    Rows where either column is missing are excluded from that pair's
    coefficient instead of being counted as a mismatch.

    Args:
        df: DataFrame containing the columns.
        cols: names of the binary columns to compare.

    Returns:
        Square float DataFrame indexed and labelled by ``cols``, with 1.0 on
        the diagonal and NaN for pairs that share no complete rows.
    """
    cols = list(cols)
    # factorize marks missing values with the code -1
    codes = {col: pd.factorize(df[col], sort=True)[0] for col in cols}

    matrix = pd.DataFrame(np.nan, index=cols, columns=cols, dtype=float)

    # A column always matches itself. Set through .loc rather than writing
    # into `.values`, which is not guaranteed to be a writable view.
    for col in cols:
        matrix.loc[col, col] = 1.0

    for col1, col2 in combinations(cols, 2):
        both_present = (codes[col1] >= 0) & (codes[col2] >= 0)
        if both_present.any():
            similarity = float((codes[col1][both_present] == codes[col2][both_present]).mean())
        else:
            similarity = np.nan
        matrix.loc[col1, col2] = similarity
        matrix.loc[col2, col1] = similarity

    return matrix


if 'data_frame' not in locals() or 'binary_cols' not in locals():
    print("Error: Required variables ('data_frame', 'binary_cols') not found. Please run previous cells.")
else:
    # Select only the binary columns for this analysis
    binary_df = data_frame[binary_cols]

    similarity_matrix = binary_similarity_matrix(binary_df, binary_cols)

    print("--- Similarity Matrix (Simple Matching Coefficient) ---")
    display(similarity_matrix)

    # --- Visualize the matrix as a heatmap ---
    print("\n--- Heatmap of Binary Variable Similarity ---")
    plt.figure(figsize=(12, 10))

    # Create the heatmap
    sns.heatmap(
        similarity_matrix,
        annot=True,          # Show the similarity values on the map
        cmap='viridis',      # Color scheme
        fmt='.2f'            # Format numbers to 2 decimal places
    )

    plt.title('Similarity Matrix of Binary Variables', fontsize=16)
    plt.show()