In [1]:
import pandas as pd
import re
from imblearn.over_sampling import ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Import data
train = pd.read_csv("../data/train_complete.csv")
test = pd.read_csv("../data/test_complete.csv")

In [3]:
train["P1_i_2"]

0            Sudeste
1            Sudeste
2       Centro-oeste
3           Nordeste
4           Nordeste
            ...     
3796         Sudeste
3797         Sudeste
3798         Sudeste
3799        Nordeste
3800         Sudeste
Name: P1_i_2, Length: 3801, dtype: object

## Resampling

In [4]:
# Dataprep - toda transformação deve ser criada sobre o teste e somente aplicada sobre o treino
def apply_random_oversampling(df, oversample_col='', exclude_category=None):
    '''
    df: dataframe with all columns
    oversample_col: column to apply the oversampling
    exclude_category: list of categories on oversample_col to set apart while oversampling
    '''
    df_cat = pd.DataFrame(columns = df.columns)
    df_ = df.copy()
    
    # Separando categorias específica se necessário
    if exclude_category is not None:
        
        for cat in exclude_category:
            
            df_cat = pd.concat([df_cat, df[df[oversample_col]==cat]]).reset_index(drop=True)
            
            # removendo categorias que não serão oversampled
            df_ = df_[df[oversample_col]!=cat].reset_index()
    
    # Separando coluna de oversampling
    X = df_.drop(columns=[oversample_col])
    y = df_[oversample_col]

    # Aplicando Random Oversampling
    oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)

    df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    df_resampled[oversample_col] = y_resampled

    # Exibir o DataFrame balanceado
    print(df_resampled[oversample_col].value_counts())
    
    # inserindo de volta as categorias que foram separadas
    if not df_cat.empty:
        df_resampled = pd.concat([df_resampled, df_cat]).reset_index(drop=True)
        
    return df_resampled

In [5]:
def apply_random_undersampling(df, undersample_col='', strategy={}):
    '''
    df: dataframe with all columns
    undersample_col: column to apply the undersampling
    srtategy: dictionary with the category as key and number of samples requested as value
    '''

    X = df.drop(columns=[undersample_col])
    y = df[undersample_col]

    # aplicando undersampling
    undersampler = RandomUnderSampler(sampling_strategy=strategy, random_state=42)
    X_resampled, y_resampled = undersampler.fit_resample(X, y)

    df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    df_resampled[undersample_col] = y_resampled

    # Exibir o DataFrame balanceado e a distribuição de classes
    print(df_resampled[undersample_col].value_counts())
    
    return df_resampled

In [6]:
col_raca = 'P1_c'
col_genero = 'P1_b'
col_idade ='P1_a_1'
# col_regiao = 'P1_i_2';

cols_resampling = [col_raca, col_genero, col_idade ] # , col_regiao]

df_train_resampled = train.copy()

for col in cols_resampling:
    df_train_resampled = apply_random_oversampling(df_train_resampled, oversample_col=col)

  df_resampled[oversample_col] = y_resampled


P1_c
Branca                  2452
Preta                   2452
Parda                   2452
Amarela                 2452
Outra                   2452
Prefiro não informar    2452
Indígena                2452
Name: count, dtype: int64


  df_resampled[oversample_col] = y_resampled


P1_b
Masculino               12709
Feminino                12709
Prefiro não informar    12709
Outro                   12709
Name: count, dtype: int64
P1_a_1
25-29    15468
30-34    15468
40-44    15468
17-21    15468
35-39    15468
22-24    15468
45-49    15468
50-54    15468
55+      15468
Name: count, dtype: int64


  df_resampled[oversample_col] = y_resampled


In [7]:
len(train[[col for col in train.columns if "(OrdEnc)" in col]].columns)

6

In [8]:
onehot = ["P1_m", "P3_c","P4_a", "P4_a2", "P4_b", "P4_c", "P4_d", "P4_e", "P4_g", "P4_j", "P5_b", "P6_a", "P6_b", "P6_g", "P6_h", "P7_a", "P7_b", "P7_d", "P8_a", "P8_b", "P8_c", "P8_d", "P2_b", "P2_d", "P2_f", "P2_o", "P2_r", "P6_c", "P6_d", "P6_e", "P6_f"]
ordered = ["P1_l", "P2_e", "P2_g", "P2_i", "P2_j", "P2_h"]
sensible = ['P1_c', 'P1_b','P1_a_1','P1_i_2', 'P1_i_1']

# df_feat_selected = df_resampled[ [x +endswith("(OneHot)") for x in onehot ] + [x +endswith("(OrdEnc)") for x in ordered] ]


In [9]:
onehot_cols = [col for col in df_train_resampled.columns if any(col.startswith(c) for c in onehot + sensible) and (col.endswith('(OneHot)') or col.endswith('(FromList)'))]
ordered_cols = [col for col in df_train_resampled.columns if any(col.startswith(c) for c in ordered) and col.endswith('(OrdEnc)')]
df_train_selected = df_train_resampled[onehot_cols+ordered_cols]

len(ordered_cols)

6

In [10]:
df_train_selected[[col for col in df_train_selected.columns if "P1_c" in col]].head(1)

Unnamed: 0,P1_c_Amarela(OneHot),P1_c_Branca(OneHot),P1_c_Indígena(OneHot),P1_c_Outra(OneHot),P1_c_Parda(OneHot),P1_c_Prefiro não informar(OneHot),P1_c_Preta(OneHot)
0,False,True,False,False,False,False,False


In [11]:
# for sens_col in sensible:
#     df_feat_selected = pd.concat([df_feat_selected, pd.get_dummies(df_feat_selected[sens_col])], axis=1)

df_train_selected.columns

Index(['P2_o_1(FromList)', 'P2_o_2(FromList)', 'P2_o_3(FromList)',
       'P2_o_4(FromList)', 'P2_o_5(FromList)', 'P2_o_6(FromList)',
       'P2_o_7(FromList)', 'P2_o_8(FromList)', 'P2_o_9(FromList)',
       'P2_o_10(FromList)',
       ...
       'P6_g_Postgres/MySQL(OneHot)', 'P6_g_Presto(OneHot)',
       'P6_g_Snowflake(OneHot)', 'P6_g_Teradata(OneHot)', 'P1_l(OrdEnc)',
       'P2_e(OrdEnc)', 'P2_g(OrdEnc)', 'P2_i(OrdEnc)', 'P2_j(OrdEnc)',
       'P2_h(OrdEnc)'],
      dtype='object', length=335)

In [12]:
test_feat_selected = test[onehot_cols+ordered_cols]

# for sens_col in sensible:
#     test_feat_selected = pd.concat([test_feat_selected, pd.get_dummies(test_feat_selected[sens_col])], axis=1)

In [13]:
X_train = df_train_selected.drop(columns=['P2_h(OrdEnc)'])
y_train = df_train_selected['P2_h(OrdEnc)']

X_test = test_feat_selected.drop(columns=['P2_h(OrdEnc)'])
y_test = test_feat_selected['P2_h(OrdEnc)']

# Teste
# gambiarra porque no teste não tinha nenhuma classe indígena, então a coluna não foi criada no get dummies
X_test['Indígena'] = 0

# Preserve order
X_test = X_test[X_train.columns]

In [14]:
print(len(X_train.columns))
print(len(X_test.columns))

X_train.columns

334
334


Index(['P2_o_1(FromList)', 'P2_o_2(FromList)', 'P2_o_3(FromList)',
       'P2_o_4(FromList)', 'P2_o_5(FromList)', 'P2_o_6(FromList)',
       'P2_o_7(FromList)', 'P2_o_8(FromList)', 'P2_o_9(FromList)',
       'P2_o_10(FromList)',
       ...
       'P6_g_Oracle(OneHot)', 'P6_g_Postgres/MySQL(OneHot)',
       'P6_g_Presto(OneHot)', 'P6_g_Snowflake(OneHot)',
       'P6_g_Teradata(OneHot)', 'P1_l(OrdEnc)', 'P2_e(OrdEnc)', 'P2_g(OrdEnc)',
       'P2_i(OrdEnc)', 'P2_j(OrdEnc)'],
      dtype='object', length=334)

In [15]:
# See if order of train and test columns are exactly the same
assert (X_train.columns == X_test.columns).all()

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import joblib

In [34]:
X_train[[col for col in X_train.columns if "P1_c" in col]]

Unnamed: 0,P1_c_Amarela(OneHot),P1_c_Branca(OneHot),P1_c_Indígena(OneHot),P1_c_Outra(OneHot),P1_c_Parda(OneHot),P1_c_Prefiro não informar(OneHot),P1_c_Preta(OneHot)
0,False,True,False,False,False,False,False
1,False,False,False,False,False,False,True
2,False,True,False,False,False,False,False
3,False,True,False,False,False,False,False
4,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...
139207,True,False,False,False,False,False,False
139208,False,True,False,False,False,False,False
139209,False,False,False,False,True,False,False
139210,False,True,False,False,False,False,False


In [18]:
# Logistic Regression
logistic_regression = LogisticRegression(max_iter=100)
logistic_regression.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
# KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

In [20]:
# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)

In [21]:
# Fit the QDA model with the balanced dataset
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)



In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [23]:
set(X_train.columns)-set(X_test.columns)

set()

In [24]:
X_train.columns

Index(['P2_o_1(FromList)', 'P2_o_2(FromList)', 'P2_o_3(FromList)',
       'P2_o_4(FromList)', 'P2_o_5(FromList)', 'P2_o_6(FromList)',
       'P2_o_7(FromList)', 'P2_o_8(FromList)', 'P2_o_9(FromList)',
       'P2_o_10(FromList)',
       ...
       'P6_g_Oracle(OneHot)', 'P6_g_Postgres/MySQL(OneHot)',
       'P6_g_Presto(OneHot)', 'P6_g_Snowflake(OneHot)',
       'P6_g_Teradata(OneHot)', 'P1_l(OrdEnc)', 'P2_e(OrdEnc)', 'P2_g(OrdEnc)',
       'P2_i(OrdEnc)', 'P2_j(OrdEnc)'],
      dtype='object', length=334)

In [25]:
X_test.columns

Index(['P2_o_1(FromList)', 'P2_o_2(FromList)', 'P2_o_3(FromList)',
       'P2_o_4(FromList)', 'P2_o_5(FromList)', 'P2_o_6(FromList)',
       'P2_o_7(FromList)', 'P2_o_8(FromList)', 'P2_o_9(FromList)',
       'P2_o_10(FromList)',
       ...
       'P6_g_Oracle(OneHot)', 'P6_g_Postgres/MySQL(OneHot)',
       'P6_g_Presto(OneHot)', 'P6_g_Snowflake(OneHot)',
       'P6_g_Teradata(OneHot)', 'P1_l(OrdEnc)', 'P2_e(OrdEnc)', 'P2_g(OrdEnc)',
       'P2_i(OrdEnc)', 'P2_j(OrdEnc)'],
      dtype='object', length=334)

In [26]:
cols_when_model_fited = logistic_regression.feature_names_in_

set(cols_when_model_fited) - set(X_test[X_train.columns])

set()

In [27]:
models = {
    'QDA': qda,
    'Logistic Regression': logistic_regression,
    'KNN': knn,
    'Decision Tree': decision_tree
}

cols_when_model_fited = logistic_regression.feature_names_in_

results = []
for model_name, model in models.items():
    #X_test_aligned = X_test[X_train.columns]
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average=None, zero_division=0)
    recall = recall_score(y_test, y_pred, average=None, zero_division=0)
    f1 = f1_score(y_test, y_pred, average=None, zero_division=0)
    results.append([model_name, accuracy, precision, recall, f1])

# Store the results in a DataFrame
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])

# Print the results
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,QDA,0.233438,"[0.0, 0.0, 0.0, 0.0, 0.256198347107438, 0.1468...","[0.0, 0.0, 0.0, 0.0, 0.20945945945945946, 0.33...","[0.0, 0.0, 0.0, 0.0, 0.23048327137546468, 0.20..."
1,Logistic Regression,0.260778,"[0.0, 0.23529411764705882, 0.18461538461538463...","[0.0, 0.20512820512820512, 0.19047619047619047...","[0.0, 0.2191780821917808, 0.1875, 0.2278481012..."
2,KNN,0.26183,"[0.5, 0.2127659574468085, 0.2222222222222222, ...","[0.2222222222222222, 0.2564102564102564, 0.253...","[0.3076923076923077, 0.23255813953488372, 0.23..."
3,Decision Tree,0.24816,"[0.2, 0.21875, 0.2028985507246377, 0.106060606...","[0.1111111111111111, 0.1794871794871795, 0.222...","[0.14285714285714285, 0.19718309859154928, 0.2..."


In [39]:
# Define sens_col
sens_col = 'P1_b_Feminino(OneHot)'

# Use proper boolean indexing
test = X_test[X_test[sens_col] == True]

test[sens_col]


3      True
8      True
10     True
22     True
24     True
       ... 
927    True
930    True
932    True
935    True
938    True
Name: P1_b_Feminino(OneHot), Length: 213, dtype: bool

In [42]:
# Teste POR CLASSE SENSÍVEL
model = logistic_regression

onehot_sensible_cols = [col for col in df_train_resampled.columns if any(col.startswith(c) for c in onehot + sensible) and (col.endswith('(OneHot)') or col.endswith('(FromList)'))]

for sens_col in onehot_sensible_cols:
    X_test_filtered = X_test[X_test[sens_col] == True]
    y_test_filtered = y_test[X_test[sens_col] == True]
    y_pred = model.predict(X_test_filtered)
    accuracy = accuracy_score(y_test_filtered, y_pred)
    f1 = f1_score(y_test_filtered, y_pred, average='macro', zero_division=0)
    print(f"Classe {sens_col}: acc={accuracy} / f1={f1}\n")


Classe P2_o_1(FromList): acc=0.2626788036410923 / f1=0.1909191198527413

Classe P2_o_2(FromList): acc=0.2875536480686695 / f1=0.1538532568795727

Classe P2_o_3(FromList): acc=0.2535211267605634 / f1=0.2135007259654243

Classe P2_o_4(FromList): acc=0.26465364120781526 / f1=0.16551379617935266

Classe P2_o_5(FromList): acc=0.28823529411764703 / f1=0.21502644068568144

Classe P2_o_6(FromList): acc=0.23786407766990292 / f1=0.17974650048953456

Classe P2_o_7(FromList): acc=0.2621359223300971 / f1=0.21709918466264314

Classe P2_o_8(FromList): acc=0.22131147540983606 / f1=0.18137060784371997

Classe P2_o_9(FromList): acc=0.2835820895522388 / f1=0.1848013524936602

Classe P2_o_10(FromList): acc=0.22807017543859648 / f1=0.15978287010895706

Classe P3_c_1(FromList): acc=0.21929824561403508 / f1=0.15095330044773925

Classe P3_c_2(FromList): acc=0.23863636363636365 / f1=0.17214912280701755

Classe P3_c_3(FromList): acc=0.1927710843373494 / f1=0.1506977992485239

Classe P3_c_4(FromList): acc=0.2027

ValueError: Found array with 0 sample(s) (shape=(0, 334)) while a minimum of 1 is required by LogisticRegression.