In [1]:
import pandas as pd
import re
from imblearn.over_sampling import ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler



In [2]:
# Import data
train = pd.read_csv("../data/train_complete.csv")
test = pd.read_csv("../data/test_complete.csv")

## Resampling

In [3]:
# Dataprep - toda transformação deve ser criada sobre o teste e somente aplicada sobre o treino
def apply_random_oversampling(df, oversample_col='', exclude_category=None):
    '''
    df: dataframe with all columns
    oversample_col: column to apply the oversampling
    exclude_category: list of categories on oversample_col to set apart while oversampling
    '''
    df_cat = pd.DataFrame(columns = df.columns)
    df_ = df.copy()
    
    # Separando categorias específica se necessário
    if exclude_category is not None:
        
        for cat in exclude_category:
            
            df_cat = pd.concat([df_cat, df[df[oversample_col]==cat]]).reset_index(drop=True)
            
            # removendo categorias que não serão oversampled
            df_ = df_[df[oversample_col]!=cat].reset_index()
    
    # Separando coluna de oversampling
    X = df_.drop(columns=[oversample_col])
    y = df_[oversample_col]

    # Aplicando Random Oversampling
    oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)

    df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    df_resampled[oversample_col] = y_resampled

    # Exibir o DataFrame balanceado
    print(df_resampled[oversample_col].value_counts())
    
    # inserindo de volta as categorias que foram separadas
    if not df_cat.empty:
        df_resampled = pd.concat([df_resampled, df_cat]).reset_index(drop=True)
        
    return df_resampled

In [4]:
def apply_random_undersampling(df, undersample_col='', strategy={}):
    '''
    df: dataframe with all columns
    undersample_col: column to apply the undersampling
    srtategy: dictionary with the category as key and number of samples requested as value
    '''

    X = df.drop(columns=[undersample_col])
    y = df[undersample_col]

    # aplicando undersampling
    undersampler = RandomUnderSampler(sampling_strategy=strategy, random_state=42)
    X_resampled, y_resampled = undersampler.fit_resample(X, y)

    df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    df_resampled[undersample_col] = y_resampled

    # Exibir o DataFrame balanceado e a distribuição de classes
    print(df_resampled[undersample_col].value_counts())
    
    return df_resampled

In [5]:
train.columns

Index(['P0', 'P1_a', 'P1_a_1', 'P1_b', 'P1_c', 'P1_d', 'P1_e', 'P1_e_1',
       'P1_e_2', 'P1_e_3',
       ...
       'P6_g_Presto(OneHot)', 'P6_g_Snowflake(OneHot)',
       'P6_g_Teradata(OneHot)', 'P6_g_nan(OneHot)', 'P1_l(OrdEnc)',
       'P2_e(OrdEnc)', 'P2_g(OrdEnc)', 'P2_i(OrdEnc)', 'P2_j(OrdEnc)',
       'P2_h(OrdEnc)'],
      dtype='object', length=724)

In [6]:
col_raca = 'P1_c'
col_genero = 'P1_b'
col_idade ='P1_a_1'
# col_regiao = 'P1_i_2';

cols_resampling = [col_raca, col_genero, col_idade ] # , col_regiao]

df_resampled = train.copy()

for col in cols_resampling:
    df_resampled = apply_random_oversampling(df_resampled, oversample_col=col)

  df_resampled[oversample_col] = y_resampled


Branca                  2452
Preta                   2452
Parda                   2452
Amarela                 2452
Outra                   2452
Prefiro não informar    2452
Indígena                2452
Name: P1_c, dtype: int64


  df_resampled[oversample_col] = y_resampled


Masculino               12709
Feminino                12709
Prefiro não informar    12709
Outro                   12709
Name: P1_b, dtype: int64
25-29    15468
30-34    15468
40-44    15468
17-21    15468
35-39    15468
22-24    15468
45-49    15468
50-54    15468
55+      15468
Name: P1_a_1, dtype: int64


  df_resampled[oversample_col] = y_resampled


In [7]:
onehot = ["P1_m", "P3_c","P4_a","P4_b", "P4_c", "P4_d", "P4_e", "P4_g", "P4_j", "P5_b", "P6_a", "P6_b", "P6_g", "P6_h", "P7_a", "P7_b", "P7_d", "P8_a", "P8_b", "P8_c", "P8_d", "P2_b", "P2_d", "P2_f", "P2_o", "P2_r", "P6_c", "P6_d", "P6_e", "P6_f"]
ordered = ["P1_l", "P2_e", "P2_g", "P2_i", "P2_j", "P2_h"]
sensible = ['P1_c', 'P1_b','P1_a_1','P1_i_2']

# df_feat_selected = df_resampled[ [x +endswith("(OneHot)") for x in onehot ] + [x +endswith("(OrdEnc)") for x in ordered] ]


In [9]:
onehot_cols = [col for col in df_resampled.columns if any(col.startswith(c) for c in onehot) and col.endswith('(OneHot)')]
ordered_cols = [col for col in df_resampled.columns if any(col.startswith(c) for c in ordered) and col.endswith('(OrdEnc)')]
df_feat_selected = df_resampled[onehot_cols+ordered_cols+sensible]

In [10]:
for sens_col in sensible:
    df_feat_selected = pd.concat([df_feat_selected, pd.get_dummies(df_feat_selected[sens_col])], axis=1)

df_feat_selected.columns

Index(['P1_m_Ciências Biológicas/ Farmácia/ Medicina/ Área da Saúde(OneHot)',
       'P1_m_Ciências Sociais(OneHot)',
       'P1_m_Computação / Engenharia de Software / Sistemas de Informação/ TI(OneHot)',
       'P1_m_Economia/ Administração / Contabilidade / Finanças/ Negócios(OneHot)',
       'P1_m_Estatística/ Matemática / Matemática Computacional/ Ciências Atuariais(OneHot)',
       'P1_m_Marketing / Publicidade / Comunicação / Jornalismo(OneHot)',
       'P1_m_Outra opção(OneHot)', 'P1_m_Outras Engenharias(OneHot)',
       'P1_m_Química / Física(OneHot)', 'P1_m_nan(OneHot)',
       ...
       '35-39', '40-44', '45-49', '50-54', '55+', 'Centro-oeste', 'Nordeste',
       'Norte', 'Sudeste', 'Sul'],
      dtype='object', length=122)

In [20]:
test_feat_selected = test[onehot_cols+ordered_cols+sensible]

for sens_col in sensible:
    test_feat_selected = pd.concat([test_feat_selected, pd.get_dummies(test_feat_selected[sens_col])], axis=1)


In [22]:
X_train = df_feat_selected.drop(columns=['P2_h(OrdEnc)'])
y_train = df_feat_selected['P2_h(OrdEnc)']

X_test = test_feat_selected.drop(columns=['P2_h(OrdEnc)'])
y_test = test_feat_selected['P2_h(OrdEnc)']

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import joblib

In [14]:
# Logistic Regression
logistic_regression = LogisticRegression(max_iter=1000)
logistic_regression.fit(X_train.drop(columns=sensible), y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=1000)

In [15]:
# KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train.drop(columns=sensible), y_train)

KNeighborsClassifier()

In [16]:
# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train.drop(columns=sensible), y_train)

DecisionTreeClassifier()

In [17]:
# Fit the QDA model with the balanced dataset
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train.drop(columns=sensible), y_train)



QuadraticDiscriminantAnalysis()

In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [24]:
set(X_train.columns)-set(X_test.columns)

{'Indígena'}

In [25]:
# Teste
# gambiarra porque no teste não tinha nenhuma classe indígena, então a coluna não foi criada no get dummies
X_test['Indígena'] = 0

#
models = {
    'QDA': qda,
    'Logistic Regression': logistic_regression,
    'KNN': knn,
    'Decision Tree': decision_tree
}

results = []
for model_name, model in models.items():
    y_pred = model.predict(X_test.drop(columns=sensible))
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average=None, zero_division=0)
    recall = recall_score(y_test, y_pred, average=None, zero_division=0)
    f1 = f1_score(y_test, y_pred, average=None, zero_division=0)
    results.append([model_name, accuracy, precision, recall, f1])

# Store the results in a DataFrame
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])

# Print the results
results_df

Feature names must be in the same order as they were in fit.

Feature names must be in the same order as they were in fit.

Feature names must be in the same order as they were in fit.

Feature names must be in the same order as they were in fit.



Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,QDA,0.157729,"[0.0, 0.0, 0.06666666666666667, 0.082872928176...","[0.0, 0.0, 0.12698412698412698, 0.227272727272...","[0.0, 0.0, 0.08743169398907104, 0.121457489878..."
1,Logistic Regression,0.249211,"[0.0, 0.2191780821917808, 0.16666666666666666,...","[0.0, 0.41025641025641024, 0.19047619047619047...","[0.0, 0.2857142857142857, 0.17777777777777778,..."
2,KNN,0.242902,"[0.08333333333333333, 0.1206896551724138, 0.17...","[0.1111111111111111, 0.1794871794871795, 0.190...","[0.09523809523809525, 0.1443298969072165, 0.18..."
3,Decision Tree,0.222923,"[0.6666666666666666, 0.2727272727272727, 0.180...","[0.2222222222222222, 0.3076923076923077, 0.206...","[0.3333333333333333, 0.2891566265060241, 0.192..."


In [27]:
# Teste POR CLASSE SENSÍVEL
model =  logistic_regression

for sens_col in sensible:
    print(f"COLUNA {sens_col}")
    for c in X_test[sens_col].unique():
        y_pred = model.predict(X_test[X_test[sens_col]==c].drop(columns=sensible))
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
        print("Classe {c}: acc={accuraccy} / f1={f1}")


COLUNA P1_c


Feature names must be in the same order as they were in fit.



ValueError: Found input variables with inconsistent numbers of samples: [951, 222]