In [1]:
import pandas as pd
import re
from imblearn.over_sampling import ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler



In [2]:
# Import data
train = pd.read_csv("../data/train_complete.csv")
test = pd.read_csv("../data/test_complete.csv")

## Resampling

In [3]:
# Dataprep - toda transformação deve ser criada sobre o teste e somente aplicada sobre o treino
def apply_random_oversampling(df, oversample_col='', exclude_category=None):
    '''
    df: dataframe with all columns
    oversample_col: column to apply the oversampling
    exclude_category: list of categories on oversample_col to set apart while oversampling
    '''
    df_cat = pd.DataFrame(columns = df.columns)
    df_ = df.copy()
    
    # Separando categorias específica se necessário
    if exclude_category is not None:
        
        for cat in exclude_category:
            
            df_cat = pd.concat([df_cat, df[df[oversample_col]==cat]]).reset_index(drop=True)
            
            # removendo categorias que não serão oversampled
            df_ = df_[df[oversample_col]!=cat].reset_index()
    
    # Separando coluna de oversampling
    X = df_.drop(columns=[oversample_col])
    y = df_[oversample_col]

    # Aplicando Random Oversampling
    oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)

    df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    df_resampled[oversample_col] = y_resampled

    # Exibir o DataFrame balanceado
    print(df_resampled[oversample_col].value_counts())
    
    # inserindo de volta as categorias que foram separadas
    if not df_cat.empty:
        df_resampled = pd.concat([df_resampled, df_cat]).reset_index(drop=True)
        
    return df_resampled

In [4]:
def apply_random_undersampling(df, undersample_col='', strategy={}):
    '''
    df: dataframe with all columns
    undersample_col: column to apply the undersampling
    srtategy: dictionary with the category as key and number of samples requested as value
    '''

    X = df.drop(columns=[undersample_col])
    y = df[undersample_col]

    # aplicando undersampling
    undersampler = RandomUnderSampler(sampling_strategy=strategy, random_state=42)
    X_resampled, y_resampled = undersampler.fit_resample(X, y)

    df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    df_resampled[undersample_col] = y_resampled

    # Exibir o DataFrame balanceado e a distribuição de classes
    print(df_resampled[undersample_col].value_counts())
    
    return df_resampled

In [5]:
train.columns

Index(['P0', 'P1_a', 'P1_a_1', 'P1_b', 'P1_c', 'P1_d', 'P1_e', 'P1_e_1',
       'P1_e_2', 'P1_e_3',
       ...
       'P6_g_Presto(OneHot)', 'P6_g_Snowflake(OneHot)',
       'P6_g_Teradata(OneHot)', 'P6_g_nan(OneHot)', 'P1_l(OrdEnc)',
       'P2_e(OrdEnc)', 'P2_g(OrdEnc)', 'P2_i(OrdEnc)', 'P2_j(OrdEnc)',
       'P2_h(OrdEnc)'],
      dtype='object', length=724)

In [6]:
col_raca = 'P1_c'
col_genero = 'P1_b'
col_idade ='P1_a_1'
# col_regiao = 'P1_i_2'

cols_resampling = [col_raca, col_genero, col_idade ] # , col_regiao]

df_resampled = train.copy()

for col in cols_resampling:
    df_resampled = apply_random_oversampling(df_resampled, oversample_col=col)

  df_resampled[oversample_col] = y_resampled


Branca                  2452
Preta                   2452
Parda                   2452
Amarela                 2452
Outra                   2452
Prefiro não informar    2452
Indígena                2452
Name: P1_c, dtype: int64


  df_resampled[oversample_col] = y_resampled


Masculino               12709
Feminino                12709
Prefiro não informar    12709
Outro                   12709
Name: P1_b, dtype: int64
25-29    15468
30-34    15468
40-44    15468
17-21    15468
35-39    15468
22-24    15468
45-49    15468
50-54    15468
55+      15468
Name: P1_a_1, dtype: int64


  df_resampled[oversample_col] = y_resampled


In [7]:
print(df_resampled.shape)
print(df_resampled.info())

(139212, 724)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139212 entries, 0 to 139211
Columns: 724 entries, P0 to P1_a_1
dtypes: bool(638), int64(7), object(79)
memory usage: 176.0+ MB
None


In [8]:
# !pip install seaborn

In [9]:
onehot = ["P1_m", "P3_c","P4_a","P4_b", "P4_c", "P4_d", "P4_e", "P4_g", "P4_j", "P5_b", "P6_a", "P6_b", "P6_g", "P6_h", "P7_a", "P7_b", "P7_d", "P8_a", "P8_b", "P8_c", "P8_d", "P2_b", "P2_d", "P2_f", "P2_o", "P2_r"]
ordered = ["P1_l", "P2_e", "P2_g", "P2_i", "P2_j", "P2_h"]

# df_feat_selected = df_resampled[ [x +endswith("(OneHot)") for x in onehot ] + [x +endswith("(OrdEnc)") for x in ordered] ]


In [10]:
import re

# Define the patterns for filtering the columns
onehot_pattern = f"^({'|'.join(onehot)}).*\\(OneHot\\)$"
ordered_pattern = f"^({'|'.join(ordered)}).*\\(OrdEnc\\)$"

# Apply the filtering to select the matching columns
df_feat_selected = df_resampled[
    df_resampled.columns[df_resampled.columns.str.contains(onehot_pattern) | 
                         df_resampled.columns.str.contains(ordered_pattern)]
]

  df_resampled.columns[df_resampled.columns.str.contains(onehot_pattern) |
  df_resampled.columns.str.contains(ordered_pattern)]


In [11]:
df_feat_selected.head()

Unnamed: 0,P1_m_Ciências Biológicas/ Farmácia/ Medicina/ Área da Saúde(OneHot),P1_m_Ciências Sociais(OneHot),P1_m_Computação / Engenharia de Software / Sistemas de Informação/ TI(OneHot),P1_m_Economia/ Administração / Contabilidade / Finanças/ Negócios(OneHot),P1_m_Estatística/ Matemática / Matemática Computacional/ Ciências Atuariais(OneHot),P1_m_Marketing / Publicidade / Comunicação / Jornalismo(OneHot),P1_m_Outra opção(OneHot),P1_m_Outras Engenharias(OneHot),P1_m_Química / Física(OneHot),P1_m_nan(OneHot),...,P6_g_Presto(OneHot),P6_g_Snowflake(OneHot),P6_g_Teradata(OneHot),P6_g_nan(OneHot),P1_l(OrdEnc),P2_e(OrdEnc),P2_g(OrdEnc),P2_i(OrdEnc),P2_j(OrdEnc),P2_h(OrdEnc)
0,False,False,True,False,False,False,False,False,False,False,...,False,False,False,True,5,4,-1,6,2,10
1,False,False,True,False,False,False,False,False,False,False,...,False,False,False,True,4,0,-1,6,6,9
2,False,False,True,False,False,False,False,False,False,False,...,False,False,False,True,6,2,-1,4,1,9
3,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,1,-1,1,2,1,5
4,False,False,False,True,False,False,False,False,False,False,...,False,False,False,True,4,-1,1,2,0,3


In [12]:
# count how many -1 there are in the column P2_h(OrdEnc)
print(df_feat_selected['P2_h(OrdEnc)'].value_counts())

7     25041
6     22156
4     18550
3     12444
5     11111
1     10586
9      8903
11     8127
2      8126
12     5912
8      5767
10     2269
0       220
Name: P2_h(OrdEnc), dtype: int64


In [13]:
test_feat_selected = test[
    test.columns[test.columns.str.contains(onehot_pattern) | 
                 test.columns.str.contains(ordered_pattern)]]

  test.columns[test.columns.str.contains(onehot_pattern) |
  test.columns.str.contains(ordered_pattern)]]


In [14]:
df_feat_selected.head()

Unnamed: 0,P1_m_Ciências Biológicas/ Farmácia/ Medicina/ Área da Saúde(OneHot),P1_m_Ciências Sociais(OneHot),P1_m_Computação / Engenharia de Software / Sistemas de Informação/ TI(OneHot),P1_m_Economia/ Administração / Contabilidade / Finanças/ Negócios(OneHot),P1_m_Estatística/ Matemática / Matemática Computacional/ Ciências Atuariais(OneHot),P1_m_Marketing / Publicidade / Comunicação / Jornalismo(OneHot),P1_m_Outra opção(OneHot),P1_m_Outras Engenharias(OneHot),P1_m_Química / Física(OneHot),P1_m_nan(OneHot),...,P6_g_Presto(OneHot),P6_g_Snowflake(OneHot),P6_g_Teradata(OneHot),P6_g_nan(OneHot),P1_l(OrdEnc),P2_e(OrdEnc),P2_g(OrdEnc),P2_i(OrdEnc),P2_j(OrdEnc),P2_h(OrdEnc)
0,False,False,True,False,False,False,False,False,False,False,...,False,False,False,True,5,4,-1,6,2,10
1,False,False,True,False,False,False,False,False,False,False,...,False,False,False,True,4,0,-1,6,6,9
2,False,False,True,False,False,False,False,False,False,False,...,False,False,False,True,6,2,-1,4,1,9
3,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,1,-1,1,2,1,5
4,False,False,False,True,False,False,False,False,False,False,...,False,False,False,True,4,-1,1,2,0,3


In [15]:
X_train = df_feat_selected.drop(columns=['P2_h(OrdEnc)'])
y_train = df_feat_selected['P2_h(OrdEnc)']

X_test = test_feat_selected.drop(columns=['P2_h(OrdEnc)'])
y_test = test_feat_selected['P2_h(OrdEnc)']

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import joblib

In [17]:
# Logistic Regression
logistic_regression = LogisticRegression(max_iter=1000)
logistic_regression.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=1000)

In [18]:
# KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [19]:
# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)

DecisionTreeClassifier()

In [20]:
# Fit the QDA model with the balanced dataset
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)



QuadraticDiscriminantAnalysis()

In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [23]:
models = {
    'QDA': qda,
    'Logistic Regression': logistic_regression,
    'KNN': knn,
    'Decision Tree': decision_tree
}

results = []
for model_name, model in models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average=None, zero_division=0)
    recall = recall_score(y_test, y_pred, average=None, zero_division=0)
    f1 = f1_score(y_test, y_pred, average=None, zero_division=0)
    results.append([model_name, accuracy, precision, recall, f1])

# Store the results in a DataFrame
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])

# Print the results
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,QDA,0.094637,"[0.0, 0.0, 0.16176470588235295, 0.129629629629...","[0.0, 0.0, 0.3492063492063492, 0.4242424242424...","[0.0, 0.0, 0.22110552763819097, 0.198581560283..."
1,Logistic Regression,0.233438,"[0.0, 0.3170731707317073, 0.21794871794871795,...","[0.0, 0.3333333333333333, 0.2698412698412698, ...","[0.0, 0.32499999999999996, 0.24113475177304963..."
2,KNN,0.242902,"[0.09090909090909091, 0.2127659574468085, 0.17...","[0.1111111111111111, 0.2564102564102564, 0.158...","[0.09999999999999999, 0.23255813953488375, 0.1..."
3,Decision Tree,0.24816,"[0.0, 0.2608695652173913, 0.2692307692307692, ...","[0.0, 0.3076923076923077, 0.2222222222222222, ...","[0.0, 0.2823529411764706, 0.2434782608695652, ..."
