In [23]:
import pandas as pd
import re
from imblearn.over_sampling import ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [24]:
# Import data
train = pd.read_csv("../data/train_complete.csv")
test = pd.read_csv("../data/test_complete.csv")

## Resampling

In [25]:
# Dataprep - toda transformação deve ser criada sobre o teste e somente aplicada sobre o treino
def apply_random_oversampling(df, oversample_col='', exclude_category=None):
    '''
    df: dataframe with all columns
    oversample_col: column to apply the oversampling
    exclude_category: list of categories on oversample_col to set apart while oversampling
    '''
    df_cat = pd.DataFrame(columns = df.columns)
    df_ = df.copy()
    
    # Separando categorias específica se necessário
    if exclude_category is not None:
        
        for cat in exclude_category:
            
            df_cat = pd.concat([df_cat, df[df[oversample_col]==cat]]).reset_index(drop=True)
            
            # removendo categorias que não serão oversampled
            df_ = df_[df[oversample_col]!=cat].reset_index()
    
    # Separando coluna de oversampling
    X = df_.drop(columns=[oversample_col])
    y = df_[oversample_col]

    # Aplicando Random Oversampling
    oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)

    df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    df_resampled[oversample_col] = y_resampled

    # Exibir o DataFrame balanceado
    print(df_resampled[oversample_col].value_counts())
    
    # inserindo de volta as categorias que foram separadas
    if not df_cat.empty:
        df_resampled = pd.concat([df_resampled, df_cat]).reset_index(drop=True)
        
    return df_resampled

In [26]:
def apply_random_undersampling(df, undersample_col='', strategy={}):
    '''
    df: dataframe with all columns
    undersample_col: column to apply the undersampling
    srtategy: dictionary with the category as key and number of samples requested as value
    '''

    X = df.drop(columns=[undersample_col])
    y = df[undersample_col]

    # aplicando undersampling
    undersampler = RandomUnderSampler(sampling_strategy=strategy, random_state=42)
    X_resampled, y_resampled = undersampler.fit_resample(X, y)

    df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    df_resampled[undersample_col] = y_resampled

    # Exibir o DataFrame balanceado e a distribuição de classes
    print(df_resampled[undersample_col].value_counts())
    
    return df_resampled

In [27]:
# col_raca = 'P1_c'
# col_genero = 'P1_b'
# col_idade ='P1_a_1'
# # col_regiao = 'P1_i_2'

# cols_resampling = [col_raca, col_genero, col_idade ] # , col_regiao]

# df_resampled = train.copy()

# for col in cols_resampling:
#     df_resampled = apply_random_oversampling(df_resampled, oversample_col=col)

## Teste pulando oversampling

In [28]:
df_resampled = train.copy()

In [29]:
# !pip install seaborn

In [30]:
onehot = ["P1_m", "P3_c","P4_a","P4_b", "P4_c", "P4_d", "P4_e", "P4_g", "P4_j", "P5_b", "P6_a", "P6_b", "P6_g", "P6_h", "P7_a", "P7_b", "P7_d", "P8_a", "P8_b", "P8_c", "P8_d", "P2_b", "P2_d", "P2_f", "P2_o", "P2_r", "P6_c", "P6_d", "P6_e", "P6_f"]
ordered = ["P1_l", "P2_e", "P2_g", "P2_i", "P2_j", "P2_h"]


# df_feat_selected = df_resampled[ [x +endswith("(OneHot)") for x in onehot ] + [x +endswith("(OrdEnc)") for x in ordered] ]


In [31]:

onehot_cols = [col for col in df_resampled.columns if any(col.startswith(c) for c in onehot) and (col.endswith('(OneHot)') or col.endswith('(FromList)'))]
ordered_cols = [col for col in df_resampled.columns if any(col.startswith(c) for c in ordered) and col.endswith('(OrdEnc)')]
df_feat_selected = df_resampled[onehot_cols+ordered_cols]

In [32]:
df_feat_selected.head()

Unnamed: 0,P2_o_1(FromList),P2_o_2(FromList),P2_o_3(FromList),P2_o_4(FromList),P2_o_5(FromList),P2_o_6(FromList),P2_o_7(FromList),P2_o_8(FromList),P2_o_9(FromList),P2_o_10(FromList),...,P6_g_Postgres/MySQL(OneHot),P6_g_Presto(OneHot),P6_g_Snowflake(OneHot),P6_g_Teradata(OneHot),P1_l(OrdEnc),P2_e(OrdEnc),P2_g(OrdEnc),P2_i(OrdEnc),P2_j(OrdEnc),P2_h(OrdEnc)
0,True,False,True,False,False,True,False,False,False,False,...,False,False,False,False,5,5,0,6,2,10
1,False,False,False,True,False,True,False,False,False,True,...,False,False,False,False,4,1,0,6,6,9
2,True,True,False,True,False,False,False,False,False,False,...,False,False,False,False,6,3,0,4,1,9
3,True,False,False,True,False,False,True,False,False,False,...,False,False,False,False,1,0,2,2,1,5
4,True,False,False,True,False,True,False,False,False,False,...,False,False,False,False,4,0,2,2,0,3


In [33]:
test_feat_selected = test[onehot_cols+ordered_cols]

In [34]:
X_train = df_feat_selected.drop(columns=['P2_h(OrdEnc)'])
y_train = df_feat_selected['P2_h(OrdEnc)']

X_test = test_feat_selected.drop(columns=['P2_h(OrdEnc)'])
y_test = test_feat_selected['P2_h(OrdEnc)']

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

In [36]:
# Logistic Regression
params_log_reg={
    'solver': ['lbfgs', 'saga', 'newton-cg', 'liblinear'],
    'C': [0.01, 0.1, 1.0, 10.0],
    'multi_class': ['multinomial', 'ovr'],
    'max_iter': [100, 200, 500],
    'penalty': ['l2'],  # 'l1' pode ser testado com 'liblinear' ou 'saga'
    'tol': [1e-4, 1e-3, 1e-2]
}

log_reg = LogisticRegression()

# Configurar RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=log_reg,
    param_distributions=params_log_reg,
    n_iter=10,  # Número de combinações a testar
    scoring='accuracy',  # Métrica de avaliação
    cv=5,  # Número de folds da validação cruzada
    random_state=42,
    n_jobs=-1  # Usar todos os núcleos disponíveis
)

random_search.fit(X_train, y_train)

print("Melhores hiperparâmetros encontrados:")
print(random_search.best_params_)

print("\nAcurácia obtida:")
print(random_search.best_score_)

5 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Software\Anaconda\envs\mo436b\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Software\Anaconda\envs\mo436b\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Software\Anaconda\envs\mo436b\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1267, in fit
    multi_class = _check_multi_class(multi_class, solver, len(self.classes_))
                  ^^

Melhores hiperparâmetros encontrados:
{'tol': 0.0001, 'solver': 'newton-cg', 'penalty': 'l2', 'multi_class': 'ovr', 'max_iter': 200, 'C': 0.01}

Acurácia obtida:
0.31596963828757174


In [37]:
# lr_params = {'tol': 0.0001, 'solver': 'saga', 'penalty': 'l2', 'multi_class': 'multinomial', 'max_iter': 500, 'C': 1.0}
log_reg = LogisticRegression(tol= 0.0001, solver='saga', penalty='l2', multi_class='multinomial', max_iter=500, C=1.0)
log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average=None, zero_division=0)
recall = recall_score(y_test, y_pred, average=None, zero_division=0)
f1 = f1_score(y_test, y_pred, average=None, zero_division=0)
print(f'Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1-Score:{f1}')



Accuracy: 0.31230283911671924
Precision: [0.         0.33333333 0.26785714 0.21126761 0.36746988 0.24489796
 0.40909091 0.33093525 0.19298246 0.26315789 0.05882353 0.
 0.        ]
Recall: [0.         0.38461538 0.23809524 0.22727273 0.41216216 0.21621622
 0.45205479 0.36507937 0.15068493 0.23809524 0.03703704 0.
 0.        ]
F1-Score:[0.         0.35714286 0.25210084 0.2189781  0.38853503 0.22966507
 0.42950108 0.34716981 0.16923077 0.25       0.04545455 0.
 0.        ]




In [38]:
# KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

In [97]:
# Decision Tree
decision_tree = DecisionTreeClassifier(
    max_depth=9,               # Limit the depth of the tree
    min_samples_split=10,      # Minimum samples required to split a node
    min_samples_leaf=5,        # Minimum samples required at a leaf node
    max_features='sqrt',       # Number of features to consider for the best split
    max_leaf_nodes=20          # Maximum number of leaf nodes
)
decision_tree.fit(X_train, y_train)

In [40]:
# Fit the QDA model with the balanced dataset
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)



In [41]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [98]:
models = {
    'QDA': qda,
    'Logistic Regression': log_reg,
    'KNN': knn,
    'Decision Tree': decision_tree
}

results = []
for model_name, model in models.items():
    y_pred = model.predict(X_train)
    accuracy = accuracy_score(y_train, y_pred)
    precision = precision_score(y_train, y_pred, average=None, zero_division=0)
    recall = recall_score(y_train, y_pred, average=None, zero_division=0)
    f1 = f1_score(y_train, y_pred, average=None, zero_division=0)
    results.append([model_name, accuracy, precision, recall, f1])

# Store the results in a DataFrame
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])

# Print the results
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,QDA,0.379111,"[0.0, 1.0, 1.0, 0.1939655172413793, 0.55752212...","[0.0, 0.011363636363636364, 0.0488888888888888...","[0.0, 0.02247191011235955, 0.09322033898305085..."
1,Logistic Regression,0.514338,"[1.0, 0.6, 0.5303867403314917, 0.5323383084577...","[0.42857142857142855, 0.6647727272727273, 0.42...","[0.6, 0.6307277628032345, 0.4729064039408867, ..."
2,KNN,0.495659,"[0.125, 0.47346938775510206, 0.394636015325670...","[0.19047619047619047, 0.6590909090909091, 0.45...","[0.1509433962264151, 0.5510688836104513, 0.423..."
3,Decision Tree,0.299395,"[0.0, 0.0, 0.0, 0.0, 0.2834314550042052, 0.446...","[0.0, 0.0, 0.0, 0.0, 0.5644891122278057, 0.039...","[0.0, 0.0, 0.0, 0.0, 0.37737961926091823, 0.07..."


In [51]:
models = {
    'QDA': qda,
    'Logistic Regression': log_reg,
    'KNN': knn,
    'Decision Tree': decision_tree
}

results = []
for model_name, model in models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average=None, zero_division=0)
    recall = recall_score(y_test, y_pred, average=None, zero_division=0)
    f1 = f1_score(y_test, y_pred, average=None, zero_division=0)
    results.append([model_name, accuracy, precision, recall, f1])

# Store the results in a DataFrame
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])

# Print the results
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,QDA,0.175605,"[0.0, 0.0, 0.0, 0.12188365650969529, 0.2660550...","[0.0, 0.0, 0.0, 0.6666666666666666, 0.19594594...","[0.0, 0.0, 0.0, 0.20608899297423888, 0.2256809..."
1,Logistic Regression,0.312303,"[0.0, 0.3333333333333333, 0.26785714285714285,...","[0.0, 0.38461538461538464, 0.23809523809523808...","[0.0, 0.35714285714285715, 0.25210084033613445..."
2,KNN,0.276551,"[0.3333333333333333, 0.16666666666666666, 0.24...","[0.1111111111111111, 0.3333333333333333, 0.285...","[0.16666666666666666, 0.2222222222222222, 0.26..."
3,Decision Tree,0.257624,"[0.0, 0.0, 0.0, 0.0, 0.22254335260115607, 0.19...","[0.0, 0.0, 0.0, 0.0, 0.5202702702702703, 0.045...","[0.0, 0.0, 0.0, 0.0, 0.3117408906882591, 0.072..."


In [44]:
print(X_test.shape)

(951, 282)
