## Pré-Processamento

In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import numpy as np

In [2]:
raw_data = pd.read_csv(r'..\data\dataset.csv')

In [3]:
colunas_drop = [coluna for coluna in raw_data.columns if 'Curricular units' in coluna]
colunas_drop

['Curricular units 1st sem (credited)',
 'Curricular units 1st sem (enrolled)',
 'Curricular units 1st sem (evaluations)',
 'Curricular units 1st sem (approved)',
 'Curricular units 1st sem (grade)',
 'Curricular units 1st sem (without evaluations)',
 'Curricular units 2nd sem (credited)',
 'Curricular units 2nd sem (enrolled)',
 'Curricular units 2nd sem (evaluations)',
 'Curricular units 2nd sem (approved)',
 'Curricular units 2nd sem (grade)',
 'Curricular units 2nd sem (without evaluations)']

In [4]:
colunas_categoricas = ['Marital status', 'Application mode', 'Course', 'Daytime/evening attendance',
                        'Previous qualification', 'Nacionality', "Mother's qualification",
                        "Father's qualification", "Mother's occupation", "Father's occupation",
                        'Displaced', 'Educational special needs', 'Debtor', 'Tuition fees up to date',
                        'Gender', 'Scholarship holder', 'International']

colunas_numericas = ['Age at enrollment', 'Application order']

In [5]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()) 
])

In [6]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Imputing missing values with the most frequent category
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encoding
])

In [7]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, colunas_numericas),
        ('cat', categorical_transformer, colunas_categoricas)
    ])


In [8]:
dados_refinados = raw_data.drop(columns=colunas_drop)

In [9]:
X = dados_refinados.drop('Target', axis=1)
y = dados_refinados['Target']

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

X_train_preparado = preprocessor.fit_transform(X_train)
X_test_preparado = preprocessor.transform(X_test)

In [11]:
print(X_train_preparado.shape, X_test_preparado.shape)

(3539, 229) (885, 229)


## Modelos Supervisionados

### SVM

In [12]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

svm_parametros_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear']
}

svm_grid_search = GridSearchCV(SVC(random_state=0), svm_parametros_grid, cv=3, scoring='accuracy')
svm_grid_search.fit(X_train_preparado, y_train)

svm_params = svm_grid_search.best_params_
svm_score = svm_grid_search.best_score_

In [14]:
from sklearn.metrics import accuracy_score

svm_model = SVC(C=svm_params['C'], kernel=svm_params['kernel'])

svm_model.fit(X_train_preparado, y_train)

svm_predictions = svm_model.predict(X_test_preparado)

svm_test_accuracy  = accuracy_score(y_test, svm_predictions)

print(f"Acurácia para a SVM: {svm_test_accuracy:.2f}")

Acurácia para a SVM: 0.65


### Random Forest (Ensemble)

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf_parametros_grid = {
    'n_estimators': [100,200,300],
    'max_features': ['auto', 'sqrt']
}

rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=0), rf_parametros_grid, cv=3, scoring='accuracy')
rf_grid_search.fit(X_train_preparado, y_train)

rf_params = rf_grid_search.best_params_
rf_score = rf_grid_search.best_score_

9 fits failed out of a total of 18.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\matme\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\matme\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\matme\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\matme\AppData\Local\Programs\Python\Python311\Lib\si

In [22]:
print(f"Melhores parâmetros para RF: {rf_params}")

Melhores parâmetros para RF: {'max_features': 'sqrt', 'n_estimators': 300}


In [23]:
rf_model = RandomForestClassifier(
    n_estimators=rf_params['n_estimators'],
    max_features=rf_params['max_features'],
    random_state=0
)

rf_model.fit(X_train_preparado, y_train)

rf_predictions = rf_model.predict(X_test_preparado)

rf_test_accuracy = accuracy_score(y_test, rf_predictions)

print(f"Acurácia para a RF: {rf_test_accuracy:.2f}")

Acurácia para a RF: 0.64
