In [110]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import joblib

import warnings
warnings.filterwarnings("ignore")

In [58]:
df = pd.read_csv('https://raw.githubusercontent.com/Rogeriom49/datathon_fiap/refs/heads/main/datasets/alunos.csv')

In [59]:
df.head()

Unnamed: 0,NOME,ANO,PONTO_VIRADA,FASE,TURMA,PEDRA,INDE,IAA,IEG,IPS,IDA,IPP,IPV,IAN,IDADE
0,ALUNO-1,2020,Não,2.0,H,Ametista,7.88,8.5,8.7,7.5,7.0,5.94,7.75,10.0,11.0
1,ALUNO-3,2020,Não,3.0,H,Ametista,7.86,7.92,8.9,7.5,5.5,8.12,8.11,10.0,12.0
2,ALUNO-4,2020,Não,1.0,D,Quartzo,5.08,8.0,4.1,6.88,0.0,7.19,7.75,5.0,10.0
3,ALUNO-5,2020,Não,2.0,M,Ametista,8.08,7.5,8.0,7.5,7.5,8.44,8.17,10.0,10.0
4,ALUNO-8,2020,Sim,4.0,L,Ametista,8.38,8.33,9.9,4.38,7.33,8.75,8.94,10.0,14.0


In [60]:
df_model = df.drop(columns=['NOME','PONTO_VIRADA','FASE','TURMA'])

In [61]:
# 0 = Em risco; 1 = Não está em risco
df_model['RISCO'] = df_model['PEDRA'].apply(lambda x: 1 if x != 'Quartzo' else 0).drop(columns=['PEDRA'])

In [62]:
df_model_new = df_model.fillna(0)

In [63]:
df_model_new

Unnamed: 0,ANO,PEDRA,INDE,IAA,IEG,IPS,IDA,IPP,IPV,IAN,IDADE,RISCO
0,2020,Ametista,7.88,8.50,8.70,7.50,7.00,5.94,7.75,10.0,11.0,1
1,2020,Ametista,7.86,7.92,8.90,7.50,5.50,8.12,8.11,10.0,12.0,1
2,2020,Quartzo,5.08,8.00,4.10,6.88,0.00,7.19,7.75,5.0,10.0,0
3,2020,Ametista,8.08,7.50,8.00,7.50,7.50,8.44,8.17,10.0,10.0,1
4,2020,Ametista,8.38,8.33,9.90,4.38,7.33,8.75,8.94,10.0,14.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2271,2022,Topázio,8.15,7.92,9.67,7.50,7.75,6.25,7.50,10.0,0.0,1
2272,2022,Quartzo,5.67,7.42,6.47,6.25,2.00,6.25,7.44,5.0,11.0,0
2273,2022,Topázio,8.14,9.00,9.03,7.50,9.00,5.62,9.08,5.0,0.0,1
2274,2022,Ágata,6.75,8.33,9.18,5.62,5.90,6.72,5.83,5.0,13.0,1


Ordem dos dados que devem ser passado na previsão
ANO
IAA
IEG
IPS
IDA
IPP
IPV
IAN

In [97]:
df_model_new['RISCO'].value_counts()

RISCO
1    1904
0     372
Name: count, dtype: int64

In [127]:
X = df_model_new.drop(columns=['RISCO', 'PEDRA','IDADE','INDE'])
y = df_model_new['RISCO']

In [128]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [129]:
smote = SMOTE(random_state=42)
X_train_resempled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [130]:
models = {
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(),
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

In [131]:
results = {}
for name, model in models.items():
    # Validação cruzada para avaliar desempenho
    cv_scores = cross_val_score(model, X_train_resempled, y_train_resampled, cv=5)
    model.fit(X_train_resempled, y_train_resampled)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1_Score = f1_score(y_test, y_pred)
    precision_Score = precision_score(y_test, y_pred)
    recall_Score = recall_score(y_test, y_pred)
    results[name] = {
        "Cross-Validation Score (Mean)": np.mean(cv_scores),
        "Acurácia": round(accuracy * 100,2),
        "f1 score": f1_Score,
        "precision score": precision_Score,
        "recall score" : recall_Score
    }

In [132]:
# Gradient Boosting
bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
bst.fit(X_train_resempled, y_train_resampled)
preds = bst.predict(X_test)
accuracy_bst = accuracy_score(y_test, preds)
f1_Score_bst = f1_score(y_test, preds)
precision_Score_bst = precision_score(y_test, y_pred)
recall_Score_bst = recall_score(y_test, y_pred)

In [133]:
results['Gradient Boosting'] = {
    "Cross-Validation Score (Mean)": 0,
    "Acurácia": round(accuracy_bst * 100,2),
    "f1 score": f1_Score_bst,
    "precision score": precision_Score_bst,
    "recall score" : recall_Score_bst    
}

In [134]:
results_df = pd.DataFrame(results).T
results_df

Unnamed: 0,Cross-Validation Score (Mean),Acurácia,f1 score,precision score,recall score
Random Forest,0.981946,96.49,0.978947,0.98153,0.976378
Support Vector Machine,0.580327,95.39,0.972692,0.963918,0.981627
Logistic Regression,0.966187,96.49,0.978836,0.986667,0.971129
Decision Tree,0.961271,95.39,0.972549,0.96875,0.976378
K-Nearest Neighbors,0.974721,94.3,0.965147,0.986301,0.944882
Gradient Boosting,0.0,90.79,0.942779,0.986301,0.944882


In [135]:
results_df.to_csv('../datasets/model_resport.csv', sep=';')

In [136]:
# Melhor modelo baseado na métrica de validação cruzada
best_model_name = results_df["f1 score"].idxmax()
best_model = models[best_model_name]
print(f"\nMelhor modelo: {best_model_name}")


Melhor modelo: Random Forest


In [137]:
filename = f"../best_model_{best_model_name.replace(' ', '_').lower()}.pkl"
joblib.dump(best_model, filename)
print(f"Melhor modelo salvo como: {filename}")

Melhor modelo salvo como: ../best_model_random_forest.pkl


In [138]:
loaded_model = joblib.load(filename)

In [148]:
dados = {'ANO':2024,
'IAA':2,
'IEG':3,
'IPS':4,
'IDA':5,
'IPP':6,
'IPV':7,
'IAN':1}

df_modelo = pd.DataFrame([dados])

In [149]:
pred = loaded_model.predict(df_modelo)
pred_proba = loaded_model.predict_proba(df_modelo)

In [151]:
display(pred[0])
display(pred_proba[0][1])

np.int64(0)

np.float64(0.16)