In [1]:
import numpy as np
import pandas as pd
import dash
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from ipywidgets import interact, FloatSlider, Dropdown
from dash import dcc, html, Input, Output
import plotly.express as px
import joblib

In [2]:
arquivo_alunos = "ID_EEEFM_TERRA_01407.txt"

In [3]:
df_alunos = pd.read_csv(
    arquivo_alunos,
    sep='|',
    header= None,
    engine='python',
    encoding='latin1'
)

df_alunos.columns = [
    "MATRICULA",
    "ESCOLA",
    "NOME",
    "DATA_NASCIMENTO",
    "GENERO",
    "COR",
    "SERIE",
    "NIVEL_ENSINO",
    "TURMA",
    "TURNO"
]

df_alunos.reindex

<bound method DataFrame.reindex of       MATRICULA                ESCOLA                                   NOME  \
0        223546  EEEFM TERRA VERMELHA                  BRENDA DE JESUS GOMES   
1        314179  EEEFM TERRA VERMELHA        BRENO CESAR TEIXEIRA DOS SANTOS   
2        233721  EEEFM TERRA VERMELHA       CELIDALVA PEREIRA DE SOUZA BRITO   
3         66590  EEEFM TERRA VERMELHA                CINTHIA DE JESUS FONTES   
4        235342  EEEFM TERRA VERMELHA                  ERICK TONINI GONORING   
...         ...                   ...                                    ...   
1333     222551  EEEFM TERRA VERMELHA  MAYRA TERESA NASCIMENTO DO PATROCINIO   
1334     286937  EEEFM TERRA VERMELHA      ESTHER MARIANO VENCESLAU DA SILVA   
1335     149713  EEEFM TERRA VERMELHA                 GABRIEL QUEIROS CABRAL   
1336     150138  EEEFM TERRA VERMELHA            CARLOS EDUARDO FELLER PAIVA   
1337     149784  EEEFM TERRA VERMELHA                      HIAGO SOUZA NEVES   

    

In [4]:
n = len(df_alunos)

In [5]:
df_alunos.head()

Unnamed: 0,MATRICULA,ESCOLA,NOME,DATA_NASCIMENTO,GENERO,COR,SERIE,NIVEL_ENSINO,TURMA,TURNO
0,223546,EEEFM TERRA VERMELHA,BRENDA DE JESUS GOMES,15/04/2006,FEMININO,PARDA,4ª ETAPA,EJA PROFISSIONAL,4ªN01-EJA-EP-AAD,NOITE
1,314179,EEEFM TERRA VERMELHA,BRENO CESAR TEIXEIRA DOS SANTOS,25/05/2001,MASCULINO,BRANCA,4ª ETAPA,EJA PROFISSIONAL,4ªN01-EJA-EP-AAD,NOITE
2,233721,EEEFM TERRA VERMELHA,CELIDALVA PEREIRA DE SOUZA BRITO,15/06/1964,FEMININO,PARDA,4ª ETAPA,EJA PROFISSIONAL,4ªN01-EJA-EP-AAD,NOITE
3,66590,EEEFM TERRA VERMELHA,CINTHIA DE JESUS FONTES,05/11/1984,FEMININO,PARDA,4ª ETAPA,EJA PROFISSIONAL,4ªN01-EJA-EP-AAD,NOITE
4,235342,EEEFM TERRA VERMELHA,ERICK TONINI GONORING,14/10/2000,MASCULINO,PARDA,4ª ETAPA,EJA PROFISSIONAL,4ªN01-EJA-EP-AAD,NOITE


In [6]:
df_frequencia = pd.read_csv('ANALISE DE FREQUENCIA - EEEFM TERRA VERMELHA - 2025.csv')

In [7]:
df_frequencia.head()

Unnamed: 0,ESCOLA,ANO DE ESCOLARIDADE,Jan,Fev,Mar,Abr,Mai,Jun,Jul,Ago,Set,Out,Nov,Dez,TOTAL
0,EEEFM TERRA VERMELHA,9º ANO,0,1120,1419,2045,1901,945,0,0,0,0,0,0,7430
1,EEEFM TERRA VERMELHA,1ª SERIE,0,1853,4005,5268,6248,3859,0,0,0,0,0,0,21233
2,EEEFM TERRA VERMELHA,2ª SERIE,0,2186,4596,5208,5760,3096,0,0,0,0,0,0,20846
3,EEEFM TERRA VERMELHA,3ª SERIE,0,511,964,1251,1846,1079,0,0,0,0,0,0,5651
4,TOTAL,,0,5670,10984,13772,15755,8979,0,0,0,0,0,0,55160


In [8]:
n = len(df_alunos)

df_alunos["frequencia"] = np.clip(np.random.normal(85, 10, n), 50, 100).round(1)
df_alunos["nota"] = np.clip(np.random.normal(7, 1.5, n), 0, 10).round(1)

In [9]:
df_alunos["evasao"] = np.where((df_alunos["frequencia"] < 70) | (df_alunos["nota"] < 5), 1, 0)

In [10]:
df_alunos.to_csv("alunos_completos.csv", index=False)
print("✔️  CSV salvo como alunos_completos.csv")

✔️  CSV salvo como alunos_completos.csv


In [11]:
X = df_alunos.drop(["evasao", "MATRICULA", "NOME", "DATA_NASCIMENTO"], axis=1)
y = df_alunos["evasao"]
X = pd.get_dummies(X, drop_first=True)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

scaler = StandardScaler()

In [13]:
numerical_cols = ['frequencia', 'nota']
for col in numerical_cols:
    if col not in X_train.columns:
        print(f"Warning: Column '{col}' not found in X_train. Skipping scaling for this column.")

X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols]  = scaler.transform(X_test[numerical_cols])


In [14]:
model = RandomForestClassifier(
    n_estimators=300, max_depth=None, class_weight="balanced", random_state=42
)
model.fit(X_train, y_train)

In [15]:
y_pred = model.predict(X_test)
print("Confusion Matrix for RandomForestClassifier:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report for RandomForestClassifier:")
print(classification_report(y_test, y_pred, digits=3))

Confusion Matrix for RandomForestClassifier:
[[284   0]
 [  0  51]]
Classification Report for RandomForestClassifier:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       284
           1      1.000     1.000     1.000        51

    accuracy                          1.000       335
   macro avg      1.000     1.000     1.000       335
weighted avg      1.000     1.000     1.000       335



In [16]:
X_scaled = scaler.fit_transform(X)

In [17]:
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

In [18]:
print("\nConfusion Matrix for KNeighborsClassifier:")
knn_y_pred = knn_model.predict(X_test)
print(confusion_matrix(y_test, knn_y_pred))
print("Classification Report for KNeighborsClassifier:")
print(classification_report(y_test, knn_y_pred, digits=3))


Confusion Matrix for KNeighborsClassifier:
[[282   2]
 [ 34  17]]
Classification Report for KNeighborsClassifier:
              precision    recall  f1-score   support

           0      0.892     0.993     0.940       284
           1      0.895     0.333     0.486        51

    accuracy                          0.893       335
   macro avg      0.894     0.663     0.713       335
weighted avg      0.893     0.893     0.871       335



In [19]:
joblib.dump(model, "modelo_randomforest.pkl")
joblib.dump(knn_model, "modelo_knn.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(X.columns.tolist(), "colunas_treinadas.pkl")

['colunas_treinadas.pkl']

In [20]:
df_dash = pd.read_csv('alunos_completos.csv')

def filtrar_dados(frequencia_min=0.0, nota_min=0.0, turno='Todos'):
    dados_filtrados = df_dash[
        (df_dash['frequencia'] >= frequencia_min) &\
        (df_dash['nota'] >= nota_min)
    ]

    if turno != 'Todos':
        dados_filtrados = dados_filtrados[dados_filtrados['TURNO'].astype(str) == turno]

    plt.figure(figsize=(10, 5))
    plt.bar(dados_filtrados['NOME'], dados_filtrados['frequencia'], color='orange')
    plt.xticks(rotation=90)
    plt.title('Alunos com maior risco de evasão (baixa frequência)')
    plt.ylabel('Frequência (%)')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

turnos = sorted(df_dash['TURNO'].dropna().astype(str).unique())

interact(
    filtrar_dados,
    frequencia_min=FloatSlider(min=0, max=100, step=5, value=50),
    nota_min=FloatSlider(min=0, max=10, step=0.5, value=5),
    turno=Dropdown(options=['Todos'] + list(turnos), value='Todos')
);

interactive(children=(FloatSlider(value=50.0, description='frequencia_min', step=5.0), FloatSlider(value=5.0, …