# Importar bibliotecas

In [None]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Carregar dados

In [None]:
# Name of the file inside the Kaggle dataset to load. The original cell passed
# `file_path` without ever defining it, which raised NameError.
# NOTE(review): the file name below is an assumption -- confirm it against the
# dataset's file listing on Kaggle before relying on it.
file_path = "turkiye-student-evaluation_generic.csv"

# Download the file (if needed) and load it into a pandas DataFrame.
# NOTE(review): recent kagglehub releases appear to prefer `dataset_load` over
# `load_dataset` -- verify against the installed version.
df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "adilshamim8/turkiye-student-evaluation",
    file_path,
)

print("First 5 records:", df.head())

# Exibir informações gerais e descritivas

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include="all")

# Padronizar colunas de avaliação (Q1 a Q28)

In [None]:
# Standardize every column whose name starts with "Q".
# NOTE(review): the scaler is fitted on the full frame, before the
# train/validation/test split done later in this file, so held-out rows
# contribute to the fitted statistics. Consider fitting on the training rows
# only and reusing the fitted scaler for validation and test.
question_cols = [name for name in df.columns if name.startswith('Q')]
scaler = StandardScaler()
scaler.fit(df[question_cols])
df[question_cols] = scaler.transform(df[question_cols])
df.head()

# Remover duplicatas

In [None]:
# Keep only the first occurrence of each fully identical row, then report the
# resulting (rows, columns) shape.
df = df[~df.duplicated()]
df.shape

# Separar variáveis preditoras e alvo

In [None]:
# The "nb.repeat" column is the prediction target; every other column is a
# feature.
y = df["nb.repeat"]
X = df.drop(columns=["nb.repeat"])

# Dividir os dados em treino, validação e teste

In [None]:
# First cut: 60 % of the rows for training, 40 % held back. Stratifying on the
# target keeps the class proportions the same on both sides.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
    stratify=y,
)
# Second cut: split the held-back rows evenly into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

X_train.shape, X_val.shape, X_test.shape

# Treinar o modelo

In [None]:
# Random forest with default hyper-parameters; the fixed seed makes the fit
# reproducible across runs.
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)

# Avaliar o modelo

In [None]:
# Score the fitted model on the validation split: raw confusion matrix plus
# overall accuracy.
y_val_pred = model.predict(X_val)
cm = confusion_matrix(y_true=y_val, y_pred=y_val_pred)
acc = accuracy_score(y_true=y_val, y_pred=y_val_pred)

# Exibir a matriz de confusão

In [None]:
# Plot the validation confusion matrix with the real class values on the axes.
# Building ConfusionMatrixDisplay from a bare matrix without `display_labels`
# labels the ticks 0..n_classes-1, which does not match the "nb.repeat" values.
# `from_predictions` derives the tick labels from the labels present in
# y_val / y_val_pred and draws the plot immediately.
disp = ConfusionMatrixDisplay.from_predictions(y_val, y_val_pred)
disp.ax_.set_title(f"Acurácia: {acc:.2f}")
plt.show()

# Predição de exemplo

In [None]:
# Predict the class of the first test row. `head(1)` keeps it as a one-row
# DataFrame, the 2-D input shape `predict` expects.
sample = X_test.head(1)
sample_pred = model.predict(sample)
print("Predição para o primeiro exemplo do teste:", sample_pred[0])