📘 Notebook – Classificação da Qualidade de Bananas

📌 1. Importação das bibliotecas

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

📌 2. Carregamento da base de dados

In [None]:
# Read the banana-quality table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='banana_quality.csv')

# Preview the first five rows to confirm the file parsed as expected.
df.head(n=5)

📌 3. Estatísticas gerais

In [None]:
# Summary statistics for the numeric columns, transposed so that each
# column of the dataset becomes one row of the displayed table.
# The title string had mis-decoded characters; it is restored to proper UTF-8.
print("\nEstatísticas Descritivas")
df.describe().T

📌 4. Transformação da coluna Quality → 0/1

In [None]:
# Encode the textual 'Quality' label as integers in a new 'Quality01' column.
le = LabelEncoder()
df['Quality01'] = le.fit_transform(df['Quality'])

# le.classes_[i] is the original label that was encoded as integer i;
# kept so later cells can translate predictions back to label names.
nomeclasse = le.classes_

# Report the mapping the encoder actually learned instead of a hard-coded
# claim, so the message stays correct if the labels in the CSV ever differ.
mapeamento = ", ".join(
    f"{codigo} = {rotulo}" for codigo, rotulo in enumerate(nomeclasse)
)
print("Qualidade codificada como:", mapeamento)

📌 5. Remoção de linhas com valores fora do intervalo (-4, 4)

In [None]:
# Keep only rows whose numeric features all lie strictly inside
# (-LIMITE, LIMITE). Rows with a missing numeric value are dropped too,
# because comparisons against NaN evaluate to False.
LIMITE = 4

# Numeric feature columns only: the encoded target 'Quality01' is a label,
# not a measurement, so it is left out of the range check.
nmrs = [
    coluna
    for coluna in df.select_dtypes(include=np.number).columns
    if coluna != 'Quality01'
]

# Record the size before `df` is reassigned so the report below does not
# have to reconstruct it afterwards.
linhas_antes = len(df)

mask = ((df[nmrs] > -LIMITE) & (df[nmrs] < LIMITE)).all(axis=1)

df_filtered = df[mask].copy()
linhas_removidas = linhas_antes - len(df_filtered)

# Downstream cells read `df`, so the filtered frame takes over that name.
df = df_filtered

print("Linhas antes:", linhas_antes)
print("Linhas após :", len(df))
print("Removidas   :", linhas_removidas)

📌 6. Separação em treino, validação e teste

In [None]:
# Features are every column except the raw label and its encoded copy.
X = df.drop(columns=['Quality', 'Quality01'])
y = df['Quality01']

# Step 1: hold out 20% of the rows as the test split, stratified on the label.
X_train_valid, X_test, y_train_valid, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 2: take 25% of the remaining 80% as validation, which gives roughly
# a 60/20/20 train/validation/test partition overall.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid, y_train_valid, test_size=0.25, random_state=42, stratify=y_train_valid
)

# Sanity check: the three partitions must account for every row exactly once.
assert len(X_train) + len(X_valid) + len(X_test) == len(X), \
    "train/valid/test sizes do not add up to the total"

print("Total: ", len(X))
print("Treino: ", len(X_train))
print("Validação: ", len(X_valid))
print("Teste: ", len(X_test))

📌 7. Normalização

In [None]:
# Learn the scaling statistics from the training split only, so no
# information from the validation or test rows influences the scaling.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transform to each split.
train_scaled, valid_scaled, test_scaled = (
    scaler.transform(particao) for particao in (X_train, X_valid, X_test)
)

📌 8. Gráfico

In [None]:
# Bar chart of how many rows fall into each 'Quality' class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Quality', color='yellow', ax=ax)
ax.set_title('Qualidade das Bananas')
plt.show()

📌 9. Treinamento do modelo Random Forest

In [None]:
# Build a 100-tree forest with a fixed seed (n_jobs=-1 uses all cores)
# and fit it on the scaled training split; fit() returns the estimator.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(train_scaled, y_train)

# Predictions on the validation split, evaluated in the next cell.
y_valid_pred = rf.predict(valid_scaled)

📌 10. Avaliação do modelo

In [None]:
# Evaluate on the validation split (the test split is not touched here).

# Pass the encoded class ids explicitly so the matrix always has one
# row/column per entry of `nomeclasse`, even if a class happens to be
# missing from y_valid or from the predictions. Without this the heatmap
# tick labels could be attached to the wrong rows/columns.
rotulos = np.arange(len(nomeclasse))

mat_conf = confusion_matrix(y_valid, y_valid_pred, labels=rotulos)
acc = accuracy_score(y_valid, y_valid_pred)

print("Acurácia:", acc)

# Per-class precision / recall / F1 alongside the single accuracy figure.
print(classification_report(
    y_valid,
    y_valid_pred,
    labels=rotulos,
    target_names=[str(nome) for nome in nomeclasse],
    zero_division=0,
))

# Rows are the true classes, columns the predicted ones.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(mat_conf, annot=True, fmt='d', cmap='inferno',
            xticklabels=nomeclasse, yticklabels=nomeclasse, ax=ax)
ax.set_title('Matriz de Confusão')
ax.set_ylabel('Real')
ax.set_xlabel('Predição')
plt.show()

📌 11. Predição de um exemplo

In [None]:
# First test row, selected with a list index so it stays a one-row
# DataFrame (2-D) rather than collapsing to a Series.
exemplo = X_test.iloc[[0]]
y_real = y_test.iloc[0]

# Scale with the already-fitted scaler, then classify the single sample.
exemplo_scaled = scaler.transform(exemplo)
pred = rf.predict(exemplo_scaled)[0]

# Show the scaled feature values under the original column names.
print("\nAmostra:")
amostra_df = pd.DataFrame(
    exemplo_scaled,
    columns=X_test.columns,
    index=['Amostra'],
)
display(amostra_df)

# Translate both encoded labels back to their class names.
for titulo, codigo in (("\nQualidade Predita:", pred), ("Qualidade Real:", y_real)):
    print(titulo, nomeclasse[codigo])