# 📘 ETAPA 3 — Modelo Baseline (Versão Melhorada)
Este notebook foi gerado automaticamente com ajustes adicionais:
- Gráfico Valores Reais vs Previstos
- Top 3 features mais importantes
- Organização mais clara para apresentação


## 1. Importações

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import joblib
import os


## 2. Carregar Dataset

In [None]:
# Read the preprocessed dataset (path is relative to the working directory)
# and report its dimensions as (rows, columns).
df = pd.read_csv(filepath_or_buffer='dataset_preprocessado.csv')
print(f'Shape: {df.shape}')
# Trailing expression: rendered as the cell output when run in a notebook.
df.head(n=5)

## 3. Preparar Dados (X e y)

In [None]:
# Build the feature matrix X and the target vector y from df.

# Discard the 'sale_id' column when present; errors='ignore' turns the
# drop into a no-op if the column does not exist.
df = df.drop(columns=['sale_id'], errors='ignore')

# Features are every column except the target 'monthly_sales'.
X = df.drop(columns=['monthly_sales'])
y = df['monthly_sales']

# Force every feature column to a numeric dtype: values that cannot be
# parsed become NaN (errors='coerce'), and all NaN are then replaced by 0.
X = X.apply(lambda coluna: pd.to_numeric(coluna, errors='coerce'))
X = X.fillna(0)

print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')


## 4. Divisão 60/20/20

In [None]:
# Two-stage split into train / validation / test.
# Stage 1: hold out 40% of the rows; the remaining 60% is the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.40,
    random_state=42,
)

# Stage 2: cut the held-out 40% in half -> 20% validation, 20% test.
# The fixed random_state keeps both splits reproducible.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    random_state=42,
)

print(f"Treino: {X_train.shape}")
print(f"Valida√ß√£o: {X_val.shape}")
print(f"Teste: {X_test.shape}")


## 5. Treinar Modelo Linear

In [None]:
# Fit a LinearRegression baseline on the training split; fit() returns the
# estimator itself, so construction and fitting can be chained.
modelo = LinearRegression().fit(X_train, y_train)

# Predictions for the training and validation splits (in that order),
# consumed by the metric and plotting cells.
y_pred_train, y_pred_val = (
    modelo.predict(particao) for particao in (X_train, X_val)
)

print("Modelo treinado!")

## 6. Métricas de Desempenho

In [None]:
# Evaluate the fitted model on the training and validation splits using
# MSE, RMSE, MAE and R^2, then print both sets of metrics.

# Mean squared error.
mse_train = mean_squared_error(y_train, y_pred_train)
mse_val = mean_squared_error(y_val, y_pred_val)

# Root mean squared error: square root of the MSE, so it is expressed in
# the same units as the target.
rmse_train = np.sqrt(mse_train)
rmse_val = np.sqrt(mse_val)

# Mean absolute error.
mae_train = mean_absolute_error(y_train, y_pred_train)
mae_val = mean_absolute_error(y_val, y_pred_val)

# Coefficient of determination.
r2_train = r2_score(y_train, y_pred_train)
r2_val = r2_score(y_val, y_pred_val)

print("=== M√©tricas Treino ===")
print("MSE:", mse_train)
print("RMSE:", rmse_train)
print("MAE:", mae_train)
print("R¬≤:", r2_train)
# The leading "\n" escape prints a blank separator line. A raw line break
# inside a single-quoted string literal is a SyntaxError, so the newline
# must be written as an escape sequence.
print("\n=== M√©tricas Valida√ß√£o ===")
print("MSE:", mse_val)
print("RMSE:", rmse_val)
print("MAE:", mae_val)
print("R¬≤:", r2_val)


## 7. Gráfico: Valores Reais vs Previstos (VAL)

In [None]:
# Scatter plot of validation targets (x axis) against the model's
# predictions (y axis), drawn through the Axes interface.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(y_val, y_pred_val, alpha=0.6)
ax.set_xlabel("Valores reais")
ax.set_ylabel("Previs√µes")
ax.set_title("Valores Reais vs Previstos - Valida√ß√£o")
ax.grid(True)
plt.show()


## 8. Gráfico de Resíduos

In [None]:
# Residuals on the validation split: observed value minus prediction.
residuos = y_val - y_pred_val

# Residuals plotted against predictions; the red horizontal line at 0
# marks where a prediction would match the observed value exactly.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(y_pred_val, residuos, alpha=0.6)
ax.axhline(0, color='red')
ax.set_xlabel("Previs√µes")
ax.set_ylabel("Res√≠duos")
ax.set_title("Res√≠duos vs Previs√µes")
ax.grid(True)
plt.show()


## 9. Top 3 Features Mais Importantes

In [None]:
# Pair each fitted coefficient with the name of its feature column.
coeficientes = pd.Series(data=modelo.coef_, index=X_train.columns)

# Rank features by absolute coefficient value (sign discarded) and keep
# the three largest. NOTE(review): coefficient magnitude depends on each
# feature's scale — confirm the features were standardized upstream before
# reading this as importance.
magnitudes = coeficientes.abs()
top3 = magnitudes.sort_values(ascending=False).iloc[:3]

print("Top 3 features mais importantes:")
print(top3)


## 10. Salvar Modelo

In [None]:
# Persist the fitted model with joblib under the 'models' directory.
caminho_modelo = 'models/modelo_baseline_melhorado.pkl'

# Create the target directory first; exist_ok=True makes this a no-op when
# it already exists.
os.makedirs(os.path.dirname(caminho_modelo), exist_ok=True)
joblib.dump(modelo, caminho_modelo)
print(f"Modelo salvo em {caminho_modelo}")
