In [47]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
import joblib

# Load the raw sales table; the path is resolved relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer="ecommerce_sales.csv")

In [48]:
# Split the columns by dtype: int64/float64 on one side, object columns on the
# other. Both index objects are reused by later cells.
# NOTE(review): the cell under the "1. Tratamento de Valores Faltantes" header
# repeats these exact statements; confirm whether both copies are needed.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Numeric gaps are filled with each column's median.
imputer_num = SimpleImputer(strategy='median')
imputer_num.fit(df[num_cols])
df[num_cols] = imputer_num.transform(df[num_cols])

# Categorical gaps are filled with each column's most frequent value.
imputer_cat = SimpleImputer(strategy='most_frequent')
imputer_cat.fit(df[cat_cols])
df[cat_cols] = imputer_cat.transform(df[cat_cols])

1. Tratamento de Valores Faltantes
Para numéricas, use mediana devido a outliers. Para categóricas, use moda.

In [49]:
# Column groups by dtype (int64/float64 vs. object); later cells reuse both.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# One imputer per column group: median for numeric columns, most frequent
# value for categorical ones.
imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

# Fit each imputer on its own column group and write the filled values back,
# numeric group first.
for _cols, _imputer in ((num_cols, imputer_num), (cat_cols, imputer_cat)):
    df[_cols] = _imputer.fit_transform(df[_cols])

2. Tratamento de Outliers
Limite (cap) os outliers pelos limites do IQR (Q1 − 1,5·IQR e Q3 + 1,5·IQR) nas colunas com extremos identificados na EDA.

In [50]:
# Cap the two columns at Q1 - 1.5*IQR and Q3 + 1.5*IQR: values outside the
# fences are pulled back to the nearest fence, no rows are dropped.
for col in ('avg_price', 'return_rate'):
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    df[col] = np.clip(df[col], lower_fence, upper_fence)

3. Encoding de Variáveis Categóricas
One-hot para as variáveis nominais (free_shipping e product_category); ordinal para seasonality (assumindo a ordem low < medium < high).

In [51]:
# One-hot encode the nominal columns, dropping the first level of each so the
# dummy columns are not perfectly collinear.
nominal_cols = ['free_shipping', 'product_category']
df = pd.get_dummies(df, columns=nominal_cols, drop_first=True)

# Clean the column: trim surrounding whitespace and lowercase the labels so
# they match the category list handed to the encoder.
trimmed_labels = df['seasonality'].str.strip()
df['seasonality'] = trimmed_labels.str.lower()

# Encode seasonality using the explicit order low, medium, high.
ordinal_encoder = OrdinalEncoder(categories=[['low', 'medium', 'high']])
ordinal_encoder.fit(df[['seasonality']])
df['seasonality'] = ordinal_encoder.transform(df[['seasonality']])

4. Normalização de Variáveis Numéricas
Use StandardScaler e salve o objeto.

In [52]:
import os  # add if needed (kept local to this cell, as in the original)

# Make sure the output folder exists one level above the working directory
# (the project root, per the original note).
os.makedirs('../models', exist_ok=True)

# Standardise the columns listed in num_cols; fitting and transforming are
# done as two explicit steps on the same data.
scaler = StandardScaler()
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

# Persist the fitted scaler. Kept as the cell's last expression so the list of
# written file names is still displayed as the cell output.
joblib.dump(scaler, '../models/scaler.pkl')

['../models/scaler.pkl']

5. Feature Engineering (Opcional)
Crie a razão de eficiência (monthly_sales / marketing_spend) e a interação tráfego × conversão (website_traffic × conversion_rate).

In [53]:
# Build the engineered features from original-unit values.
#
# By this point df[num_cols] has already been standardised by `scaler`, so
# dividing or multiplying those columns directly yields ratios/products of
# z-scores: the denominator is centred on zero, which makes the ratio explode
# near the mean and flip sign arbitrarily. Undo the scaling on a temporary
# frame and derive the features from that instead.
#
# Assumes the four source columns below are part of num_cols (they have to be
# numeric for this arithmetic to work at all) - confirm against the dataset.
raw_numeric = pd.DataFrame(
    scaler.inverse_transform(df[num_cols]),
    columns=num_cols,
    index=df.index,
)

# A (numerically) zero spend has no defined sales-per-spend ratio; leave those
# rows as NaN rather than +/-inf. np.isclose is used because the round trip
# through the scaler may not reproduce an exact 0.0.
spend = raw_numeric['marketing_spend']
spend = spend.mask(np.isclose(spend, 0.0))

# NOTE(review): the two new columns are left in original units (not
# standardised), and if monthly_sales is the modelling target then
# efficiency_ratio leaks it - confirm both points before training.
df['efficiency_ratio'] = raw_numeric['monthly_sales'] / spend
df['traffic_conversion'] = raw_numeric['website_traffic'] * raw_numeric['conversion_rate']

In [54]:
import os  # add if needed (kept local to this cell, as in the original)

# Create the destination folder one level above the working directory if it
# is missing, then write the processed frame without the index column.
os.makedirs('../data', exist_ok=True)

df.to_csv(
    "../data/ecommerce_sales_clean.csv",
    index=False,
)