In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
# Load the dataset; the path is resolved relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="vitaminas.csv")

In [7]:
# Show the column headers as a plain Python list.
print(list(df.columns))


['Date', 'Product Name', 'Category', 'Units Sold', 'Price', 'Revenue', 'Discount', 'Units Returned', 'Location', 'Platform']


In [10]:
# 2. Aggregate per product (adjust the column names if the CSV headers differ):
#    mean discount and total units sold for each "Product Name".
# The result has exactly one row per distinct product, so every model fitted
# on X / y below sees only as many samples as there are products.
df_grouped = df.groupby("Product Name", as_index=False).agg(
    {"Discount": "mean", "Units Sold": "sum"}
)

# Single-feature design matrix (kept two-dimensional) and the target series.
X = df_grouped.loc[:, ["Discount"]]
y = df_grouped.loc[:, "Units Sold"]


In [11]:
# 3. Train/test split: test_size=0.2 holds out a fifth of the rows, and the
#    fixed random_state keeps the partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [12]:
# ---------------------
# Linear model
# ---------------------
# fit() hands back the estimator, so construction and training are chained.
lin_model = LinearRegression().fit(X_train, y_train)
y_pred_lin = lin_model.predict(X_test)

In [13]:

# ---------------------
# Polynomial model (degree 2)
# ---------------------
# The feature expansion is fitted on the training split only and then applied
# unchanged to both splits.
# NOTE(review): with library defaults the expansion presumably includes a
# constant column in addition to LinearRegression's own intercept — confirm
# whether include_bias=False is wanted here.
poly = PolynomialFeatures(degree=2)
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)

poly_model = LinearRegression().fit(X_train_poly, y_train)
y_pred_poly = poly_model.predict(X_test_poly)

In [14]:
# ---------------------
# Random forest
# ---------------------
# 200 trees; the fixed random_state makes the fitted forest reproducible.
rf_model = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

In [18]:
def evaluar(y_true, y_pred, nombre):
    """Print R² and RMSE for one set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        nombre: Label shown in the header line above the metrics.

    Returns:
        None. The header and both metrics (rounded to three decimals) are
        written to stdout.
    """
    print(f"--- {nombre} ---")

    r2 = r2_score(y_true, y_pred)
    print("R²:", round(r2, 3))

    # RMSE is the square root of the mean squared error.
    mse = mean_squared_error(y_true, y_pred)
    print("RMSE:", round(np.sqrt(mse), 3))

In [21]:
# Report the same metrics for every fitted model against the held-out targets.
for _label, _pred in (
    ("Lineal", y_pred_lin),
    ("Polinómico (grado 2)", y_pred_poly),
    ("Random Forest", y_pred_rf),
):
    evaluar(y_test, _pred, _label)

--- Lineal ---
R²: -0.39
RMSE: 181.862
--- Polinómico (grado 2) ---
R²: -0.832
RMSE: 208.773
--- Random Forest ---
R²: -0.885
RMSE: 211.777


In [27]:
import pandas as pd

# 1. Load the sales data and normalise the headers to snake_case
#    (lower-cased, spaces replaced by underscores).
# NOTE(review): this rebinds `df`, which earlier cells read using the original
# "Product Name"-style headers; re-running those cells after this one raises
# a KeyError. A distinct variable name would avoid that.
df = pd.read_csv("data/sales_data.csv")
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

# 2. One row per (product, platform) pair, summing units sold and discount.
resumen_plataformas = (
    df.groupby(['product_name', 'platform'])
    .agg(
        total_units_sold=pd.NamedAgg(column='units_sold', aggfunc='sum'),
        total_discount=pd.NamedAgg(column='discount', aggfunc='sum'),
    )
    .reset_index()
)

# 3. Show the result: header line first, then the full summary table.
print(
    "--- Resumen de Ventas por Producto, Plataforma y Descuento ---",
    resumen_plataformas,
    sep="\n",
)

--- Resumen de Ventas por Producto, Plataforma y Descuento ---
          product_name platform  total_units_sold  total_discount
0          Ashwagandha   Amazon             14170           11.64
1          Ashwagandha  Walmart             13441           10.25
2          Ashwagandha    iHerb             13797           12.70
3                 BCAA   Amazon             13387           10.96
4                 BCAA  Walmart             14835           11.50
5                 BCAA    iHerb             12805           10.87
6               Biotin   Amazon             13682           10.44
7               Biotin  Walmart             13871           11.63
8               Biotin    iHerb             13980           11.12
9    Collagen Peptides   Amazon             13739           10.68
10   Collagen Peptides  Walmart             13852           12.42
11   Collagen Peptides    iHerb             13265           10.13
12            Creatine   Amazon             15273           11.73
13           