In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import zipfile
import os

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression


In [71]:
train_datas_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/module4_exercise_train.zip'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/Neighborhood_Market_data.csv'

def download_file(url, file_name):
    response = requests.get(url)
    response.raise_for_status()
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

download_file(train_datas_url, 'module4_exercise_train.zip')
download_file(test_data_url, 'Neighborhood_Market_data.csv')


Downloaded module4_exercise_train.zip from https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/module4_exercise_train.zip
Downloaded Neighborhood_Market_data.csv from https://www.raphaelcousin.com/modules/data-science-practice/module4/exercise/Neighborhood_Market_data.csv


In [72]:

with zipfile.ZipFile("module4_exercise_train.zip", "r") as zip_ref:
    zip_ref.extractall("train_data")


df_city = pd.read_csv("train_data/CityMart_data.csv")
df_green = pd.read_csv("train_data/Greenfield_Grocers_data.csv")
df_outlet = pd.read_excel("train_data/SuperSaver_Outlet_data.xlsx")
df_bazaar = pd.read_json("train_data/HighStreet_Bazaar_data.json")

df_train = pd.concat([df_city, df_green, df_outlet, df_bazaar], ignore_index=True)
df_test = pd.read_csv("Neighborhood_Market_data.csv")

print("Train shape:", df_train.shape)
print("Test shape:", df_test.shape)


Train shape: (1594, 12)
Test shape: (409, 10)


In [73]:
import requests
import pandas as pd


url_auth = "https://www.raphaelcousin.com/api/exercise/auth"
r = requests.get(url_auth)
r.raise_for_status()
auth_data = r.json()

password = auth_data["data"]["password"]
print("🔑 Password récupéré :", password)


url_prices = f"https://www.raphaelcousin.com/api/exercise/{password}/prices"
r = requests.get(url_prices)
r.raise_for_status()
prices_data = r.json()


data_dict = prices_data["data"]
df_prices = pd.DataFrame(list(data_dict.items()), columns=["item_code", "unit_cost"])

print(" Aperçu de df_prices :")
print(df_prices.head())


df_train = df_train.merge(df_prices, on="item_code", how="left")
df_test = df_test.merge(df_prices, on="item_code", how="left")

print(" Colonne unit_cost ajoutée :", df_prices.shape)



🔑 Password récupéré : RcUZjhdsYLRzwi4
 Aperçu de df_prices :
  item_code  unit_cost
0     P0001      22.14
1     P0002      26.91
2     P0003      16.90
3     P0004       7.04
4     P0005      20.84
 Colonne unit_cost ajoutée : (2000, 2)


In [74]:
!pip install selenium chromedriver-autoinstaller




In [75]:
import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time


chromedriver_autoinstaller.install()


chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")


driver = webdriver.Chrome(options=chrome_options)


url = "https://www.raphaelcousin.com/module4/scrapable-data"
driver.get(url)
time.sleep(5)


html = driver.page_source
driver.quit()


soup = BeautifulSoup(html, "html.parser")
tables = soup.find_all("table")
print(" Nombre de tables trouvées :", len(tables))


rows = []
for row in exercise_table.find("tbody").find_all("tr"):
    cols = [col.text.strip() for col in row.find_all("td")]
    rows.append(cols)


df_exercise = pd.DataFrame(
    rows,
    columns=["item_code", "customer_score", "total_reviews", "extra_col"]
)


df_exercise = df_exercise[["item_code", "customer_score", "total_reviews"]]


df_exercise["customer_score"] = pd.to_numeric(df_exercise["customer_score"], errors="coerce")
df_exercise["total_reviews"] = pd.to_numeric(df_exercise["total_reviews"], errors="coerce")

print(" Aperçu du scraping corrigé :")
print(df_exercise.head())


df_train = df_train.merge(df_exercise, on="item_code", how="left")
df_test = df_test.merge(df_exercise, on="item_code", how="left")

print(" Colonnes customer_score et total_reviews ajoutées :", df_train.shape, df_test.shape)


 Nombre de tables trouvées : 2
 Aperçu du scraping corrigé :
  item_code  customer_score  total_reviews
0     P0001               2            972
1     P0002               3            260
2     P0003               2            285
3     P0004               5            512
4     P0005               3             85
 Colonnes customer_score et total_reviews ajoutées : (1594, 15) (409, 13)


In [76]:

df_train = df_train.dropna(subset=["quantity_sold"])
print(" Après nettoyage, taille train :", df_train.shape)


if "unit_cost_x" in df_train.columns and "unit_cost_y" in df_train.columns:
    df_train["unit_cost"] = df_train[["unit_cost_x", "unit_cost_y"]].mean(axis=1)
    df_train = df_train.drop(columns=["unit_cost_x", "unit_cost_y"])

if "unit_cost_x" in df_test.columns and "unit_cost_y" in df_test.columns:
    df_test["unit_cost"] = df_test[["unit_cost_x", "unit_cost_y"]].mean(axis=1)
    df_test = df_test.drop(columns=["unit_cost_x", "unit_cost_y"])


cols_to_keep = [
    "mass", "dimension_length", "dimension_width", "dimension_height",
    "days_since_last_purchase", "package_volume", "stock_age",
    "unit_cost", "customer_score", "total_reviews"
]


common_cols = [c for c in cols_to_keep if c in df_test.columns and c in df_train.columns]

print("Colonnes finales train :", df_train[common_cols + ["quantity_sold"]].columns.tolist())
print("Colonnes finales test :", df_test[common_cols].columns.tolist())

X_train = df_train[common_cols + ["quantity_sold"]]
X_test = df_test[common_cols]


mae, preds = get_simple_baseline(
    data=X_train,
    target_col="quantity_sold",
    k_fold=5,
    scaler="standard",
    X_data_test=X_test
)

print(f" MAE obtenu : {mae:.2f}")


submission = pd.DataFrame({
    "item_code": df_test["item_code"].values,   # ✅ on prend la vraie colonne item_code
    "quantity_sold": np.maximum(0, preds.astype(int))
})

submission.to_csv("submission.csv", index=False)
print("✅ submission.csv généré avec", len(submission), "lignes")
print(submission.head())



 Après nettoyage, taille train : (1190, 15)
Colonnes finales train : ['mass', 'dimension_length', 'dimension_width', 'dimension_height', 'days_since_last_purchase', 'package_volume', 'stock_age', 'unit_cost', 'customer_score', 'total_reviews', 'quantity_sold']
Colonnes finales test : ['mass', 'dimension_length', 'dimension_width', 'dimension_height', 'days_since_last_purchase', 'package_volume', 'stock_age', 'unit_cost', 'customer_score', 'total_reviews']
 MAE moyen (cross-val): 20.704061924114214
 MAE obtenu : 20.70
✅ submission.csv généré avec 409 lignes
  item_code  quantity_sold
0     P0002            196
1     P0004            278
2     P0005            192
3     P0010            260
4     P0013            271
