In [1]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-3.1.3-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   - -------------------------------------- 2.1/72.0 MB 16.1 MB/s eta 0:00:05
   ---- ----------------------------------- 7.6/72.0 MB 22.2 MB/s eta 0:00:03
   ------- -------------------------------- 13.6/72.0 MB 25.0 MB/s eta 0:00:03
   --------- ------------------------------ 17.3/72.0 MB 25.6 MB/s eta 0:00:03
   ---------- ----------------------------- 18.4/72.0 MB 18.5 MB/s eta 0:00:03
   ----------- ---------------------------- 21.5/72.0 MB 19.4 MB/s eta 0:00:03
   -------------- ------------------------- 25.7/72.0 MB 19.1 MB/s eta 0:00:03
   -------------- ------------------------- 26.2/72.0 MB 16.7 MB/s eta 0:00:03
   -------------- ------------------------- 27.0/72.0 MB 15.0 MB/s eta 0:00:03
   ---------------- ----------------------- 29.1/72.0 MB 14.5 MB/s eta 0:0

In [11]:
import datetime
import time

import joblib
import numpy as np
import pandas as pd
from pymongo import MongoClient
from pymongo.errors import PyMongoError
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBRegressor

start_time = time.time()
print("--- ROZPOCZƒòCIE PROCESU ML ---")

# 1. CONNECTION
# MongoClient connects lazily: constructing it succeeds even when no server is
# listening, so the availability check has to be explicit. A short server
# selection timeout plus a `ping` makes an unreachable database fail here,
# inside the try block, instead of later in `find()`.
try:
    client = MongoClient('mongodb://localhost:27017/', serverSelectionTimeoutMS=5000)
    client.admin.command('ping')
    db = client['otomoto_project']
    collection = db['raw_listings']
except PyMongoError:
    # Keep the user-facing message, but re-raise so the real cause is visible
    # and "Run All" stops instead of shutting the kernel down via exit().
    print("‚ùå B≈ÇƒÖd bazy.")
    raise

# Load every listing, excluding fields that are not used as model inputs.
cursor = collection.find({}, {'_id': 0, 'link': 0, 'scraped_at': 0, 'title': 0, 'location_city': 0})
df = pd.DataFrame(list(cursor))

# Nothing to train on: stop with an explicit reason rather than exiting silently.
if df.empty:
    raise RuntimeError("Brak danych w kolekcji 'raw_listings' - przerywam.")

# 2. CLEANING (STANDARD)
# Remove exact duplicates, then rows missing any field the model cannot do without.
required_columns = ['price', 'year', 'mileage_km', 'horsepower_hp', 'brand', 'model', 'engine_capacity_cm3']
df = df.drop_duplicates().dropna(subset=required_columns)

current_year = datetime.datetime.now().year

# Sanity filters: each mask keeps values inside a plausible range.
price_ok = df['price'].gt(3000) & df['price'].lt(3000000)
year_ok = df['year'].ge(2000) & df['year'].le(current_year + 1)
power_ok = df['horsepower_hp'].gt(40) & df['horsepower_hp'].lt(800)
capacity_ok = df['engine_capacity_cm3'].gt(500) & df['engine_capacity_cm3'].lt(8000)
df = df[price_ok & year_ok & power_ok & capacity_ok]

# Brands and models: drop the "Inna" placeholder brand and keep the 30 most frequent brands.
df = df[df['brand'].ne("Inna")]
popular_brands = df['brand'].value_counts().index[:30].tolist()
df = df[df['brand'].isin(popular_brands)]

# Models with fewer than THRESHOLD listings are folded into the 'Inny' bucket.
THRESHOLD = 30
model_counts = df['model'].value_counts()
popular_models = model_counts[model_counts.ge(THRESHOLD)].index.tolist()
is_rare_model = ~df['model'].isin(popular_models)
df.loc[is_rare_model, 'model'] = 'Inny'

print(f" Dane podstawowe: {len(df)} rekord√≥w.")

# ==========================================
# 3. FEATURE ENGINEERING
# ==========================================
# Listings may carry year == current_year + 1 (the year filter allows it), which
# would give car_age == -1. That makes the "+1" divisor below exactly zero and
# makes car_age_squared indistinguishable from a one-year-old car, so the age
# is floored at 0.
df['car_age'] = (current_year - df['year']).clip(lower=0)
df['car_age_squared'] = df['car_age'] ** 2

# "+1" keeps the divisor positive for cars from the current year (age 0).
df['km_per_year'] = df['mileage_km'] / (df['car_age'] + 1)
# Engine capacity is converted from cm3 to litres before dividing.
df['hp_per_liter'] = df['horsepower_hp'] / (df['engine_capacity_cm3'] / 1000)

print("‚úÖ Dodano nowe cechy.")

# ==========================================
# 3b. FIX FOR XGBOOST
# ==========================================
# Turn infinities into NaN and drop incomplete rows. Note that dropna() has no
# `subset`, so a row with a missing value in ANY column is removed here, not
# only rows with broken engineered features.
df = df.replace([np.inf, -np.inf], np.nan).dropna()
print(f"üõ°Ô∏è Dane po usuniƒôciu b≈Çƒôd√≥w matematycznych (inf): {len(df)}")
# ==========================================

# 4. BRAND -> MODELS LOOKUP
# For every popular brand: its model names in alphabetical order, with the
# 'Inny' catch-all bucket (when the brand has one) always placed last.
brand_model_map = {}
for brand in popular_brands:
    models = set(df.loc[df['brand'] == brand, 'model'])
    named_models = sorted(models - {"Inny"})
    catch_all = ["Inny"] if "Inny" in models else []
    brand_model_map[brand] = named_models + catch_all

# 5. ONE-HOT ENCODING
# NOTE(review): 'currency' is discarded without any conversion of 'price' —
# confirm that all listings are priced in the same currency.
df = df.drop(columns=['currency'], errors='ignore')

categorical_columns = ['brand', 'model', 'fuel_type', 'transmission', 'seller_type', 'location_region']
df_ml = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Keep only numeric / boolean columns; anything still non-numeric is dropped here.
model_dtypes = ['int64', 'float64', 'uint8', 'bool']
df_ml = df_ml.select_dtypes(include=model_dtypes)

# 6. MODEL TUNING (RANDOM SEARCH)
# Target is the price; it is modelled on a log1p scale and mapped back with
# expm1 when the model is evaluated.
y = df_ml['price']
X = df_ml.drop(columns=['price'])
y_log = np.log1p(y)

# Hold out 20% of the rows for the final evaluation (fixed seed for repeatability).
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X,
    y_log,
    test_size=0.2,
    random_state=42,
)

print("üöÄ Rozpoczynam poszukiwanie najlepszych parametr√≥w...")

# Candidate values sampled by the randomized search.
param_grid = {
    'n_estimators': [500, 1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [6, 8],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9]
}

# Parallelism is enabled in exactly one layer. XGBoost already uses every core
# for a single fit (n_jobs=-1); also running the search with n_jobs=-1 would
# start one all-core fit per core and cause thread contention, so the search
# itself runs its candidates sequentially.
xgb = XGBRegressor(n_jobs=-1, random_state=42)

search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=10,                          # 10 sampled combinations ...
    scoring='neg_mean_absolute_error',  # ... scored by MAE on the log-price target
    cv=3,                               # ... with 3-fold cross-validation
    verbose=1,
    n_jobs=1,
    random_state=42
)

search.fit(X_train, y_train_log)

print(f"üèÜ Najlepsze parametry: {search.best_params_}")
# best_estimator_ is the winning configuration refit on the whole training split.
best_model = search.best_estimator_

# 7. EVALUATION
# Predictions and the held-out targets are both on the log1p scale; convert
# them back to prices before computing the metrics.
y_pred_log = best_model.predict(X_test)
y_pred, y_test_real = np.expm1(y_pred_log), np.expm1(y_test_log)

mae = mean_absolute_error(y_test_real, y_pred)
r2 = r2_score(y_test_real, y_pred)

separator = "-" * 40
print(separator)
print(f"üìâ MAE (≈öredni B≈ÇƒÖd): {mae:.0f} PLN")
print(f"üéØ R2 Score:          {r2:.4f}")
print(separator)

# 8. PERSISTENCE
# Three artifacts are written: the tuned model, the training column index, and
# the brand -> models lookup.
artifacts = {
    'model_ceny_aut.pkl': best_model,
    'model_kolumny.pkl': X.columns,
    'mapa_marka_model.pkl': brand_model_map,
}
for filename, payload in artifacts.items():
    joblib.dump(payload, filename)

# CSV export for the app: listings under 400000, limited to the selected columns.
cols_to_save = ['brand', 'model', 'year', 'price', 'horsepower_hp', 'fuel_type', 'mileage_km']
df_visuals = df.loc[df['price'] < 400000].copy()
df_visuals[cols_to_save].to_csv('baza_aut_clean.csv', index=False)

--- ROZPOCZYNAMY PROCES ML: FEATURE ENGINEERING + TUNING ---
üßπ Dane podstawowe: 16571 rekord√≥w.
‚úÖ Dodano nowe cechy.
üõ°Ô∏è Dane po usuniƒôciu b≈Çƒôd√≥w matematycznych (inf): 15629
üöÄ Rozpoczynam poszukiwanie najlepszych parametr√≥w...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
üèÜ Najlepsze parametry: {'subsample': 0.8, 'n_estimators': 500, 'max_depth': 8, 'learning_rate': 0.05, 'colsample_bytree': 0.8}
----------------------------------------
üìâ MAE (≈öredni B≈ÇƒÖd): 11412 PLN
üéØ R2 Score:          0.9125
----------------------------------------
