In [1]:
import sqlite3
import pandas as pd 
from pathlib import Path

# Locate the cleaned SQLite database relative to the notebook's working
# directory: two levels up, then scraper/data/cleaned_mobiles.db.
current_dir = Path.cwd()
project_root = current_dir.parent.parent
CLEANED_DB_PATH = project_root / 'scraper' / 'data' / 'cleaned_mobiles.db'

if not CLEANED_DB_PATH.exists():
    print(f"❌ Database not found at {CLEANED_DB_PATH}")
    print("Please run the scraper first: python scraper.py")
    # `sys` is not imported here, so `sys.exit(1)` would fail with NameError.
    # Raising SystemExit directly is exactly what sys.exit() does.
    raise SystemExit(1)

# Read the whole `cleaned_mobiles` table into a DataFrame. The try/finally
# guarantees the connection is closed even when the query raises.
conn = sqlite3.connect(CLEANED_DB_PATH)
try:
    df = pd.read_sql_query("SELECT * FROM cleaned_mobiles;", conn)
finally:
    conn.close()

if df.empty:
    print("❌ No data available for training!")
    print("Please run the scraper to collect data first.")
    raise SystemExit(1)

# Small datasets are allowed but flagged: this only warns, it does not stop.
if len(df) < 100:
    print(f"⚠️ Warning: Only {len(df)} samples - model may not be reliable")
    print("Recommendation: Collect at least 500 samples for better accuracy")

In [2]:
# Number of rows loaded from the database.
df.shape[0]

11268

In [3]:
# List the column names of the loaded table.
df.columns

Index(['URL', 'Title', 'Price', 'Published_time', 'Published_Date',
       'Seller_name', 'Location', 'Division', 'Condition', 'Model', 'Brand',
       'Features', 'Description', 'Img_urls', 'Date', 'RAM', 'Storage', 'lat',
       'lon', 'Network', 'Camera_Type', 'has_warranty', 'Battery',
       'Camera_Pixel', 'is_store'],
      dtype='object')

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Price,RAM,Storage,lat,lon,has_warranty,Battery,is_store
count,11268.0,11268.0,11268.0,11268.0,11268.0,11268.0,11268.0,11268.0
mean,17320.021122,7.433085,174.960064,23.873718,89.963023,0.488463,4752.339723,0.611555
std,15341.514624,2.359208,83.594988,0.736804,0.798475,0.499889,727.107614,0.487418
min,1500.0,1.0,16.0,21.442004,55.305202,0.0,500.0,0.0
25%,7899.75,6.0,128.0,23.709398,89.213416,0.0,4500.0,0.0
50%,12999.0,8.0,128.0,23.7776,90.363244,0.0,5000.0,1.0
75%,21990.0,8.0,256.0,23.936304,90.417363,1.0,5000.0,1.0
max,252000.0,24.0,1024.0,26.019957,92.468571,1.0,8600.0,1.0


In [5]:
# Column names once more (same call as an earlier cell), shown before the price filtering below.
df.columns

Index(['URL', 'Title', 'Price', 'Published_time', 'Published_Date',
       'Seller_name', 'Location', 'Division', 'Condition', 'Model', 'Brand',
       'Features', 'Description', 'Img_urls', 'Date', 'RAM', 'Storage', 'lat',
       'lon', 'Network', 'Camera_Type', 'has_warranty', 'Battery',
       'Camera_Pixel', 'is_store'],
      dtype='object')

In [6]:
# Trim extreme prices: keep only listings whose Price lies between the 7th and
# 93rd percentiles (both ends inclusive). Despite their names, Q1/Q3 hold those
# percentiles, not the quartiles.
# Note: `df` is rebound to the filtered frame, so re-running this cell trims again.
Q1, Q3 = df['Price'].quantile([0.07, 0.93]).tolist()
price_in_range = df['Price'].between(Q1, Q3)
df = df[price_in_range]

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Target variable.
y = df['Price']

# Columns left out of the feature matrix: the target itself plus free-text,
# URL, timestamp, seller, image and coordinate columns.
drop_cols = ['Price', 'URL', 'Title', 'Description', 'Features', 'Published_time',
             'Published_Date', "lat", 'lon', "Seller_name", "Img_urls", "Date"]

# Low-cardinality columns are one-hot encoded; high-cardinality columns are
# mapped to integer codes with a LabelEncoder.
low_card_cols = ['Brand', 'Condition', 'Network', 'Camera_Type', "Division", "has_warranty", "is_store"]
high_card_cols = ['Location', 'Model', 'Camera_Pixel']

X = pd.get_dummies(df.drop(columns=drop_cols), columns=low_card_cols)

# Fit one encoder per high-cardinality column and keep it in `encoders`, so the
# same label-to-integer mapping can be reused later (the encoders are pickled
# further down in the notebook).
encoders = {col: LabelEncoder().fit(X[col]) for col in high_card_cols}
for col, le in encoders.items():
    X[col] = le.transform(X[col])

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Inspect the training split of the encoded feature matrix.
X_train

Unnamed: 0,Location,Model,RAM,Storage,Battery,Camera_Pixel,Brand_Apple,Brand_Google,Brand_Helio,Brand_Honor,...,Division_ঢাকা,Division_বরিশাল,Division_ময়মনসিংহ,Division_রংপুর,Division_রাজশাহী,Division_সিলেট,has_warranty_0,has_warranty_1,is_store_0,is_store_1
3863,210,57,8.0,128.0,5000.0,49,False,False,False,False,...,True,False,False,False,False,False,False,True,False,True
3785,150,365,6.0,128.0,4260.0,24,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
1575,13,0,8.0,256.0,5000.0,77,False,False,False,False,...,True,False,False,False,False,False,False,True,False,True
1905,13,18,8.0,256.0,5000.0,21,False,False,False,False,...,True,False,False,False,False,False,False,True,False,True
6075,31,288,8.0,256.0,5000.0,67,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6728,14,241,6.0,128.0,5000.0,67,False,False,False,False,...,False,False,False,False,True,False,True,False,True,False
6095,90,48,6.0,128.0,4260.0,24,False,False,False,False,...,False,True,False,False,False,False,True,False,False,True
6323,164,314,8.0,128.0,5000.0,79,False,False,False,False,...,True,False,False,False,False,False,False,True,False,True
1017,210,18,8.0,128.0,5000.0,67,False,False,False,False,...,True,False,False,False,False,False,False,True,False,True


In [9]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Hyperparameters for the XGBoost regressor, gathered in one mapping so they
# can be read and tuned in a single place.
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 400,
    'max_depth': 30,
    'learning_rate': 0.02,
    'subsample': 0.8,          # row subsampling per tree
    'colsample_bytree': 0.8,   # column subsampling per tree
    'random_state': 42,
}
xgb_model = xgb.XGBRegressor(**xgb_params)

# Fit on the training split. The call is kept as the last expression so the
# fitted estimator is displayed as the cell output.
xgb_model.fit(X_train, y_train)


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [10]:
# Pair each feature-importance score from the fitted model with its column
# name, then display the scores from most to least important.
importances = pd.Series(
    data=xgb_model.feature_importances_,
    index=X_train.columns,
)
importances.sort_values(ascending=False)


Network_5G            0.624495
Brand_Apple           0.216496
Network_4G            0.069515
Brand_Google          0.017341
Brand_OPPO            0.005162
RAM                   0.004223
Brand_Realme          0.003579
Brand_Vivo            0.003576
Brand_SHARP           0.003490
Condition_Used        0.003377
Battery               0.003292
Brand_Xiaomi          0.003052
Brand_Samsung         0.003035
Brand_Nothing         0.002444
Condition_New         0.002288
Camera_Pixel          0.002196
Brand_Motorola        0.002137
Model                 0.001930
Storage               0.001914
Brand_OnePlus         0.001768
Division_রংপুর        0.001635
Brand_Tecno           0.001627
Brand_Infinix         0.001426
Brand_Huawei          0.001381
Network_Unknown       0.001352
Brand_iQOO            0.001322
Division_চট্টগ্রাম    0.001161
Brand_LG              0.001144
Brand_Honor           0.001105
Division_সিলেট        0.000910
is_store_1            0.000881
has_warranty_1        0.000878
Brand_ZT

In [11]:
# Predict on the held-out split and score the predictions.
y_pred = xgb_model.predict(X_test)

mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report each metric rounded to two decimals.
for label, value in (
    ("Mean Squared Error (MSE)", mse),
    ("Mean Absolute Error (MAE)", mae),
    ("R² Score", r2),
):
    print(f"{label}: {value:.2f}")

Mean Squared Error (MSE): 10884800.68
Mean Absolute Error (MAE): 2064.49
R² Score: 0.83


In [12]:
import pickle
import os
from pathlib import Path

# Output directory: <parent of the working directory>/website/models.
# NOTE(review): the data-loading cell derives `project_root` from
# `current_dir.parent.parent`, while this cell uses `current_dir.parent`.
# Confirm which root the website actually reads its models from.
current_dir = Path.cwd()
project_root = current_dir.parent
website_models = project_root / 'website' / 'models'
website_models.mkdir(parents=True, exist_ok=True)

# Medians of the numeric spec columns, computed on the (price-filtered) frame.
stats = {
    'battery_median': df['Battery'].median(),
    'storage_median': df['Storage'].median(),
    'ram_median': df['RAM'].median()
}

# Everything to be pickled, keyed by file name: the trained model, the fitted
# label encoders, the feature-column order, and the medians above.
artifacts = {
    'xgb_model.pkl': xgb_model,
    'encoders.pkl': encoders,
    'X_columns.pkl': X.columns.tolist(),
    'data_stats.pkl': stats,
}
for filename, payload in artifacts.items():
    with open(website_models / filename, 'wb') as f:
        pickle.dump(payload, f)

print(f"✅ All models saved to: {website_models}")

✅ All models saved to: C:\Users\Sinfo Tech Computer\3D Objects\bikroy\scraper\website\models
