In [None]:
import sqlite3
import pandas as pd 
from pathlib import Path

# Locate the cleaned SQLite database relative to the notebook's working
# directory: the project root is taken to be two levels above the cwd.
current_dir = Path.cwd()
project_root = current_dir.parent.parent
CLEANED_DB_PATH = project_root/'scraper'/'data'/'cleaned_mobiles.db'

if not CLEANED_DB_PATH.exists():
    print(f"❌ Database not found at {CLEANED_DB_PATH}")
    print("Please run the scraper first: python scraper.py")
    # `sys` is not among this cell's imports, so `sys.exit(1)` would fail with
    # NameError. Raising SystemExit directly is what sys.exit() does anyway.
    raise SystemExit(1)

# Read the whole `cleaned_mobiles` table into a DataFrame. The try/finally
# guarantees the connection is closed even if the query raises.
conn = sqlite3.connect(CLEANED_DB_PATH)
try:
    df = pd.read_sql_query("SELECT * FROM cleaned_mobiles;", conn)
finally:
    conn.close()

if df.empty:
    print("❌ No data available for training!")
    print("Please run the scraper to collect data first.")
    raise SystemExit(1)

# Small datasets are allowed, but the user is warned that results may be weak.
if len(df) < 100:
    print(f"⚠️ Warning: Only {len(df)} samples - model may not be reliable")
    print("Recommendation: Collect at least 500 samples for better accuracy")

# Rest of your code...

In [31]:
# Number of rows currently held in `df`.
len(df)

10163

In [19]:
# Column labels of `df`, as read from the `cleaned_mobiles` table.
df.columns

Index(['URL', 'Title', 'Price', 'Published_time', 'Published_Date',
       'Seller_name', 'Location', 'Division', 'Condition', 'Model', 'Brand',
       'Features', 'Description', 'Img_urls', 'Date', 'RAM', 'Storage', 'lat',
       'lon', 'Network', 'Camera_Type', 'has_warranty', 'Battery',
       'Camera_Pixel', 'is_store'],
      dtype='object')

In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Price,RAM,Storage,lat,lon,has_warranty,Battery,is_store
count,10163.0,10163.0,10163.0,10163.0,10163.0,10163.0,10163.0,10163.0
mean,17125.691922,7.440126,175.934271,23.867458,89.960367,0.494637,4759.066122,0.621372
std,15347.639008,2.338386,83.613496,0.732103,0.805303,0.499996,714.94177,0.485069
min,1500.0,1.0,16.0,21.442004,55.305202,0.0,500.0,0.0
25%,7690.0,6.0,128.0,23.709398,89.213416,0.0,4500.0,0.0
50%,12500.0,8.0,128.0,23.7776,90.363244,0.0,5000.0,1.0
75%,21500.0,8.0,256.0,23.936304,90.415943,1.0,5000.0,1.0
max,252000.0,24.0,1024.0,26.019957,92.468571,1.0,8600.0,1.0


In [21]:
# Column labels of `df` (same inspection as the earlier `df.columns` cell).
df.columns

Index(['URL', 'Title', 'Price', 'Published_time', 'Published_Date',
       'Seller_name', 'Location', 'Division', 'Condition', 'Model', 'Brand',
       'Features', 'Description', 'Img_urls', 'Date', 'RAM', 'Storage', 'lat',
       'lon', 'Network', 'Camera_Type', 'has_warranty', 'Battery',
       'Camera_Pixel', 'is_store'],
      dtype='object')

In [22]:
# Trim price outliers: keep only listings whose Price lies between the 7th and
# 93rd percentiles (bounds inclusive). Despite the names, Q1/Q3 are these
# percentile cut-offs, not the quartiles.
# NOTE: this rebinds `df`, so re-running the cell trims the already-trimmed data.
price = df['Price']
Q1 = price.quantile(0.07)
Q3 = price.quantile(0.93)
df = df[price.between(Q1, Q3)]

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Feature matrix: everything in `df` except the target (Price) and the
# identifier / free-text / timestamp / coordinate columns listed below.
non_feature_cols = [
    'Price', 'URL', 'Title', 'Description', 'Features',
    'Published_time', 'Published_Date', "lat", 'lon',
    "Seller_name", "Img_urls", "Date",
]
X = df.drop(columns=non_feature_cols)

# Columns expanded into one indicator column per distinct value.
low_card_cols = ['Brand', 'Condition', 'Network', 'Camera_Type', "Division", "has_warranty", "is_store"]
# Columns mapped to integer codes instead (one code per distinct value).
high_card_cols = ['Location', 'Model', 'Camera_Pixel']

X = pd.get_dummies(X, columns=low_card_cols)

# One LabelEncoder per high-cardinality column, fitted on the full dataset
# (i.e. before the train/test split) and kept in `encoders`, keyed by column.
encoders = {}
for col in high_card_cols:
    encoders[col] = LabelEncoder()
    X[col] = encoders[col].fit_transform(X[col])

# Regression target.
y = df['Price']

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [24]:
# Display the training feature matrix produced by train_test_split.
X_train

Unnamed: 0,Location,Model,RAM,Storage,Battery,Camera_Pixel,Brand_Apple,Brand_Google,Brand_Helio,Brand_Honor,...,Division_ঢাকা,Division_বরিশাল,Division_ময়মনসিংহ,Division_রংপুর,Division_রাজশাহী,Division_সিলেট,has_warranty_0,has_warranty_1,is_store_0,is_store_1
7156,175,220,6.0,128.0,4410.0,35,False,True,False,False,...,True,False,False,False,False,False,False,True,False,True
6647,13,11,8.0,128.0,5000.0,66,False,False,False,False,...,True,False,False,False,False,False,False,True,False,True
9035,23,12,8.0,128.0,4500.0,94,False,False,False,False,...,False,False,False,False,True,False,True,False,True,False
3286,162,277,4.0,64.0,4000.0,59,False,False,False,False,...,True,False,False,False,False,False,True,False,True,False
9189,136,332,8.0,256.0,6000.0,36,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6708,130,220,6.0,128.0,4410.0,35,False,True,False,False,...,True,False,False,False,False,False,False,True,False,True
6082,69,174,4.0,64.0,5000.0,2,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
6302,95,301,8.0,128.0,4500.0,44,False,False,False,False,...,True,False,False,False,False,False,False,True,False,True
1014,207,17,8.0,128.0,5000.0,94,False,False,False,False,...,True,False,False,False,False,False,False,True,False,True


In [25]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Hyperparameters for the XGBoost regressor, gathered in one place.
# NOTE(review): max_depth=30 allows very deep trees; worth checking against a
# shallower setting for overfitting.
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=400,       # number of boosting rounds
    max_depth=30,           # maximum depth of each tree
    learning_rate=0.02,
    subsample=0.8,          # fraction of rows sampled per tree
    colsample_bytree=0.8,   # fraction of columns sampled per tree
    random_state=42,
)
xgb_model = xgb.XGBRegressor(**xgb_params)

# Fit on the training split; as the last expression this also displays the model.
xgb_model.fit(X_train, y_train)


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [26]:
# Pair each fitted importance score with its training column name, then list
# the features from most to least important.
importance_values = xgb_model.feature_importances_
importances = pd.Series(importance_values, index=X_train.columns)
importances.sort_values(ascending=False)


Network_5G            0.595014
Brand_Apple           0.261571
Network_4G            0.056142
Brand_Google          0.018912
Brand_OPPO            0.004571
RAM                   0.004316
Brand_Realme          0.003677
Brand_Xiaomi          0.003527
Battery               0.003229
Brand_Samsung         0.003209
Brand_Nothing         0.002877
Condition_Used        0.002874
Brand_Motorola        0.002598
Camera_Pixel          0.002388
Brand_Vivo            0.002373
Condition_New         0.002268
Brand_OnePlus         0.002075
Model                 0.001948
Storage               0.001944
Division_রংপুর        0.001912
Brand_Huawei          0.001827
Brand_Infinix         0.001497
Brand_Honor           0.001265
Brand_LG              0.001160
Brand_Tecno           0.001101
is_store_1            0.001026
Division_চট্টগ্রাম    0.000982
Division_সিলেট        0.000893
Division_ঢাকা         0.000856
Brand_Helio           0.000853
has_warranty_1        0.000837
Brand_Sony            0.000834
is_store

In [27]:
# Predict prices for the held-out test split.
y_pred = xgb_model.predict(X_test)

# Score the predictions with each regression metric, in the order MSE, MAE, R².
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R² Score: {r2:.2f}")

Mean Squared Error (MSE): 11105025.19
Mean Absolute Error (MAE): 2093.32
R² Score: 0.81


In [29]:
import pickle
import os
from pathlib import Path

# Output directory: <parent of cwd>/website/models.
# NOTE(review): `project_root` here is ONE level above the cwd, whereas the
# data-loading cell uses two levels (cwd.parent.parent). Confirm which
# directory the website actually reads its models from.
current_dir = Path.cwd()
project_root = current_dir.parent
website_models = project_root / 'website' / 'models'

# Make sure the target directory exists (no error if it already does).
website_models.mkdir(parents=True, exist_ok=True)


def _save_pickle(obj, filename):
    """Pickle `obj` into `website_models / filename`, overwriting any existing file."""
    with open(website_models / filename, 'wb') as fh:
        pickle.dump(obj, fh)


# Trained model, fitted label encoders, and the exact feature-column order.
_save_pickle(xgb_model, 'xgb_model.pkl')
_save_pickle(encoders, 'encoders.pkl')
_save_pickle(X.columns.tolist(), 'X_columns.pkl')

# Medians of the numeric spec columns, taken from the current `df`.
stats = {
    'battery_median': df['Battery'].median(),
    'storage_median': df['Storage'].median(),
    'ram_median': df['RAM'].median()
}
_save_pickle(stats, 'data_stats.pkl')

print(f"✅ All models saved to: {website_models}")

✅ All models saved to: c:\Users\Sinfo Tech Computer\3D Objects\bikroy\scraper\website\models
