In [63]:
import os
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier


In [64]:
# Folder that holds the processed CSV files. It defaults to the original
# local folder so existing runs behave the same; set the SMARTPHONE_DATA_DIR
# environment variable to run the notebook on another machine.
DATA_DIR = Path(
    os.environ.get(
        "SMARTPHONE_DATA_DIR",
        r"D:\collage\lv3\Sem1\Artificial intelligence_\Project\Smart Phone Prices Prediction",
    )
)

# Both files are read from the same folder.
df_train = pd.read_csv(DATA_DIR / "train_processed.csv")
df_test = pd.read_csv(DATA_DIR / "test_processed.csv")

# Quick sanity check on the row/column counts of each frame.
print("Train shape:", df_train.shape)
print("Test shape:", df_test.shape)

Train shape: (857, 32)
Test shape: (153, 32)


In [65]:
# Target column the classifiers are trained to predict.
y = df_train["Price_Encoded"]

# Columns used as model inputs, in the order they are fed to the estimators.
# NOTE(review): "Storage Size GB" and "storage_gb" look like they may carry
# the same information — confirm against the preprocessing step.
feature_cols = [
    "rating",
    "Core_Count",
    "Clock_Speed_GHz",
    "RAM Size GB",
    "Storage Size GB",
    "battery_capacity",
    "fast_charging_power",
    "Screen_Size",
    "Resolution_Width",
    "Resolution_Height",
    "Refresh_Rate",
    "primary_rear_camera_mp",
    "num_rear_cameras",
    "primary_front_camera_mp",
    "num_front_cameras",
    "storage_gb",
    "Performance_Tier_Encoded",
    "Processor_Brand_Encoded",
    "RAM_Tier_Encoded",
    "Notch_Type_Encoded",
    "4G_Encoded",
    "Dual_Sim_Encoded",
    "5G_Encoded",
    "Vo5G_Encoded",
    "NFC_Encoded",
    "IR_Blaster_Encoded",
    "memory_card_support_Encoded",
    "os_name_Encoded",
    "brand_encoded_label",
    "os_version_label",
]

# Training features; raises KeyError if any listed column is absent.
X = df_train[feature_cols]

# Test set features (same columns, fill missing with 0).
# reindex() creates any absent column filled with 0 without complaint, so
# report such columns explicitly instead of letting them pass unnoticed.
missing_in_test = [col for col in feature_cols if col not in df_test.columns]
if missing_in_test:
    print(
        f"WARNING: {len(missing_in_test)} feature column(s) missing from the "
        f"test set were filled with 0: {missing_in_test}"
    )
X_test_final = df_test.reindex(columns=feature_cols, fill_value=0)


In [66]:
# Hold out 20% of the labelled rows for validation. The split is stratified
# on the target and seeded so it is the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_valid, y_train, y_valid = split_parts

In [67]:
# ================== 5) Define models ==================
# Logistic regression and the SVC are each wrapped in a pipeline that
# standardizes the features first; the two tree ensembles take the features
# as they are.

log_clf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    (
        "model",
        LogisticRegression(solver="lbfgs", C=100, max_iter=5000, random_state=42),
    ),
])

rf_params = {
    "n_estimators": 50,
    "max_depth": 10,
    "random_state": 42,
    "n_jobs": -1,
}
rf_clf = RandomForestClassifier(**rf_params)

xgb_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 4,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "eval_metric": "logloss",
}
xgb_clf = XGBClassifier(**xgb_params)

svc_clf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", SVC(C=1.0, kernel="rbf", gamma="scale", class_weight="balanced")),
])

# Display name -> estimator; insertion order is the evaluation order.
models = dict([
    ("Logistic Regression", log_clf),
    ("Random Forest", rf_clf),
    ("XGBoost", xgb_clf),
    ("SVC", svc_clf),
])


In [68]:
# Fit each candidate on the training split and score it on the validation split.
banner = "=" * 60
for name, model in models.items():
    print(banner)
    print(name)

    # fit() returns the fitted estimator, so fitting and predicting are chained.
    y_pred = model.fit(X_train, y_train).predict(X_valid)

    # Headline metrics, shown as percentages.
    acc = accuracy_score(y_valid, y_pred)
    print(f"Accuracy: {acc*100:.2f}%")

    report = classification_report(
        y_valid,
        y_pred,
        digits=3,
        output_dict=True,
    )
    macro_f1 = report["macro avg"]["f1-score"]
    print(f"Macro F1: {macro_f1*100:.2f}%")

    # Full per-class text report; its values stay as fractions (0–1).
    # Remove this print if only the percentages are wanted.
    print(classification_report(y_valid, y_pred, digits=3))
    print()


Logistic Regression
Accuracy: 92.44%
Macro F1: 90.29%
              precision    recall  f1-score   support

           0      0.930     0.968     0.949       124
           1      0.907     0.812     0.857        48

    accuracy                          0.924       172
   macro avg      0.919     0.890     0.903       172
weighted avg      0.924     0.924     0.923       172


Random Forest
Accuracy: 94.77%
Macro F1: 93.18%
              precision    recall  f1-score   support

           0      0.939     0.992     0.965       124
           1      0.976     0.833     0.899        48

    accuracy                          0.948       172
   macro avg      0.957     0.913     0.932       172
weighted avg      0.949     0.948     0.946       172


XGBoost
Accuracy: 94.19%
Macro F1: 92.37%
              precision    recall  f1-score   support

           0      0.932     0.992     0.961       124
           1      0.975     0.812     0.886        48

    accuracy                        

In [69]:
# The random forest is picked by hand as the final model; change this
# assignment to use a different candidate based on the validation results.
best_model = rf_clf

# Refit on the training split, then predict the labels of the test features.
test_pred = best_model.fit(X_train, y_train).predict(X_test_final)

# Two-column submission: a row identifier and the predicted class.
# The identifier is the test frame's index; replace it with a real ID
# column if one exists.
submission_columns = {
    "id": df_test.index,
    "Price_Encoded": test_pred,
}
submission = pd.DataFrame(submission_columns)

submission.to_csv("submission_best_model.csv", index=False)
print("Saved submission_best_model.csv")


Saved submission_best_model.csv


In [70]:
# Persist the fitted final model to disk so it can be reloaded later with
# joblib.load without retraining. joblib is already imported in the
# notebook's imports cell, so it is not re-imported here.
joblib.dump(best_model, 'best_model.joblib')
print("Saved best_model.joblib")

Saved best_model.joblib
