In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Read the training set from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="train.csv")
df.head(n=5)


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [3]:
# Split the frame into the label (price_range) and the predictors (every other column).
y = df["price_range"]
X = df.drop(columns=["price_range"])


In [4]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class mix
# the same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [5]:
# Baseline: a random forest with only the seed set (everything else left at the
# library defaults), trained on the training split and scored on the held-out split.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = rf.predict(X_test)

baseline_accuracy = accuracy_score(y_test, y_pred)
print("Baseline Accuracy:", baseline_accuracy)


Baseline Accuracy: 0.88


In [6]:
# Candidate forest sizes to compare.
param_grid = {"n_estimators": [50, 100, 150, 200, 300]}

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy.
# Only the training split is used here, so the held-out split stays untouched.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,  # run the fits in parallel on all available cores
)

grid_search.fit(X_train, y_train)


In [7]:
# Report the winning grid point and its mean cross-validated accuracy.
best_params = grid_search.best_params_
best_cv_accuracy = grid_search.best_score_

print("Best n_estimators:", best_params)
print("Best CV Accuracy:", best_cv_accuracy)


Best n_estimators: {'n_estimators': 100}
Best CV Accuracy: 0.86875


In [8]:
# GridSearchCV was built with its default refit=True, so after the search it has
# already retrained the winning configuration on the whole training split.
# best_estimator_ is therefore ready to use; calling fit() on it again with the
# same data and seed would only repeat that training.
best_rf = grid_search.best_estimator_

# Evaluate the tuned model on the held-out split.
y_pred_best = best_rf.predict(X_test)

print("Final Accuracy:", accuracy_score(y_test, y_pred_best))
print(classification_report(y_test, y_pred_best))


Final Accuracy: 0.88
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       100
           1       0.82      0.84      0.83       100
           2       0.81      0.79      0.80       100
           3       0.93      0.93      0.93       100

    accuracy                           0.88       400
   macro avg       0.88      0.88      0.88       400
weighted avg       0.88      0.88      0.88       400



In [9]:
# Pair each predictor name with the tuned forest's importance score and rank
# them from most to least important.
importance = (
    pd.DataFrame(
        {"Feature": X.columns, "Importance": best_rf.feature_importances_}
    )
    .sort_values("Importance", ascending=False)
)

# Show the ten highest-ranked features.
importance.head(n=10)


Unnamed: 0,Feature,Importance
13,ram,0.480768
0,battery_power,0.072976
12,px_width,0.056089
11,px_height,0.056
8,mobile_wt,0.039007
6,int_memory,0.034837
16,talk_time,0.031891
10,pc,0.029158
2,clock_speed,0.028857
15,sc_w,0.027847


In [11]:
# Load the separate test file and drop its 'id' column before predicting.
test_df = pd.read_csv("test.csv").drop(columns="id")

# Predict with the tuned forest and preview the first ten predictions.
test_predictions = best_rf.predict(test_df)
test_predictions[:10]

array([3, 3, 2, 3, 1, 3, 3, 1, 3, 0])