In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
import pickle






# Read the dataset

In [6]:
# Load the laptop listings; the relative path assumes the CSV sits in the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='laptopprice.csv')
# Bare last expression so Jupyter renders the frame as a rich table.
df

Unnamed: 0,brand,processor_brand,processor_name,processor_gnrtn,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,msoffice,Price,rating,Number of Ratings,Number of Reviews
0,ASUS,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,34649,2 stars,3,0
1,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,38999,3 stars,65,5
2,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,39999,3 stars,8,1
3,ASUS,Intel,Core i5,10th,8 GB,DDR4,512 GB,0 GB,Windows,32-bit,2 GB,Casual,No warranty,No,No,69990,3 stars,0,0
4,ASUS,Intel,Celeron Dual,Not Available,4 GB,DDR4,0 GB,512 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,26990,3 stars,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
818,ASUS,AMD,Ryzen 9,Not Available,4 GB,DDR4,1024 GB,0 GB,Windows,64-bit,0 GB,Casual,1 year,No,No,135990,3 stars,0,0
819,ASUS,AMD,Ryzen 9,Not Available,4 GB,DDR4,1024 GB,0 GB,Windows,64-bit,0 GB,Casual,1 year,No,No,144990,3 stars,0,0
820,ASUS,AMD,Ryzen 9,Not Available,4 GB,DDR4,1024 GB,0 GB,Windows,64-bit,4 GB,Casual,1 year,No,No,149990,3 stars,0,0
821,ASUS,AMD,Ryzen 9,Not Available,4 GB,DDR4,1024 GB,0 GB,Windows,64-bit,4 GB,Casual,1 year,No,No,142990,3 stars,0,0


# Drop irrelevant columns and preprocess 'rating' column

In [7]:
# Turn the textual star rating (e.g. "3 stars") into an integer `rate` column,
# then drop the raw rating text and the two popularity counters.
#
# The guard keeps this cell safe to re-run: the previous in-place version
# raised KeyError on a second execution because 'rating' had already been
# dropped. Building a new frame with assign/drop also avoids mutating the
# object that the load cell displayed.
if 'rating' in df.columns:
    df = (
        df.assign(rate=df['rating'].map(lambda stars: int(stars.split(' ')[0])))
          .drop(columns=['Number of Ratings', 'rating', 'Number of Reviews'])
    )

# Separate features and target variable

In [8]:
# Features are every column except the target; the target is the 'Price' column.
X, y = df.drop(columns=['Price']), df['Price']

# Split the data into train and test sets

In [9]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Create a pipeline with OneHotEncoder for categorical features and XGBRegressor as the model

In [10]:
# Columns stored with dtype 'object' are treated as the categorical features.
categorical_features = X.select_dtypes(include=['object'])

# One-hot encode the categorical columns. handle_unknown='ignore' makes
# categories unseen during fit encode as all zeros instead of raising.
cp = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
])

# NOTE(review): ColumnTransformer defaults to remainder='drop', so any
# non-object column in X (e.g. the integer `rate` column built earlier) is
# not passed on to the regressor. Confirm this is intended.
pr = ColumnTransformer(
    transformers=[("categorical", cp, categorical_features.columns)],
)

# Full model: preprocessing followed by a default-configured XGBRegressor.
# The step name 'regressor' is what the grid-search parameter keys refer to.
pipe = Pipeline([
    ('preprocessor', pr),
    ('regressor', XGBRegressor()),
])

# Set up the parameter grid for GridSearchCV

In [11]:
# Hyper-parameter candidates for the pipeline's 'regressor' step. The
# `regressor__` prefix routes each setting to that step of the pipeline.
param_grid = dict(
    regressor__n_estimators=[50, 100, 200],    # number of boosted trees
    regressor__learning_rate=[0.01, 0.1, 0.2],  # boosting step size
    regressor__max_depth=[3, 4, 5],             # maximum depth of each tree
)

# Perform GridSearchCV

In [12]:
# Exhaustively try every combination in param_grid with 3-fold
# cross-validation on the training split.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=3,
)
# Left as the last expression so the fitted search object is displayed.
grid.fit(X_train, y_train)

# Evaluate the best model on the training and test sets

In [13]:
# Score the fitted search object on both splits; the labels printed below
# report these values as R^2.
train_score = grid.score(X_train, y_train)
test_score = grid.score(X_test, y_test)

# Print one "label value" line per item, in a fixed order.
report = {
    "Best model:": grid.best_estimator_,
    "Training R^2:": train_score,
    "Test R^2:": test_score,
}
for label, value in report.items():
    print(label, value)


Best model: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['brand', 'processor_brand', 'processor_name', 'processor_gnrtn',
       'ram_gb', 'ram_type', 'ssd', 'hdd', 'os', 'os_bit', 'graphic_card_gb',
       'weight', 'warranty', 'Touchscreen', 'msoffice'],
      dtype='object'))])),
                ('reg...
                              feature_types=None, gamma=None, gpu_id=None,
                              grow_policy=None, importance_type=None,
                              interaction_constraints=None, learning_rate=0.2,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                           

# Save the best model using pickle

In [14]:
# Persist the best pipeline (preprocessing + regressor) to disk.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# this file from a trusted location.
with open('best_model_xgb.pkl', mode='wb') as model_file:
    pickle.Pickler(model_file).dump(grid.best_estimator_)



# Example input: one laptop described by the categorical features used in training, in the original column order

In [15]:
# A single example laptop, keyed by feature column name so each value is
# visibly paired with the column it belongs to.
sample_laptop = {
    'brand': 'Lenovo',
    'processor_brand': 'Intel',
    'processor_name': 'Core i3',
    'processor_gnrtn': '10th',
    'ram_gb': '4 GB',
    'ram_type': 'DDR4',
    'ssd': '0 GB',
    'hdd': '1024 GB',
    'os': 'Windows',
    'os_bit': '64-bit',
    'graphic_card_gb': '0 GB',
    'weight': 'Casual',
    'warranty': 'No warranty',
    'Touchscreen': 'No',
    'msoffice': 'No',
}
# One-row frame; `columns` pins the column order to the dict's key order.
new_data = pd.DataFrame([sample_laptop], columns=list(sample_laptop))


# Make sure to pass the new_data through the pipeline's predict function

In [16]:
# Run the example row through the tuned pipeline (preprocessing + regressor).
predicted_price = grid.best_estimator_.predict(new_data)

# Show the first (here: only) prediction, rounded to two decimals.
print(f"Predicted Price: {predicted_price[0]:.2f}")


Predicted Price: 38462.24
