# Importing Libraries

In [1]:
import pandas as pd

In [2]:
import sklearn
print(sklearn.__version__)

1.2.2


## Importing Datasets

In [3]:
import os

# Get the absolute path of the current working directory
current_directory = os.getcwd()

# Construct the relative path to your data folder and the Excel file
relative_path = 'data/laptop_data.xlsx' ## NOTE HERE I TRIED WITH OUR ALREADY CLEANED DATA BUT REALISTICALLY, WE REMOVE OUTLIERS FROM OUR DATA WHEN THEY ARE NOT YET CLEANED UP

# Combine the current directory with the relative path to get the absolute path to your Excel file
file_path = os.path.join(current_directory, relative_path)

# Read the Excel file using the constructed file path
dataset = pd.read_excel(file_path)

In [4]:
dataset.shape

(254, 16)

## Select the important columns necessary for training/building our model

In [5]:
dataset = dataset[['BRAND', 'CPU BRAND', 'CPU CORE', 'CPU GENERATION', 'CPU FAMILY', 'RAM SIZE', 'RAM(DDR) TYPE', 'DISK TYPE', 'SSD SIZE',
                   'HDD SIZE', 'GPU BRAND', 'GPU TYPE', 'SCREEN SIZE', 'SCREEN RESOLUTION', 'STATE', 'PRICE']]

## Convert the DataType of categorical datas

In [6]:
## CATEGORICAL
dataset["BRAND"] = dataset["BRAND"].astype("str")
dataset["CPU BRAND"] = dataset["CPU BRAND"].astype("str")
dataset["CPU CORE"] = dataset["CPU CORE"].astype("str")
dataset["CPU FAMILY"] = dataset["CPU FAMILY"].astype("str")
dataset["DISK TYPE"] = dataset["DISK TYPE"].astype("str")
dataset["GPU BRAND"] = dataset["GPU BRAND"].astype("str")
dataset["GPU TYPE"] = dataset["GPU TYPE"].astype("str")
dataset["SCREEN RESOLUTION"] = dataset["SCREEN RESOLUTION"].astype("str")
dataset["STATE"] = dataset["STATE"].astype("str")

In [7]:
type(dataset)

pandas.core.frame.DataFrame

In [8]:
dataset.values

array([['ACER', 'Intel', 'Core i3', ..., 'HD', 'USED', 79000],
       ['ACER', 'Intel', 'Core i3', ..., 'HD', 'USED', 79000],
       ['ACER', 'Intel', 'Core i3', ..., 'FHD', 'USED', 88000],
       ...,
       ['LENOVO', 'AMD', 'Ryzen 7', ..., 'FHD', 'USED', 112000],
       ['LENOVO', 'AMD', 'Ryzen 7', ..., 'FHD', 'USED', 135000],
       ['LENOVO', 'AMD', 'Ryzen 7', ..., 'FHD', 'USED', 135000]],
      dtype=object)

## Selecting the X and y columns

- **X**: represent our FEATURES that our model will learn from

- **y**: represent the Label column our model will train with

In [9]:
X = dataset.drop('PRICE', axis = 'columns')

In [10]:
X.shape

(254, 15)

In [11]:
y = dataset.PRICE

In [12]:
y = y.values.reshape((254,1))
y.shape

(254, 1)

## Splitting data into Train and Test sets

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [15]:
categorical_data = dataset[['BRAND', 'CPU BRAND', 'CPU CORE', 'CPU FAMILY', 'DISK TYPE',
               'GPU BRAND', 'GPU TYPE', 'SCREEN RESOLUTION', 'STATE']]

In [16]:
categorical_columns = ['BRAND', 'CPU BRAND', 'CPU CORE', 'CPU FAMILY', 'DISK TYPE',
               'GPU BRAND', 'GPU TYPE', 'SCREEN RESOLUTION', 'STATE']

In [17]:
numerical_data = dataset[['CPU GENERATION', 'RAM SIZE', 'RAM(DDR) TYPE', 'SSD SIZE', 'HDD SIZE', 'SCREEN SIZE']]

In [18]:
numerical_columns = ['CPU GENERATION', 'RAM SIZE', 'RAM(DDR) TYPE', 'SSD SIZE', 'HDD SIZE', 'SCREEN SIZE']

## Importing Libraries

In [19]:
from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import MinMaxScaler

from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

## Normalization and Standardization

In [20]:
num_pipeline = Pipeline([
    ("MinMax_Scaler", MinMaxScaler()),
    ('Standard_Scaler', StandardScaler())
])

## ColumnTransformer and OneHotEncoding

In [21]:
column_trans = ColumnTransformer(transformers = [
    ("numerical", num_pipeline, numerical_columns),
    ("categorical", OneHotEncoder(sparse = False, handle_unknown='ignore'), categorical_columns),
], remainder = 'passthrough')

## Model Building

In [22]:
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score, mean_squared_error

In [23]:
from time import time

from sklearn.neighbors import KNeighborsRegressor

from sklearn.neighbors import RadiusNeighborsRegressor

In [24]:
regressors = [
    KNeighborsRegressor(),
    RadiusNeighborsRegressor()
]

In [25]:
# head = 10
for model in regressors:    # [:head]:
    start = time()
    
    
    pipe = Pipeline([
    ('step1', column_trans),
    ('step2', model)
    ])
    
    
    pipe.fit(X_train, y_train)


    train_time = time() - start
    start = time()
    predict_time = time()-start
    
    y_pred = pipe.predict(X_test)
    
    print(model)
    print("\t Training Time: %0.3fs" % train_time)
    print("\t Prediction Time: %0.3fs" % predict_time)
    print("\t Explained Variance Score:", explained_variance_score(y_test, y_pred))
    print("\t Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
    print("\t Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("\t Root Mean Squared Error:", mean_squared_error(y_test, y_pred, squared = False))
    print("\t R2 Score:", r2_score(y_test, y_pred))
    print()

KNeighborsRegressor()
	 Training Time: 0.009s
	 Prediction Time: 0.000s
	 Explained Variance Score: 0.7373424646119409
	 Mean Absolute Error: 16624.313725490196
	 Mean Squared Error: 463333552.9411765
	 Root Mean Squared Error: 21525.184155801697
	 R2 Score: 0.7313496742271093

RadiusNeighborsRegressor()
	 Training Time: 0.004s
	 Prediction Time: 0.000s
	 Explained Variance Score: -2.8897990702139094
	 Mean Absolute Error: 73293.46405228759
	 Mean Squared Error: 11282893267.973858
	 Root Mean Squared Error: 106220.96435249427
	 R2 Score: -5.542053630393479


  multiarray.copyto(res, fill_value, casting='unsafe')
