### Importing the Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

### Importing the Dataset

In [4]:
data = pd.read_csv("../datasets/practise/Car Price.csv")

### Encoding Categorical data

In [6]:
from sklearn.preprocessing  import OneHotEncoder

# Define the categorical columns to encode
categorical_columns_1 = ["drivewheel", "cylindernumber", "enginetype", "fuelsystem", "carbody", "CarName"]

# Initialize the OneHotEncoder
encoder_1 = OneHotEncoder(sparse_output=False)  # sparse_output=False for dense array output

# Apply OneHotEncoder only to the selected columns
encoded_features = encoder_1.fit_transform(data[categorical_columns_1])

# Convert the encoded features into a DataFrame with meaningful column names
encoded_df = pd.DataFrame(encoded_features,columns=encoder_1.get_feature_names_out(categorical_columns_1))

# Drop the original categorical columns
data = data.drop(columns=categorical_columns_1)

# Concatenate the one-hot encoded DataFrame with the rest of the data
data = pd.concat([data.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)


data

Unnamed: 0,car_ID,price,fueltype,aspiration,doornumber,enginelocation,symboling,carwidth,carheight,wheelbase,...,CarName_volkswagen type 3,CarName_volvo 144ea,CarName_volvo 145e (sw),CarName_volvo 244dl,CarName_volvo 245,CarName_volvo 246,CarName_volvo 264gl,CarName_volvo diesel,CarName_vw dasher,CarName_vw rabbit
0,1,13495.0,gas,std,two,front,3,64.1,48.8,88.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,16500.0,gas,std,two,front,3,64.1,48.8,88.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,16500.0,gas,std,two,front,1,65.5,52.4,94.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,13950.0,gas,std,four,front,2,66.2,54.3,99.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,17450.0,gas,std,four,front,2,66.4,54.3,99.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,16845.0,gas,std,four,front,-1,68.9,55.5,109.1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
201,202,19045.0,gas,turbo,four,front,-1,68.8,55.5,109.1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
202,203,21485.0,gas,std,four,front,-1,68.9,55.5,109.1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
203,204,22470.0,diesel,turbo,four,front,-1,68.9,55.5,109.1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [7]:
#Encoding binary columns
from sklearn.preprocessing import LabelEncoder

encoder_2 = LabelEncoder()
categorical_columns_2 = ["fueltype","aspiration", "doornumber", "enginelocation"]

for col in categorical_columns_2:
    data[col] = encoder_2.fit_transform(data[col])

### Splitting the dataset into train and test set

In [32]:
x = data.iloc[:, 2:-1].values
y = data.iloc[:, 1].values

from sklearn.model_selection import train_test_split

x_train, x_test,y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

### Training the Multiple Linear Regression model and Predicting the Training set

In [41]:
from sklearn.linear_model import LinearRegression

Mul_regressor = LinearRegression()
Mul_regressor.fit(x_train, y_train)

### Predicting the result

In [68]:
y_pred = Mul_regressor.predict(x_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[ 2.76e+03  6.80e+03]
 [ 8.41e+03  1.58e+04]
 [-1.54e+11  1.52e+04]
 [-1.94e+12  5.15e+03]
 [ 7.89e+03  1.00e+04]
 [ 6.95e+03  1.12e+04]
 [ 8.39e+03  5.39e+03]
 [ 3.13e+02  7.90e+03]
 [ 1.40e+04  1.72e+04]
 [-2.50e+10  6.53e+03]
 [ 5.33e+09  2.10e+04]
 [-2.93e+12  3.14e+04]
 [ 8.10e+03  1.09e+04]
 [ 2.45e+04  1.83e+04]
 [ 1.97e+09  8.92e+03]
 [ 7.72e+03  9.99e+03]
 [ 9.33e+03  9.30e+03]
 [-1.54e+11  1.89e+04]
 [ 9.35e+03  7.90e+03]
 [ 1.06e+04  6.49e+03]
 [ 1.01e+04  9.96e+03]
 [ 7.75e+03  1.56e+04]
 [ 1.24e+04  9.90e+03]
 [ 1.16e+04  1.15e+04]
 [ 1.74e+04  1.60e+04]
 [ 4.58e+03  5.12e+03]
 [ 8.46e+03  6.94e+03]
 [ 5.32e+03  1.67e+04]
 [ 5.22e+03  8.36e+03]
 [-1.04e+10  5.50e+03]
 [ 9.27e+03  7.98e+03]
 [ 9.13e+03  1.23e+04]
 [ 5.36e+03  2.20e+04]
 [ 1.09e+04  8.95e+03]
 [ 7.36e+03  6.85e+03]
 [-8.11e+09  4.13e+04]
 [-1.15e+11  1.16e+04]
 [ 1.60e+04  1.82e+04]
 [-5.62e+08  6.38e+03]
 [-3.90e+11  4.54e+04]
 [-2.02e+13  8.92e+03]]


In [70]:
y_pred = [f"{val:.0f}" for val in y_pred]
comparison_df = pd.DataFrame({
    "Original Pice": y_test,
    "Predicted Price": y_pred
})

comparison_df

Unnamed: 0,Original Pice,Predicted Price
0,6795.0,2757
1,15750.0,8414
2,15250.0,-154306460877
3,5151.0,-1935537985495
4,9995.0,7892
5,11199.0,6948
6,5389.0,8390
7,7898.0,313
8,17199.0,14009
9,6529.0,-25026115277
