## Importing all neccessary files and loading the dataset

In [20]:
# Importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re # import regex
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# Loading the dataset
dataset = pd.read_csv('Sport car price.csv')

print(dataset.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1007 entries, 0 to 1006
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Car Make                 1007 non-null   object
 1   Car Model                1007 non-null   object
 2   Year                     1007 non-null   int64 
 3   Engine Size (L)          997 non-null    object
 4   Horsepower               1007 non-null   object
 5   Torque (lb-ft)           1004 non-null   object
 6   0-60 MPH Time (seconds)  1007 non-null   object
 7   Price (in USD)           1007 non-null   object
dtypes: int64(1), object(7)
memory usage: 63.1+ KB
None


## Preprocessing the data

1. dataset.iloc[:, :-1].values selects all the columns of the dataset except for the last column (which is assumed to be the dependent variable). The iloc method is used to select rows and columns by integer position. [:, :-1] means to select all rows and all columns except for the last one.

2. .values is used to convert the resulting DataFrame object into a NumPy array

3. dataset.iloc[:, -1].values selects only the last column of the dataset, which is assumed to be the dependent variable.


## Observations :-

only columns engine size and torque have NaN values, repair needed, 2 options :-  
1. drop those rows 
2. imputing the missing values with the mean, median, or mode of the respective columns

### Note :-

Whether to drop rows with missing values or impute them depends on the amount of missing data, the percentage of missing data, the reasons for missing data, and the impact of missing data on the analysis.In the case of predicting the price of cars, it is generally not recommended to drop rows with missing values, as this can significantly reduce the size of the dataset and may introduce bias into the analysis. Imputing the missing values with the mean or median of the respective columns is a common approach and can help to preserve the sample size and reduce the bias in the analysis. However, it is important to carefully consider the potential impact of imputed values on the accuracy and interpretability of the analysis.

## My work

I have used option 2, computing mean of values of particular column and replacing it with the NaN, only work on X set of data not Y.  

1. Torque column has all integer related values or NaN hence no problem to update.
2. Engine Size needs more rigourus work since it has both string and double values.

## remove repetitive data

In [21]:
# Remove duplicate rows based on all columns
print("Before duplicates were removed, no of rows = ", len(dataset), "\n")
dataset = dataset.drop_duplicates()
print(dataset)
print("\nAfter duplicates were removed, no of rows = ", len(dataset))

Before duplicates were removed, no of rows =  1007 

         Car Make Car Model  Year Engine Size (L) Horsepower Torque (lb-ft)  \
0         Porsche       911  2022               3        379            331   
1     Lamborghini   Huracan  2021             5.2        630            443   
2         Ferrari   488 GTB  2022             3.9        661            561   
3            Audi        R8  2022             5.2        562            406   
4         McLaren      720S  2021               4        710            568   
...           ...       ...   ...             ...        ...            ...   
999        Nissan      370Z  2021             3.7        332            270   
1002   Koenigsegg     Jesko  2022               5       1280           1106   
1003        Lotus     Evija  2021  Electric Motor       1972           1254   
1005       Pagani    Huayra  2021               6        764            738   
1006        Rimac    Nevera  2021  Electric Motor       1888           1696   

## check for NaN data

In [22]:
#counting number of missing values
missing_values = pd.DataFrame(dataset).isnull().sum().sum()
print("\n\nNumber of missing values in dataset:", missing_values)
# Display all rows with missing values NaN
missing_rows = dataset[dataset.isnull().any(axis=1)]
print("Rows with missing values:\n", missing_rows)

# Splitting the attributes into independent and dependent attributes
X = dataset.iloc[:, :-1].values # all are independent attributes except last
Y = dataset.iloc[:, -1].values # dependent attribute is the price i.e. the last column


print(X,Y)



Number of missing values in dataset: 13
Rows with missing values:
      Car Make       Car Model  Year Engine Size (L) Horsepower Torque (lb-ft)  \
168     Rimac           C_Two  2022             NaN       1914           1696   
171     Tesla   Model S Plaid  2021             NaN       1020           1050   
222   Porsche  Taycan Turbo S  2021             NaN        750            774   
247     Tesla   Model S Plaid  2022             NaN       1020           1050   
387     Rimac           C_Two  2022             NaN       1888           1696   
389     Tesla        Roadster  2022             NaN     10000+              0   
642     Tesla   Model S Plaid  2021        Electric       1020            NaN   
686     Rimac           C_Two  2022             NaN       1914           1696   
697     Lotus           Evija  2022             NaN       1972           1254   
752   Porsche          Taycan  2022             NaN        469            479   
878  Maserati     GranTurismo  2021     

## remove all 13 NaN available rows

In [23]:
dataset=dataset.dropna()
dataset

Unnamed: 0,Car Make,Car Model,Year,Engine Size (L),Horsepower,Torque (lb-ft),0-60 MPH Time (seconds),Price (in USD)
0,Porsche,911,2022,3,379,331,4,101200
1,Lamborghini,Huracan,2021,5.2,630,443,2.8,274390
2,Ferrari,488 GTB,2022,3.9,661,561,3,333750
3,Audi,R8,2022,5.2,562,406,3.2,142700
4,McLaren,720S,2021,4,710,568,2.7,298000
...,...,...,...,...,...,...,...,...
999,Nissan,370Z,2021,3.7,332,270,5.1,30090
1002,Koenigsegg,Jesko,2022,5,1280,1106,2.5,3000000
1003,Lotus,Evija,2021,Electric Motor,1972,1254,2,2000000
1005,Pagani,Huayra,2021,6,764,738,3,2600000


## repair torque

In [24]:
# Replace blank values with NaN
dataset['Torque (lb-ft)'].replace('', np.nan, inplace=True)

# Replace hyphen values with NaN
dataset['Torque (lb-ft)'].replace('-', np.nan, inplace=True)

# Replace values like '10,000+' with 10000
regex = r'(\d{1,3}(,\d{3})?)(\+)?' # match digits, optionally separated by commas, followed by a plus sign
dataset['Torque (lb-ft)'] = dataset['Torque (lb-ft)'].apply(lambda x: int(re.sub(',', '', re.sub('\+', '', re.sub(regex, r'\1', x)))) if isinstance(x, str) else x)

# Compute the mean of the "Torque (lb-ft)" column
torque_mean = dataset['Torque (lb-ft)'].mean()

# Replace NaN values with the mean of the column
dataset['Torque (lb-ft)'].fillna(torque_mean, inplace=True)


print(dataset['Torque (lb-ft)'])


0        331.0
1        443.0
2        561.0
3        406.0
4        568.0
         ...  
999      270.0
1002    1106.0
1003    1254.0
1005     738.0
1006    1696.0
Name: Torque (lb-ft), Length: 707, dtype: float64


## repair mph time

In [25]:
# Replace values like '< 1.9' with just 1.9
regex = r'< (\d+\.\d+)' # match '< ' followed by a number with decimal point
dataset['0-60 MPH Time (seconds)'] = dataset['0-60 MPH Time (seconds)'].apply(lambda x: float(re.sub(regex, r'\1', x)) if isinstance(x, str) else x)

print(dataset['0-60 MPH Time (seconds)'])

0       4.00
1       2.80
2       3.00
3       3.20
4       2.70
        ... 
999     5.10
1002    2.50
1003    2.00
1005    3.00
1006    1.85
Name: 0-60 MPH Time (seconds), Length: 707, dtype: float64


## repair Horsepower

In [26]:
# update values containing , and + 
# Replace values like '10,000+' with 10000
regex = r'(\d{1,3}(,\d{3})?)(\+)?' # match digits, optionally separated by commas, followed by a plus sign
dataset['Horsepower'] = dataset['Horsepower'].apply(lambda x: int(re.sub(',', '', re.sub('\+', '', re.sub(regex, r'\1', x)))) if isinstance(x, str) else x)


print(dataset['Horsepower'])

0        379
1        630
2        661
3        562
4        710
        ... 
999      332
1002    1280
1003    1972
1005     764
1006    1888
Name: Horsepower, Length: 707, dtype: int64


## repair Engine size

In [27]:
for i, row in dataset.iterrows():
    value = row['Engine Size (L)']

    if isinstance(value, float) or isinstance(value, int):
        dataset.at[i, 'Engine Size (L)'] = float(value)
        continue
    elif value == 'Hybrid':
        # extract the numerical value from the string, if present
        num_val = re.findall(r'\d+\.\d+|\d+', value)
        if num_val:
            dataset.at[i, 'Engine Size (L)'] = float(num_val[0])
        else:
            dataset.at[i, 'Engine Size (L)'] = 0.0
    elif '-' in value or 'Electric' in value or pd.isna(value):
        dataset.at[i, 'Engine Size (L)'] = 0.0
    else:
        # remove non-numeric characters before converting to float
        value = re.sub('[^0-9\.]', '', value)
        dataset.at[i, 'Engine Size (L)'] = float(value)

# Convert all NaN values in the "Engine Size (L)" column to 0.0
dataset['Engine Size (L)'] = dataset['Engine Size (L)'].fillna(0.0)

print(dataset['Engine Size (L)'])


0       3.0
1       5.2
2       3.9
3       5.2
4       4.0
       ... 
999     3.7
1002    5.0
1003    0.0
1005    6.0
1006    0.0
Name: Engine Size (L), Length: 707, dtype: float64


## covert price column to appropriate type by removing commas

In [28]:
regex = r'(\d{1,3}(,\d{3})?)(\+)?' # match digits, optionally separated by commas, followed by a plus sign
dataset['Price (in USD)'] = dataset['Price (in USD)'].apply(lambda x: int(re.sub(',', '', re.sub('\+', '', re.sub(regex, r'\1', x)))) if isinstance(x, str) else x)

print(dataset['Price (in USD)'])

0        101200
1        274390
2        333750
3        142700
4        298000
         ...   
999       30090
1002    3000000
1003    2000000
1005    2600000
1006    2400000
Name: Price (in USD), Length: 707, dtype: int64


## Car make and model update

In the case of columns 'car maker' and 'car model' with repeated string values, it is ideal to encode them into numerical values using techniques such as Label Encoding or One-Hot Encoding.

Label Encoding involves assigning a numerical label to each unique category in the column, and One-Hot Encoding creates a new binary column for each unique category in the column.

Using One-Hot Encoding, the same column could be transformed into multiple binary columns.

Google to know more...


In [29]:
print("Distinct car makers : ",len(set(dataset['Car Make'])))
print("Distinct car models : ",len(set(dataset['Car Model'])))

Distinct car makers :  38
Distinct car models :  176


## Label encoding used
This will replace the string values in the Car Make and Car Model columns with their corresponding integer codes. The advantage of using label encoding is that it preserves the order of the original values, and can be easily reversed if needed.


In [30]:

# Label encode Car Make column
le_make = LabelEncoder()
dataset['Car Make'] = le_make.fit_transform(dataset['Car Make'])

# Label encode Car Model column
le_model = LabelEncoder()
dataset['Car Model'] = le_model.fit_transform(dataset['Car Model'])

print(dataset['Car Make'], dataset['Car Model'],sep="\n\n")

0       28
1       16
2       11
3        5
4       21
        ..
999     24
1002    15
1003    18
1005    25
1006    29
Name: Car Make, Length: 707, dtype: int32

0        18
1        98
2         5
3       130
4        15
       ... 
999       1
1002     99
1003     76
1005     95
1006    122
Name: Car Model, Length: 707, dtype: int32


## final check of all datatypes of columns in dataset

In [31]:

print(dataset.dtypes)

missing_values = pd.DataFrame(dataset).isnull().sum().sum()
print("\n\nNumber of missing values in dataset:", missing_values)
# Display all rows with missing values NaN
missing_rows = dataset[dataset.isnull().any(axis=1)]
print("Rows with missing values:\n", missing_rows)

Car Make                     int32
Car Model                    int32
Year                         int64
Engine Size (L)            float64
Horsepower                   int64
Torque (lb-ft)             float64
0-60 MPH Time (seconds)    float64
Price (in USD)               int64
dtype: object


Number of missing values in dataset: 0
Rows with missing values:
 Empty DataFrame
Columns: [Car Make, Car Model, Year, Engine Size (L), Horsepower, Torque (lb-ft), 0-60 MPH Time (seconds), Price (in USD)]
Index: []


## Train test split

To train the data, you first need to split your dataset into training and testing sets. The training set is used to train the model, while the testing set is used to evaluate its performance on unseen data. You can use the train_test_split() function from scikit-learn library to split your dataset into training and testing sets.

This will split your data into training and testing sets, where test_size specifies the proportion of the data to be used for testing (in this case, 20% of the data), and random_state is used to ensure that the split is reproducible.

X_train and y_train will contain the training data, which will be used to fit the model, while X_test and y_test will contain the testing data, which will be used to evaluate the performance of the model.

During training, the algorithm will use X_train and y_train to learn the patterns in the data and fit the model. Once the model is trained, X_test will be used to make predictions, and y_test will be used to evaluate the performance of the model.

The amount of data used for training and testing depends on the test_size parameter.generally, it's recommended to use a test size of around 20-30% to ensure that the model is well-evaluated on unseen data.

In [32]:
# Splitting the attributes into independent and dependent attributes
# X = dataset.iloc[:, :-1].values # all are independent attributes except last
X = dataset.iloc[:, [0, 1, 4, 6]].values # avoidng engine size
Y = dataset.iloc[:, -1].values # dependent attribute is the price i.e. the last column

# split data into training and testing sets 80 - 20 size
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

print(X_train, y_train, X_test, y_test, sep="\n\n")



[[  9.   52.  650.    3.5]
 [  7.   69.  650.    3.5]
 [ 13.   79.  296.    5.4]
 ...
 [ 21.   86.  612.    3.1]
 [ 23.   31.  577.    3.5]
 [  5.  146.  591.    3.5]]

[  69000  300000   61600  221400   99990   62000  208000   61500  518000
   92950   75400   38170  346300 2600000   84595  215500   75000  146000
   81250 2750000  220300  225000  517770   59900  500000  157500  126190
  417650   52915  330000  417826   45000 2600000   67000  162000   68000
  417826  500000   96950   85000  132400   64995  210000  222630  100000
  276550  573966  103500   64695 2600000 3000000   45790   69900  193000
  123500  207825  211000   71800  103200  218750   71800   61000  228000
 1050000   51000  100200  103200  507000   76450  218009 2800000 3000000
  500000  625000   63700   67000   60695  202500 2400000  245000   80190
   92950  307820 2800000  118500  104200   30090  417826  141990  340000
  517000  155000   47820  256500 3000000   96950   76950  105000   71000
  213195  120000  500000   6

## Training  a model based on Linear regression



In [33]:
# Train a Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = lr_model.predict(X_test)

# Evaluate the model using metrics
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('R-squared score:', r2_score(y_test, y_pred))

Mean Squared Error: 340027449763.6413
Mean Absolute Error: 363378.49508805154
R-squared score: 0.41143792848950367


## Training model based on RandomForestRegression

In [34]:
# fit the model
model = RandomForestRegressor(random_state=1)
model.fit(X_train, y_train)
# make predictions
yhat = model.predict(X_test)
# evaluate predictions
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, yhat)
print('Mean Absolute Error : %.3f' % mae)
print('R-squared score:', r2_score(y_test, y_pred))

Mean Squared Error: 340027449763.6413
Mean Absolute Error : 75868.580
R-squared score: 0.41143792848950367


## Use XGB Regressor

In [35]:
# import xgboost
import xgboost as xgb

# fit the model
model = xgb.XGBRegressor(random_state=1)
model.fit(X_train, y_train)

# make predictions
y_pred3 = model.predict(X_test)
threshold = 0.5
y_pred3_binary = (y_pred3 >= threshold).astype(int)

# evaluate predictions
print('Mean Squared Error:', mean_squared_error(y_test, y_pred3_binary))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred3_binary))
print('R-squared score:', r2_score(y_test, y_pred3_binary))


Mean Squared Error: 707662995907.6478
Mean Absolute Error: 360468.1267605634
R-squared score: -0.22491169195971583
