In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [29]:
from sklearn.base import BaseEstimator, TransformerMixin
import re

In [35]:
class EngineExtractor(BaseEstimator, TransformerMixin):
    """Convert the free-text 'Engine' column (e.g. '1.4L') to a float.

    Stateless transformer: ``fit`` learns nothing. ``transform`` returns a
    copy of the input frame with 'Engine' replaced by the first number found
    in each value. Missing values and values containing no digits become NaN
    instead of raising, and plain decimal numbers such as 1.4 keep their value.
    """

    def fit(self, X, y=None):
        """No-op fit; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose 'Engine' column is float-valued.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing an 'Engine' column.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X``; the caller's frame is not modified.
        """
        X = X.copy()
        # Casting to str first sends NaN and numeric cells down the same regex
        # path as text: 'nan' contains no digits, so it extracts as missing
        # rather than raising TypeError/IndexError.
        first_number = X['Engine'].astype(str).str.extract(
            r'(\d+\.\d+|\d+)', expand=False
        )
        # astype(float) keeps the dtype float even when every value is integral.
        X['Engine'] = pd.to_numeric(first_number, errors='coerce').astype(float)
        return X

In [3]:
# Load the car listings; the path is relative to the notebook's working directory.
data = pd.read_csv('used_cars_UK.csv')

In [5]:
# Preview the first ten rows.
data.head(10)

Unnamed: 0.1,Unnamed: 0,title,Price,Mileage(miles),Registration_Year,Previous Owners,Fuel type,Body type,Engine,Gearbox,Doors,Seats,Emission Class,Service history
0,0,SKODA Fabia,6900,70189,2016,3.0,Diesel,Hatchback,1.4L,Manual,5.0,5.0,Euro 6,
1,1,Vauxhall Corsa,1495,88585,2008,4.0,Petrol,Hatchback,1.2L,Manual,3.0,5.0,Euro 4,Full
2,2,Hyundai i30,949,137000,2011,,Petrol,Hatchback,1.4L,Manual,5.0,5.0,Euro 5,
3,3,MINI Hatch,2395,96731,2010,5.0,Petrol,Hatchback,1.4L,Manual,3.0,4.0,Euro 4,Full
4,4,Vauxhall Corsa,1000,85000,2013,,Diesel,Hatchback,1.3L,Manual,5.0,5.0,Euro 5,
5,5,Hyundai Coupe,800,124196,2007,3.0,Petrol,Coupe,2.0L,Manual,3.0,4.0,Euro 4,
6,6,Ford Focus,798,140599,2008,,Petrol,Hatchback,1.6L,Manual,5.0,5.0,Euro 4,
7,7,Vauxhall Corsa,1995,90000,2009,,Petrol,Hatchback,1.2L,Manual,3.0,5.0,Euro 4,
8,8,Volvo 740,750,225318,1989,,Petrol,Estate,2.3L,Automatic,5.0,,,
9,9,Peugeot 207,1299,87000,2008,5.0,Diesel,Hatchback,1.6L,Manual,5.0,5.0,Euro 4,


In [11]:
# Count missing values per column.
data.isna().sum()

Unnamed: 0              0
title                   0
Price                   0
Mileage(miles)          0
Registration_Year       0
Previous Owners      1409
Fuel type               0
Body type               0
Engine                 45
Gearbox                 0
Doors                  25
Seats                  35
Emission Class         87
Service history      3145
dtype: int64

In [13]:
# Two numeric predictors and the target, taken from every row of `data`
# and converted to plain NumPy arrays.
x_train = data.loc[:, ['Mileage(miles)', 'Registration_Year']].to_numpy()
y_train = data.loc[:, 'Price'].to_numpy()
x_train

array([[ 70189,   2016],
       [ 88585,   2008],
       [137000,   2011],
       ...,
       [139000,   2013],
       [179190,   2007],
       [ 82160,   2013]], dtype=int64)

In [15]:
# Column groups consumed by the preprocessing step.
numerical_features = [
    'Mileage(miles)',
    'Registration_Year',
    'Previous Owners',
    # NOTE(review): the data preview shows 'Engine' as strings such as '1.4L';
    # presumably it must be converted to a number before scaling - confirm.
    'Engine',
    'Doors',
    'Seats',
]
categorical_features = [
    'Fuel type',
    'Body type',
    'Gearbox',
    'Emission Class',
    'Service history',
    'title',
]

In [17]:
# Numeric branch: impute, then standardise.
# Several numeric columns contain missing values (Previous Owners, Engine,
# Doors, Seats in the isna() summary). StandardScaler would pass those NaNs
# straight through to the regressor, so fill them first. The per-column median
# is a modelling choice that is robust to the skew in counts such as owners.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])


In [19]:
# Categorical branch: one-hot encode, with unknown categories ignored
# (handle_unknown='ignore') instead of raising.
categorical_pipeline = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Route each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])


In [21]:
# End-to-end estimator: the preprocessing step followed by a random forest.
# random_state is pinned so repeated fits are reproducible.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42, n_estimators=100)),
    ]
)


In [23]:
# Separate the target column from the predictors.
y = data['Price']
X = data.drop(columns=['Price'])

In [25]:
# Hold out 20% of the rows for testing; random_state pins the shuffle.
# NOTE: this rebinds `y_train`, which an earlier cell set to the full Price
# array that pairs with `x_train`.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [41]:

from sklearn import linear_model 

In [43]:
# Linear regression estimator; fitted further down on `x_train`
# (mileage and registration year).
lr = linear_model.LinearRegression()

In [47]:
# Working table of the target and the two numeric predictors.
# Take an explicit copy: columns are added to df_2 later, and assigning into
# a bare slice of `data` triggers pandas' SettingWithCopyWarning.
df_2 = data[['Price', 'Mileage(miles)', 'Registration_Year']].copy()
df_2

Unnamed: 0,Price,Mileage(miles),Registration_Year
0,6900,70189,2016
1,1495,88585,2008
2,949,137000,2011
3,2395,96731,2010
4,1000,85000,2013
...,...,...,...
3680,1395,76202,2006
3681,6990,119000,2012
3682,3995,139000,2013
3683,1390,179190,2007


In [49]:
# Correlation of each column in df_2 with Price.
df_2.corr()['Price']

Price                1.000000
Mileage(miles)      -0.500158
Registration_Year    0.723880
Name: Price, dtype: float64

In [51]:
# Fit on all rows. The target is read straight from `data` rather than from
# `y_train`: the train/test split cell rebinds `y_train` to the 80% training
# subset, which no longer lines up row-for-row with `x_train` and makes
# fit() raise on inconsistent sample counts.
lr.fit(x_train, data['Price'].to_numpy())

In [53]:
# Fitted coefficients, in the column order of x_train
# (mileage, then registration year).
lr.coef_

array([-2.72824039e-02,  5.48850516e+02])

In [55]:
# In-sample predictions: the model is applied to the same `x_train` rows it
# was fitted on.
y_prediction = lr.predict(x_train)
y_prediction

array([8377.00759271, 3484.31636234, 3809.99032445, ..., 4853.12654857,
        463.5436391 , 6403.85838743])

In [57]:
# Store the predictions next to the actual prices for side-by-side comparison.
df_2['price prediction'] = y_prediction

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2['price prediction'] = y_prediction


In [59]:
# Show actual and predicted prices together.
df_2

Unnamed: 0,Price,Mileage(miles),Registration_Year,price prediction
0,6900,70189,2016,8377.007593
1,1495,88585,2008,3484.316362
2,949,137000,2011,3809.990324
3,2395,96731,2010,4359.774932
4,1000,85000,2013,6326.376360
...,...,...,...,...
3680,1395,76202,2006,2724.453338
3681,6990,119000,2012,4849.924111
3682,3995,139000,2013,4853.126549
3683,1390,179190,2007,463.543639


In [61]:
from sklearn.metrics import r2_score

In [63]:
# In-sample R^2 of the two-feature linear model. Score against the Price
# column of `data` (one value per prediction) instead of `y_train`, which the
# train/test split cell rebinds to the 80% training subset and which would
# therefore mismatch `y_prediction` in length.
r2 = r2_score(data['Price'].to_numpy(), y_prediction)
r2

0.5715940315595132