In [22]:
# Libraries
import pandas as pd
import os
import numpy as np
import xgboost as xg
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
import sklearn
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

# Load the data

In [23]:
# Read the Ford car dataset.
# The location can be overridden through the FORD_CSV_PATH environment variable
# so the notebook is not tied to one machine; when the variable is unset the
# original local path is used.
data_path = os.environ.get(
    "FORD_CSV_PATH",
    r"C:\Users\krish\Desktop\sample-project\data-1\ford.csv",
)
df = pd.read_csv(data_path)
df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Fiesta,2017,12000,Automatic,15944,Petrol,150,57.7,1.0
1,Focus,2018,14000,Manual,9083,Petrol,150,57.7,1.0
2,Focus,2017,13000,Manual,12456,Petrol,150,57.7,1.0
3,Fiesta,2019,17500,Manual,10460,Petrol,145,40.3,1.5
4,Fiesta,2019,16500,Automatic,1482,Petrol,145,48.7,1.0
...,...,...,...,...,...,...,...,...,...
17961,B-MAX,2017,8999,Manual,16700,Petrol,150,47.1,1.4
17962,B-MAX,2014,7499,Manual,40700,Petrol,30,57.7,1.0
17963,Focus,2015,9999,Manual,7010,Diesel,20,67.3,1.6
17964,KA,2018,8299,Manual,5007,Petrol,145,57.7,1.2


In [24]:
# Count the missing values in each column
df.isna().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [25]:
# Summarize the column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17966 entries, 0 to 17965
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         17966 non-null  object 
 1   year          17966 non-null  int64  
 2   price         17966 non-null  int64  
 3   transmission  17966 non-null  object 
 4   mileage       17966 non-null  int64  
 5   fuelType      17966 non-null  object 
 6   tax           17966 non-null  int64  
 7   mpg           17966 non-null  float64
 8   engineSize    17966 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 1.2+ MB


In [26]:
# Summary statistics for the numeric columns
# NOTE(review): the output shows a maximum year of 2060 and a minimum
# engineSize of 0.0 — these look like data-entry errors; confirm and consider
# filtering such rows before modelling.
df.describe()

Unnamed: 0,year,price,mileage,tax,mpg,engineSize
count,17966.0,17966.0,17966.0,17966.0,17966.0,17966.0
mean,2016.86647,12279.534844,23362.608761,113.329456,57.90698,1.350807
std,2.050336,4741.343657,19472.054349,62.012456,10.125696,0.432367
min,1996.0,495.0,1.0,0.0,20.8,0.0
25%,2016.0,8999.0,9987.0,30.0,52.3,1.0
50%,2017.0,11291.0,18242.5,145.0,58.9,1.2
75%,2018.0,15299.0,31060.0,145.0,65.7,1.5
max,2060.0,54995.0,177644.0,580.0,201.8,5.0


In [27]:
# Inspect the distinct labels of each categorical column before encoding them
for categorical_column in ("transmission", "fuelType"):
    print(df[categorical_column].unique())

['Automatic' 'Manual' 'Semi-Auto']
['Petrol' 'Diesel' 'Hybrid' 'Electric' 'Other']


In [28]:
# Encode the two categorical columns as integer codes.
# The label -> code tables list every label printed by the unique() check.
TRANSMISSION_CODES = {'Automatic': 0, 'Manual': 1, 'Semi-Auto': 2}
FUEL_TYPE_CODES = {'Petrol': 0, 'Diesel': 1, 'Hybrid': 2, 'Electric': 3, 'Other': 4}

# Assign the result back per column instead of mutating with inplace=True, and
# cast explicitly: pandas 2.2 deprecated the implicit downcast of a replaced
# object column to an integer dtype. The cast also fails loudly if a label is
# missing from the tables instead of leaving stray strings in the column.
# Re-running the cell is safe: an already-encoded column passes through as is.
df["transmission"] = df["transmission"].replace(TRANSMISSION_CODES).astype("int64")
df["fuelType"] = df["fuelType"].replace(FUEL_TYPE_CODES).astype("int64")

In [29]:
# Display the frame again to check the encoded transmission and fuelType columns
df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Fiesta,2017,12000,0,15944,0,150,57.7,1.0
1,Focus,2018,14000,1,9083,0,150,57.7,1.0
2,Focus,2017,13000,1,12456,0,150,57.7,1.0
3,Fiesta,2019,17500,1,10460,0,145,40.3,1.5
4,Fiesta,2019,16500,0,1482,0,145,48.7,1.0
...,...,...,...,...,...,...,...,...,...
17961,B-MAX,2017,8999,1,16700,0,150,47.1,1.4
17962,B-MAX,2014,7499,1,40700,0,30,57.7,1.0
17963,Focus,2015,9999,1,7010,1,20,67.3,1.6
17964,KA,2018,8299,1,5007,0,145,57.7,1.2


In [30]:
# Features: every column except the car model name and the target
x = df.drop(columns=["model", "price"])
# Target: the price column
y = df["price"]

In [31]:
# Learn the per-feature scaling statistics from the feature frame.
# NOTE(review): at this point x holds every row, so the statistics are
# computed before any train/test split has been made.
scaler = StandardScaler().fit(x)
# fit() returns the scaler itself; show it as the cell output
scaler

In [32]:
# Apply the fitted scaler to the feature frame; the result is a NumPy array
standardized_x = scaler.transform(x)
# Show the scaled array as the cell output
standardized_x

array([[ 0.06512772, -2.67003231, -0.38099808, ...,  0.59135805,
        -0.02044162, -0.81138621],
       [ 0.55286624,  0.04135139, -0.73335899, ...,  0.59135805,
        -0.02044162, -0.81138621],
       [ 0.06512772,  0.04135139, -0.56013157, ...,  0.59135805,
        -0.02044162, -0.81138621],
       ...,
       [-0.91034931,  0.04135139, -0.83982222, ..., -1.50505332,
         0.92766777,  0.57636151],
       [ 0.55286624,  0.04135139, -0.94269045, ...,  0.51072684,
        -0.02044162, -0.34880364],
       [-0.91034931,  0.04135139, -0.94269045, ..., -1.47280084,
        -0.02044162, -0.81138621]])

In [33]:
# From here on x refers to the scaled feature array; y stays the price column
x, y = standardized_x, df["price"]

In [34]:
# Split into train and test sets (10% held out, fixed seed for repeatability).
# The split is done on the raw, unscaled features so that the scaler can be
# fitted on the training rows only; fitting it on all rows would let test-set
# statistics leak into the preprocessing.
raw_features = df.drop(columns=["model", "price"])
raw_train, raw_test, y_train, y_test = train_test_split(
    raw_features,
    y,
    test_size=.1,
    random_state=42,
)

# Re-fit the scaler on the training rows and apply that same transform to both
# splits. This is the scaler later used for single predictions and saved to disk.
scaler = StandardScaler().fit(raw_train)
x_train = scaler.transform(raw_train)
x_test = scaler.transform(raw_test)

In [35]:
# Report full / train / test shapes, first for the features and then for the target
for full, train, test in ((x, x_train, x_test), (y, y_train, y_test)):
    print(full.shape, train.shape, test.shape)

(17966, 7) (16169, 7) (1797, 7)
(17966,) (16169,) (1797,)


In [36]:
# Train an XGBoost regressor on the training split.
# XGBRegressor is reached through the `xg` alias imported at the top of the
# notebook, so this cell needs no import of its own; the seed is pinned so
# repeated runs build the same model.
xgb_model = xg.XGBRegressor(random_state=42)
xgb_model.fit(x_train, y_train)

# Model Evaluation

In [37]:
# Evaluate the fit on the rows the model was trained on
train_data = xgb_model.predict(x_train)
# R^2 and mean absolute error, using the metric functions imported directly
# from sklearn.metrics at the top of the notebook
score = r2_score(y_true=y_train, y_pred=train_data)
mae = mean_absolute_error(y_true=y_train, y_pred=train_data)

In [38]:
# Print the most recently computed metrics, one per line
for label, value in (("R2_score:", score), ("MAE:", mae)):
    print(label, value)

R2_score: 0.9531964580818831
MAE: 740.4128602406732


In [39]:
# Evaluate on the held-out test rows
test_data = xgb_model.predict(x_test)
# R^2 and mean absolute error on the test data (overwrites the values that
# were computed for the training data)
score = r2_score(y_true=y_test, y_pred=test_data)
mae = mean_absolute_error(y_true=y_test, y_pred=test_data)

In [40]:
# Print the most recently computed metrics, one per line
for label, value in (("R2_score:", score), ("MAE:", mae)):
    print(label, value)

R2_score: 0.9116280023596516
MAE: 907.3636198473694


# Make Predictions

In [41]:
# Predict the price of a single car.
# Values are listed in the feature column order used for training:
# year, transmission, mileage, fuelType, tax, mpg, engineSize
feature_names = ["year", "transmission", "mileage", "fuelType", "tax", "mpg", "engineSize"]
input_data = (2019, 1, 10460, 0, 145, 40.3, 1.5)
# Wrap the values in a one-row frame that carries the training column names,
# so the scaler receives the same named columns it was fitted on and a
# mismatch in the columns is detected instead of passing silently.
input_changed = pd.DataFrame([input_data], columns=feature_names)
# Standardize with the already-fitted scaler; it must never be re-fitted on a
# single row, which would discard the training statistics.
std_input = scaler.transform(input_changed)
prediction = xgb_model.predict(std_input)
print(prediction)

print("This car estimation is:", prediction)

[18320.613]
This car estimation is: [18320.613]


In [42]:
# Persist both fitted artifacts: the regressor and the scaler applied to its inputs
import joblib
joblib.dump(value=xgb_model, filename='xgb_model.pkl')
# Last expression of the cell: its return value is shown as the cell output
joblib.dump(value=scaler, filename='scaler.pkl')

['scaler.pkl']