In [1]:
import pandas as pd

# Read the raw table from the CSV file in the current working directory.
csv_path = "data.csv"
df = pd.read_csv(csv_path)

In [3]:
# Keep only the four predictor columns plus the Price target, in this order.
kept_columns = ["Motorcycle", "Color", "Year", "Mileage", "Price"]
df = df[kept_columns]
df.head()

Unnamed: 0,Motorcycle,Color,Year,Mileage,Price
0,CLICK125I,white,2560.0,2592.655275,34900.0
1,ZOOMER-X,white,2558.0,2592.655275,33900.0
2,PCX150,black,2555.0,2592.655275,39900.0
3,PCX150,red,2563.0,177.0,88900.0
4,CLICK125I,black,2564.0,6221.0,48900.0


In [4]:
from sklearn.preprocessing import LabelEncoder
# Swap the Color strings for integer codes. The fitted encoder is kept in
# le_Color because the same mapping is applied to new inputs further down.
le_Color = LabelEncoder()
le_Color.fit(df['Color'])
df['Color'] = le_Color.transform(df['Color'])
# Show the distinct codes now stored in the column.
df["Color"].unique()

array([10,  1,  9,  5,  3,  7,  2,  4,  6, 11,  0,  8])

In [5]:
# Swap the Motorcycle model names for integer codes, keeping the fitted
# encoder in le_Motorcycle for reuse on new inputs.
le_Motorcycle = LabelEncoder()
le_Motorcycle.fit(df['Motorcycle'])
df['Motorcycle'] = le_Motorcycle.transform(df['Motorcycle'])
# Show the distinct codes now stored in the column.
df["Motorcycle"].unique()

array([ 6, 42, 28, 33, 17, 25,  0, 31, 24, 16, 18,  4, 39, 11, 30, 19, 29,
        7, 10,  5, 21, 15, 23, 12, 27, 22, 13, 26,  2, 32, 36, 14, 20, 38,
        1,  3,  8,  9, 40, 35, 37, 41, 34])

In [6]:
# Treat Year as a category as well: each distinct year value is replaced by
# its integer code, and the fitted encoder is kept in le_Year for reuse.
le_Year = LabelEncoder()
le_Year.fit(df['Year'])
df['Year'] = le_Year.transform(df['Year'])
# Show the distinct codes now stored in the column.
df["Year"].unique()

array([17, 15, 12, 20, 21, 14, 19, 16, 18, 11, 22, 13, 10,  5,  7,  8,  9,
        6,  4,  2,  0,  1,  3], dtype=int64)

In [7]:
# Features: every column except the target. Target: the Price column.
X = df.drop(columns="Price")
y = df.loc[:, "Price"]

In [8]:
from sklearn.linear_model import LinearRegression
# Baseline model: ordinary least squares fitted on the full feature matrix.
linear_reg = LinearRegression()
linear_reg.fit(X=X, y=y.values)

In [9]:
# Predict on the same X the model was fitted on, so any error computed from
# these predictions is an in-sample (training) error.
y_pred = linear_reg.predict(X)

In [10]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
# Root-mean-squared error of the current predictions against the true prices.
squared_error_mean = mean_squared_error(y, y_pred)
error = np.sqrt(squared_error_mean)

In [11]:
# Display the RMSE value currently held in `error`.
error

12521.418832110196

In [12]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree with a fixed random_state so repeated fits give the same tree.
dec_tree_reg = DecisionTreeRegressor(random_state=0)
dec_tree_reg.fit(X=X, y=y.values)

In [13]:
# Predict on the same X the tree was fitted on (in-sample predictions).
y_pred = dec_tree_reg.predict(X)

In [14]:
# RMSE of the current predictions, printed as a currency-style amount.
squared_error_mean = mean_squared_error(y, y_pred)
error = np.sqrt(squared_error_mean)
money_format = "${:,.02f}"
print(money_format.format(error))

$2,084.14


In [15]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with a fixed random_state so the ensemble is reproducible.
random_forest_reg = RandomForestRegressor(random_state=0)
random_forest_reg.fit(X=X, y=y.values)

In [16]:
# Predict on the same X the forest was fitted on (in-sample predictions).
y_pred = random_forest_reg.predict(X)

In [17]:
# RMSE of the current predictions, printed as a currency-style amount.
squared_error_mean = mean_squared_error(y, y_pred)
error = np.sqrt(squared_error_mean)
money_format = "${:,.02f}"
print(money_format.format(error))

$2,332.40


In [18]:
from sklearn.model_selection import GridSearchCV

# Candidate tree depths to compare; None leaves the depth unrestricted.
max_depth = [None, 2, 4, 6, 8, 10, 12]
parameters = {
    "max_depth": max_depth,
}

# Cross-validated search over the depths above. Candidates are ranked by
# negated MSE because scikit-learn scorers follow "higher is better".
regressor = DecisionTreeRegressor(random_state=0)
gs = GridSearchCV(
    estimator=regressor,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
)
gs.fit(X, y.values)

In [19]:
# GridSearchCV refits the winning configuration on the full X / y once the
# search finishes (refit=True is its default), so best_estimator_ is already
# trained and does not need another .fit() call here.
regressor = gs.best_estimator_

# The predictions are made on the same rows the estimator was fitted on, so
# the figure printed below is a training (in-sample) RMSE.
y_pred = regressor.predict(X)
error = np.sqrt(mean_squared_error(y, y_pred))
print("${:,.02f}".format(error))

$2,084.14


In [20]:
# Display the encoded feature matrix (every column of df except Price).
X

Unnamed: 0,Motorcycle,Color,Year,Mileage
0,6,10,17,2592.655275
1,42,10,15,2592.655275
2,28,1,12,2592.655275
3,28,9,20,177.000000
4,6,1,21,6221.000000
...,...,...,...,...
1983,41,0,6,2592.655275
1984,38,7,11,2592.655275
1985,39,9,5,2592.655275
1986,37,9,6,2592.655275


In [21]:
# One hand-written listing to price: model name, colour, year, mileage.
# Mixing strings and numbers makes numpy store every field as a string.
sample_row = ["CLICK125I", 'white', 2560.0, 3000.0]
X = np.array([sample_row])
X


array([['CLICK125I', 'white', '2560.0', '3000.0']], dtype='<U32')

In [22]:
# Encode the sample's categorical fields with the encoders fitted on the
# training frame, so the codes match what the model was trained on.
X[:, 0] = le_Motorcycle.transform(X[:, 0])
X[:, 1] = le_Color.transform(X[:, 1])
# le_Year was fitted on the numeric Year column, but X is a string array at
# this point. Convert the year text back to float before the lookup instead
# of relying on the encoder / numpy to coerce '2560.0' to 2560.0 implicitly.
X[:, 2] = le_Year.transform(X[:, 2].astype(float))
# Every field is now numeric text; turn the whole row into floats for predict().
X = X.astype(float)
X

array([[   6.,   10.,   17., 3000.]])

In [23]:
# Price estimate from the tuned tree for the single encoded sample row in X.
y_pred = regressor.predict(X)
y_pred



array([35075.])

In [24]:
import pickle

In [25]:
# Bundle the tuned model with the three fitted encoders so that inference
# code can reproduce the exact same preprocessing, then pickle the bundle.
data = {
    "model": regressor,
    "le_Motorcycle": le_Motorcycle,
    "le_Color": le_Color,
    "le_Year": le_Year,
}
with open('motorcycle_saved_steps.pkl', 'wb') as file:
    pickle.dump(data, file)

In [26]:
# Read the bundle back from disk.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load pickles that come from a trusted source.
with open('motorcycle_saved_steps.pkl', 'rb') as file:
    data = pickle.load(file)

# Unpack the model and the three encoders from the loaded dictionary.
regressor_loaded, le_Motorcycle, le_Color, le_Year = (
    data[key] for key in ("model", "le_Motorcycle", "le_Color", "le_Year")
)

In [27]:
# Predict with the unpickled model on the same encoded sample row in X, to
# check that the saved model gives a prediction after the round trip.
y_pred = regressor_loaded.predict(X)
y_pred



array([35075.])