In [None]:
#Import the important libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import warnings
import matplotlib
warnings.filterwarnings('ignore')

In [None]:
# Read the raw car-price dataset; the path is relative to the working directory.
csv_path = 'data/car_price_dataset.csv'
df = pd.read_csv(csv_path)

In [None]:
# Rename the 'name' column to 'brand' (returns a new frame instead of mutating in place).
df = df.rename(columns={'name': 'brand'})

In [None]:
# Display the column labels of the frame.
df.columns

In [None]:
# Encode the 'owner' feature as integers: First Owner -> 1, ..., Test Drive Car -> 5.
from sklearn.preprocessing import LabelEncoder

label_mapping = {
    'First Owner': 1,
    'Second Owner': 2,
    'Third Owner': 3,
    'Fourth & Above Owner': 4,
    'Test Drive Car': 5,
}

categorical_data = df['owner']

# Direct dictionary lookup: any label missing from the mapping raises KeyError.
df['owner'] = categorical_data.map(lambda label: label_mapping[label])

In [None]:
# For the feature fuel, remove all rows with CNG and LPG, because their mileage
# is reported in km/kg, which is not comparable with the kmpl used for Diesel and Petrol.
# .copy() yields an independent frame, so the column assignments in the following
# cells do not operate on a slice of the original (avoids SettingWithCopyWarning).
df = df[~df['fuel'].isin(['CNG', 'LPG'])].copy()

In [None]:
# For the feature mileage, drop the "kmpl" unit: split on whitespace,
# keep the leading token and cast it to float.
mileage_tokens = df['mileage'].str.split()
df['mileage'] = mileage_tokens.str[0].astype(float)

In [None]:
# Strip the " CC" unit suffix, then cast the remaining text to float.
engine_text = df['engine'].str.replace(' CC', '')
df['engine'] = engine_text.astype(float)

In [None]:
# Remove " bhp" and convert to float, handling N/A values.
# pd.to_numeric(errors='coerce') turns entries that are not a valid number
# (e.g. an empty string left after stripping the unit) into NaN instead of
# raising, so they can be median-imputed after the train/test split.
power_text = df['max_power'].str.replace(' bhp', '', regex=False)
df['max_power'] = pd.to_numeric(power_text, errors='coerce').astype(float)

In [None]:
# Keep only the first whitespace-separated word of each entry as the brand.
brand_tokens = df['brand'].str.split()
df['brand'] = brand_tokens.str[0].astype(str)

In [None]:
# The 'torque' feature is not used: remove the column.
df = df.drop('torque', axis=1)

In [None]:
# Remove test-drive cars, i.e. rows whose 'owner' code is 5.
df = df[df['owner'].ne(5)]

In [None]:
# Add the natural log of the selling price; this is the regression target used below.
df = df.assign(log_selling_price=np.log(df['selling_price']))

In [None]:
# Preview the first five rows after cleaning (head() defaults to 5 rows).
df.head()

In [None]:
# X holds the selected predictor columns.
feature_cols = ['max_power', 'engine', 'mileage']
X = df[feature_cols]

# y is the log-transformed selling price.
y = df['log_selling_price']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing (0.2-0.4 are common choices); the seed fixes the split.
split_parts = train_test_split(X, y, test_size=0.4, random_state=42)
X_train, X_test, y_train, y_test = split_parts

Filling missing values

In [None]:
#let's fill the training set first!

# Impute each feature with its own training-set median.
# The frame is reassigned instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form can operate on a temporary object and
# leave the NaNs in X_train untouched (pandas Copy-on-Write behaviour).
train_medians = X_train[['max_power', 'engine', 'mileage']].median()
X_train = X_train.fillna(train_medians)


In [None]:
#let's fill the testing set with the training distribution!

# Use the TRAINING medians so that no test-set statistic leaks into preprocessing.
# The frame is reassigned instead of calling fillna(inplace=True) on a column
# selection, which can operate on a temporary object and leave X_test unchanged.
train_medians = X_train[['max_power', 'engine', 'mileage']].median()
X_test = X_test.fillna(train_medians)


In [None]:
# Same treatment for the target: fill gaps in both splits with the training median.
y_train_median = y_train.median()
y_train = y_train.fillna(y_train_median)
y_test = y_test.fillna(y_train_median)

In [None]:
# Map each feature to plot onto its 1-based slot in the subplot grid.
col_dict = {'max_power': 1, 'mileage': 2}

# Box plots make outliers in each variable easy to spot.
plt.figure(figsize=(20, 30))

for variable, slot in col_dict.items():
    axis = plt.subplot(5, 4, slot)
    axis.boxplot(X_train[variable])
    axis.set_title(variable)

plt.show()

In [None]:
def outlier_count(col, data=None):
    """Report the 1.5 * IQR outliers of one column.

    A value is an outlier when it lies below ``q25 - 1.5 * IQR`` or above
    ``q75 + 1.5 * IQR``. A short report is printed only when at least one
    outlier is found.

    Parameters
    ----------
    col : column label to inspect.
    data : frame-like object indexable by ``col``. Defaults to the
        notebook-level ``X_train``, looked up at call time so the function
        never keeps using a stale frame captured when it was defined.

    Returns
    -------
    int
        Number of outliers found in ``data[col]``.
    """
    if data is None:
        data = X_train

    values = data[col]
    total = len(values)
    if total == 0:
        # Nothing to inspect; also avoids a division by zero below.
        return 0

    # Quartiles ignore NaNs so that missing values cannot turn both fences into NaN.
    q25, q75 = np.nanpercentile(values, [25, 75])
    iqr = q75 - q25

    # Fences beyond which a value counts as an outlier.
    min_val = q25 - (iqr * 1.5)
    max_val = q75 + (iqr * 1.5)

    # Comparisons against NaN are False, so missing values are never counted.
    n_outliers = int(((values > max_val) | (values < min_val)).sum())

    # Percentage relative to all rows of the column.
    outlier_percent = round(n_outliers / total * 100, 2)

    if n_outliers > 0:
        print("\n"+15*'-' + col + 15*'-'+"\n")
        print('Number of outliers: {}'.format(n_outliers))
        print('Percent of data that is outlier: {}%'.format(outlier_percent))

    return n_outliers

In [None]:
# Print an outlier report for every training feature that has at least one outlier.
for feature_name in X_train.columns:
    outlier_count(feature_name)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler


# Feature scaling helps models reach convergence faster.
# Use StandardScaler() for standardization, MinMaxScaler() for normalization.
scaler = StandardScaler().fit(X_train)  # statistics are learned from the training set only
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Standardization (current choice): x = (x - mean) / std
#   -> suited to data that follows a normal distribution.
#
# Why scale before data analysis / machine learning?
#   - the model catches the pattern/relationship faster
#   - faster convergence
#
# Normalization (the alternative): x = (x - x_min) / (x_max - x_min)
#   -> suited to data that does NOT follow a normal distribution
#      (e.g., audio, signal, image), i.e. when the mean is a poor summary.

In [None]:
# Sanity-check the shapes of the four splits.
named_splits = (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
)
for label, part in named_splits:
    print(f"Shape of {label}: ", part.shape)

In [None]:
from sklearn.linear_model import LinearRegression  #we are using regression models
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Libraries for model evaluation

# Candidate regressors keyed by display name; the two parallel lists used
# later are derived from this single mapping so they cannot drift apart.
model_registry = {
    "Linear Regression": LinearRegression(),
    "SVR": SVR(),
    "KNeighbors Regressor": KNeighborsRegressor(),
    "Decision-Tree Regressor": DecisionTreeRegressor(random_state=0),
    "Random-Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=0),
}

# models that we will be using, in a list
algorithms = list(model_registry.values())

# The names of the models (same order as `algorithms`)
algorithm_names = list(model_registry.keys())

In [None]:
from sklearn.linear_model import LinearRegression  #we are using regression models
from sklearn.metrics import mean_squared_error, r2_score

# Baseline: ordinary linear regression fitted on the training split
# and scored on the held-out test split.
lr = LinearRegression().fit(X_train, y_train)
yhat = lr.predict(X_test)

baseline_metrics = (("MSE: ", mean_squared_error), ("r2: ", r2_score))
for label, metric in baseline_metrics:
    print(label, metric(y_test, yhat))

In [None]:
from sklearn.model_selection import KFold, cross_val_score

#lists for keeping mse
train_mse = []
test_mse = []

# Defining splits. A fixed random_state makes the shuffled folds -- and every
# score computed from them here and in the grid search below -- reproducible
# across kernel restarts.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Score every candidate model with 5-fold CV on the training data.
# scoring='neg_mean_squared_error' returns the negated MSE, so higher is better.
for name, model in zip(algorithm_names, algorithms):
    scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_error')
    print(f"{name} - Score: {scores}; Mean: {scores.mean()}")

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: three depths x ten forest sizes, always with bootstrapping.
param_grid = {
    'bootstrap': [True],
    'max_depth': [5, 10, None],
    'n_estimators': list(range(5, 14)) + [15],
}

rf = RandomForestRegressor(random_state=1)

search_options = dict(
    cv=kfold,
    n_jobs=-1,
    return_train_score=True,
    refit=True,
    scoring='neg_mean_squared_error',
)
grid = GridSearchCV(estimator=rf, param_grid=param_grid, **search_options)

# Fitting evaluates every parameter combination; refit=True then retrains the best one.
grid.fit(X_train, y_train);

In [None]:
# Parameter combination that achieved the best cross-validated score.
grid.best_params_

In [None]:
# Best mean cross-validated score of the grid search
# (negated MSE, because scoring='neg_mean_squared_error').
best_mse = grid.best_score_

In [None]:
best_mse  # the value is negative because the scorer is neg_mean_squared_error; its magnitude is the MSE

Testing

In [None]:
# Predict on the held-out test set with the refit best estimator and
# report the mean squared error (in log-price units).
yhat = grid.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=yhat)

In [None]:
# Predictions are on the log scale, because the target is log_selling_price.
print(yhat)

In [None]:
# Invert the log transform to express predictions in the original price units.
y_pred_original = np.exp(yhat)

In [None]:
# Predicted selling prices on the original (non-log) scale.
print(y_pred_original)

In [None]:
import os
import pickle

# save the model to disk
filename = 'model/price_car_prediction.pkl'

# Create the target directory if needed so the dump does not fail on a fresh checkout.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# The context manager guarantees the file is flushed and closed.
# NOTE: only the fitted grid search is stored; the fitted `scaler` is not part of
# this file, so inputs must be scaled separately before calling predict().
with open(filename, 'wb') as model_file:
    pickle.dump(grid, model_file)

In [None]:
# load the model from disk
# The context manager closes the file handle once unpickling is done.
# NOTE: pickle.load can execute arbitrary code -- only load files you created or trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Let's look at one row as inspiration for a hand-made example.
# .loc[1] selects by index LABEL, so this raises KeyError if the row labelled 1
# was removed by the filtering steps above.
df[['max_power','engine','mileage','selling_price']].loc[1]

In [None]:
# One hand-made observation in the feature order used for X:
# max_power, engine, mileage (raw, unscaled values).
sample = np.array([[100,1218.00,20.14]])

In [None]:
# The model was trained on standardized features, so the raw sample has to go
# through the same fitted scaler before prediction; feeding unscaled values
# would place it far outside the range the model was trained on.
sample_scaled = scaler.transform(sample)

# The model predicts log(selling_price); exponentiate to get the price itself.
log_price = loaded_model.predict(sample_scaled)
selling_price = np.exp(log_price)
print(selling_price)