In [None]:
# Train Test Split
from sklearn.model_selection import train_test_split

# Encoding Categorical variables
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Regressors
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Evaluation metrics
from sklearn.metrics import r2_score

# Data loading
import pandas as pd

In [None]:
# Load the processed dataset into a DataFrame.
# NOTE(review): this is a hard-coded absolute Windows path, so the cell only
# runs on a machine with the same F:\ directory layout.
df = pd.read_csv(r'F:\GUVI_DATA_SCIENCE\Project\CarDekho-Used-Car-Price-Prediction\Datasets\Final\processed_final_data.csv')

In [None]:
# Preview the first rows of the loaded data.
df.head()

##### *Train test split*

In [None]:
# Features: every column except the target 'price' and the 'location' column.
x = df.drop(['price', 'location'], axis = 1)
# Target: the 'price' column.
y  = df['price']

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 2)

In [None]:
# Show the (rows, columns) shape of the train and test feature sets.
x_train.shape, x_test.shape

##### *Encoding categorical variables*

In [None]:
# One-hot encode the columns at the listed positions and pass every other
# column through unchanged.
# NOTE(review): judging by the column order used in the user-input cell further
# down, these positions presumably select the categorical columns
# (manufacturer, body_type, transmission_type, ...) - confirm against x.columns.
transformer = ColumnTransformer(
    transformers = [
        (
            'trans1',
            OneHotEncoder(
                drop = 'first',
                handle_unknown = 'error',  # a category not seen during fit raises at transform time
                sparse_output = False,
            ),
            [0, 2, 3, 4, 7, 14, 15, 16, 17],
        ),
    ],
    remainder = 'passthrough',
)

In [None]:
# Fit the column transformer on the training features and encode them.
x_train_transformed = transformer.fit_transform(x_train)

In [None]:
# Encode the test features with the already-fitted transformer (no refit).
x_test_transformed = transformer.transform(x_test)

##### *Model training*

In [None]:
def model_training(model_name, x_train, y_train):
    """Create the regressor named by ``model_name`` and fit it on the training data.

    Parameters
    ----------
    model_name : str
        One of 'DecisionTreeRegressor', 'RandomForestRegressor' or
        'GradientBoostingRegressor'.
    x_train
        Training features, passed unchanged to the estimator's ``fit``.
    y_train
        Training target values, passed unchanged to the estimator's ``fit``.

    Returns
    -------
    The fitted estimator. Every estimator is created with ``random_state = 42``.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    if model_name == 'DecisionTreeRegressor':
        model = DecisionTreeRegressor(random_state = 42)

    elif model_name == 'RandomForestRegressor':
        model = RandomForestRegressor(random_state = 42)

    elif model_name == 'GradientBoostingRegressor':
        model = GradientBoostingRegressor(random_state = 42)

    else:
        # Without this branch an unrecognised name reached model.fit() with
        # `model` unbound and failed with an unhelpful UnboundLocalError.
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected one of "
            "'DecisionTreeRegressor', 'RandomForestRegressor', "
            "'GradientBoostingRegressor'"
        )

    model.fit(x_train, y_train)

    return model

##### **DecisionTreeRegressor**

In [None]:
# Fit a decision tree on the encoded training features.
dtr = model_training('DecisionTreeRegressor', x_train_transformed, y_train)

# Predict on both splits so train and test scores can be compared later.
y_train_pred_1 = dtr.predict(x_train_transformed)
y_test_pred_1 = dtr.predict(x_test_transformed)

##### **RandomForestRegressor**

In [None]:
# Fit a random forest on the encoded training features.
rfr = model_training('RandomForestRegressor', x_train_transformed, y_train)

# Predict on both splits so train and test scores can be compared later.
y_train_pred_2 = rfr.predict(x_train_transformed)
y_test_pred_2 = rfr.predict(x_test_transformed)

##### **GradientBoostingRegressor**

In [None]:
# Fit a gradient boosting regressor on the encoded training features.
gbr = model_training('GradientBoostingRegressor', x_train_transformed, y_train)

# Predict on both splits so train and test scores can be compared later.
y_train_pred_3 = gbr.predict(x_train_transformed)
y_test_pred_3 = gbr.predict(x_test_transformed)

##### **Evaluation**

In [None]:
def evalutaion_metrics(y, y_pred):
    """Return the R2 score of ``y_pred`` against ``y`` as a percentage string.

    The value from ``r2_score`` is multiplied by 100, rounded with the
    built-in ``round`` (no decimals) and suffixed with '%'.
    """
    percentage = round(r2_score(y, y_pred) * 100)
    return '{}%'.format(percentage)

##### **DecisionTreeRegressor**

In [None]:
# Decision tree: R2 on the data it was fitted on ...
r2_score_train_1 = evalutaion_metrics(y_train, y_train_pred_1)
print('The r2 score of training set :', r2_score_train_1)

# ... and on the held-out test split.
r2_score_test_1 = evalutaion_metrics(y_test, y_test_pred_1)
print('The r2 score of testing set :', r2_score_test_1)

##### **RandomForestRegressor**

In [None]:
# Random forest: R2 on the data it was fitted on ...
r2_score_train_2 = evalutaion_metrics(y_train, y_train_pred_2)
print('The r2 score of training set :', r2_score_train_2)

# ... and on the held-out test split.
r2_score_test_2 = evalutaion_metrics(y_test, y_test_pred_2)
print('The r2 score of testing set :', r2_score_test_2)

##### **GradientBoostingRegressor**

In [None]:
# Gradient boosting: R2 on the data it was fitted on ...
r2_score_train_3 = evalutaion_metrics(y_train, y_train_pred_3)
print('The r2 score of training set :', r2_score_train_3)

# ... and on the held-out test split.
r2_score_test_3 = evalutaion_metrics(y_test, y_test_pred_3)
print('The r2 score of testing set :', r2_score_test_3)

- ##### **The random forest regressor achieves an R² score of 98% on the training data and 85% on the testing data**

##### **User input prediction**

In [None]:
# Example feature values for one car, standing in for user input.
manufacturer = 'Maruti'
model_year = 2020
body_type = 'Hatchback'
transmission_type = 'Manual'
fuel_type = 'Petrol'
total_kms = 120000
total_owners = 3
insurance_type = 'Third Party insurance'
mileage = 23.1
engine = 998
torque = 90
seats = 5
number_of_cylinders = 3
valves_per_cylinder = 4
super_charger = 'No'
turbo_charger = 'No'
drive_type = 'FWD'
steering_type = 'Power'
cargo_volume = 500

# Build a one-row frame by pairing each column name with its value; dict
# insertion order fixes the column order.
# NOTE: this rebinds `df`, which earlier in the notebook holds the dataset
# loaded from CSV.
df = pd.DataFrame(
    data = {
        'manufacturer': [manufacturer],
        'model_year': [model_year],
        'body_type': [body_type],
        'transmission_type': [transmission_type],
        'fuel_type': [fuel_type],
        'total_kms': [total_kms],
        'total_owners': [total_owners],
        'insurance_type': [insurance_type],
        'mileage(kmpl)': [mileage],
        'engine(CC)': [engine],
        'torque(nm)': [torque],
        'seats': [seats],
        'number_of_cylinders': [number_of_cylinders],
        'valves_per_cylinder': [valves_per_cylinder],
        'super_charger': [super_charger],
        'turbo_charger': [turbo_charger],
        'drive_type': [drive_type],
        'steering_type': [steering_type],
        'cargo_volume': [cargo_volume],
    }
)

df

In [None]:
# Encode the single-row input with the transformer fitted on the training set.
user_df_transformed = transformer.transform(df)
user_df_transformed

In [None]:
# Predict the price for the encoded input using the random forest model.
user_prediction = rfr.predict(user_df_transformed)
print(user_prediction)

##### **Saving the model object and transformer object**

In [None]:
import pickle

# Model Object
# Serialize the fitted random forest model. The `with` block closes the file
# handle as soon as the dump finishes (or fails); passing open(...) inline left
# the handle open with no guarantee of when it would be flushed and released.
# NOTE(review): hard-coded absolute Windows path - only valid on a machine with
# the same F:\ directory layout.
with open(r'F:\GUVI_DATA_SCIENCE\Project\CarDekho-Used-Car-Price-Prediction\Artifacts\model.pkl', 'wb') as model_file:
    pickle.dump(rfr, model_file)

In [None]:
# transformer object
# Serialize the fitted column transformer. The `with` block closes the file
# handle as soon as the dump finishes (or fails); passing open(...) inline left
# the handle open with no guarantee of when it would be flushed and released.
with open(r'F:\GUVI_DATA_SCIENCE\Project\CarDekho-Used-Car-Price-Prediction\Artifacts\transformer.pkl', 'wb') as transformer_file:
    pickle.dump(transformer, transformer_file)