In [4]:
# 27/03/2024
# CSC354 – Assignment 2 – ML – Decision Trees
# Yahya Irfan
# FA21-BSE-044
"""
Download the Used Cars Prices dataset (cars-dataset) from your shared Google Drive folder
Link: http://tinyurl.com/sp24ml
Note: Please open the dataset file first for manual inspection before performing any experiments.
Use this dataset for a regression task using decision trees. Specifically, use a Decision Tree Regressor for 
predicting the price of a car. Similar to Q1, start with a baseline model with default parameters and then try to 
find the optimal parameter settings using both Random and Grid search methods.
"""


import pandas as pd
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Load the used-cars dataset and print the first rows as a quick sanity check.
df = pd.read_csv("cars-dataset.csv")
print(df.head())

# Features are every column except the regression target.
inputs = df.drop('selling_price', axis='columns')
target = df['selling_price']

# One encoder per categorical column, each kept under its own name so the
# fitted mapping remains available after this step.
fuel_m = LabelEncoder()
seller_type_m = LabelEncoder()
transmission_m = LabelEncoder()
owner_m = LabelEncoder()

categorical_encoders = {
    'fuel': fuel_m,
    'seller_type': seller_type_m,
    'transmission': transmission_m,
    'owner': owner_m,
}

# Append an integer-coded "<column>_d" feature for every text column, in the
# mapping's insertion order, then drop the original text columns.
for column, encoder in categorical_encoders.items():
    inputs[column + '_d'] = encoder.fit_transform(inputs[column])

inputs = inputs.drop(list(categorical_encoders), axis='columns')
print(inputs)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.2, random_state=42)




# Baseline: a decision tree regressor with default hyper-parameters
# (only the seed is fixed), trained on the training split.
dt_model = tree.DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Score the baseline on the held-out test split.
y_pred = dt_model.predict(X_test)
dt_mse = mean_squared_error(y_test, y_pred)
print("Baseline Mean Squared Error:", dt_mse)


# Random Search
# Candidate values per hyper-parameter: 5 * 3 * 3 * 2 = 90 combinations,
# of which n_iter=50 are tried.
random_param_dist = {
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 9],
    'min_samples_leaf': [1, 4, 6],
    'max_features': ['sqrt', 'log2']
}

# Rank candidates by (negated) mean squared error so the configuration chosen
# by cross-validation is selected on the same metric that is reported below.
# Without an explicit `scoring`, the search falls back to the regressor's
# default score (R^2), which does not match the printed MSE.
random_search_tree = RandomizedSearchCV(
    dt_model,
    param_distributions=random_param_dist,
    n_iter=50,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
)
random_search_tree.fit(X_train, y_train)
best_random_params = random_search_tree.best_params_

# Evaluate the selected configuration on the held-out test split.
random_search_tree_predictions = random_search_tree.predict(X_test)
random_search_mse = mean_squared_error(y_test, random_search_tree_predictions)
print("Best Random Parameters:",best_random_params)
print("Random Search Decision Tree MSE:", random_search_mse)

# Grid Search
# NOTE(review): only `max_features` varies here (3 candidates); depth, split
# and leaf sizes are each pinned to a single value. Widen these lists if an
# exhaustive search over several settings is intended.
grid_param_grid = {
    'max_depth': [80],
    'min_samples_split': [9],
    'min_samples_leaf': [15],
    'max_features': ['sqrt', 'log2', None]
}

# Rank candidates by (negated) mean squared error so the configuration chosen
# by cross-validation is selected on the same metric that is reported below.
# Without an explicit `scoring`, the search falls back to the regressor's
# default score (R^2), which does not match the printed MSE.
grid_search_tree = GridSearchCV(
    dt_model,
    param_grid=grid_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search_tree.fit(X_train, y_train)
best_grid_params = grid_search_tree.best_params_

# Evaluate the selected configuration on the held-out test split.
grid_search_tree_predictions = grid_search_tree.predict(X_test)
grid_search_mse = mean_squared_error(y_test, grid_search_tree_predictions)
print("Best Grid Parameters:",best_grid_params)
print("Grid Search Decision Tree MSE:", grid_search_mse)

   year  km_driven    fuel seller_type transmission         owner  \
0  2007      70000  Petrol  Individual       Manual   First Owner   
1  2007      50000  Petrol  Individual       Manual   First Owner   
2  2012     100000  Diesel  Individual       Manual   First Owner   
3  2017      46000  Petrol  Individual       Manual   First Owner   
4  2014     141000  Diesel  Individual       Manual  Second Owner   

   selling_price  
0          60000  
1         135000  
2         600000  
3         250000  
4         450000  
      year  km_driven  fuel_d  seller_type_d  transmission_d  owner_d
0     2007      70000       4              1               1        0
1     2007      50000       4              1               1        0
2     2012     100000       1              1               1        0
3     2017      46000       4              1               1        0
4     2014     141000       1              1               1        2
...    ...        ...     ...            ...       