<h1 style="color: brown;">1. Importing Libraries and Configuring Display Options</h1>

In [1]:
# Data manipulation and visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Machine Learning Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor

# Evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import time
from sklearn.model_selection import cross_validate
from itertools import product

# Notebook-wide DataFrame display settings:
#   - display.max_columns = None -> no column limit when a frame is rendered
#   - display.float_format       -> floats are rendered with three decimals
for option_name, option_value in {
    'display.max_columns': None,
    'display.float_format': '{:.3f}'.format,
}.items():
    pd.set_option(option_name, option_value)

<h1 style="color: brown;">2. Exploring Dataset</h1>

<h2 style="color: purple;">2.1 Loading Used Car Dataset</h2>

In [2]:
# Load the used-car CSV into the working DataFrame `df`.
# The path is relative, so the file must be in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='cars_data_clean.csv')

<h2 style="color: purple;">2.2 Displaying Dataset Dimensions</h2> 

In [3]:
# Report the dataset dimensions (rows, columns).
row_count, column_count = df.shape
print(f'The data has {row_count} rows and {column_count} columns.')

The data has 37813 rows and 66 columns.


<h2 style="color: purple;">2.3 Displaying All Column Names</h2>

In [4]:
# Column index of the raw frame, kept under a descriptive name
all_column_names = df.columns

# Build the "<position>. <name>" lines first (1-based numbering) ...
numbered_lines = [
    f"{position}. {column_name}"
    for position, column_name in enumerate(all_column_names, start=1)
]

# ... then print them under a heading, one per line
print("All Columns:")
for line in numbered_lines:
    print(line)


All Columns:
1. usedCarSkuId
2. loc
3. myear
4. body
5. transmission
6. fuel
7. km
8. ip
9. images
10. imgCount
11. threesixty
12. dvn
13. oem
14. model
15. variant
16. City
17. listed_price
18. discountValue
19. utype
20. carType
21. top_features
22. comfort_features
23. interior_features
24. exterior_features
25. safety_features
26. Color
27. Engine Type
28. No of Cylinder
29. Valves per Cylinder
30. Valve Configuration
31. Turbo Charger
32. Super Charger
33. Length
34. Width
35. Height
36. Wheel Base
37. Front Tread
38. Rear Tread
39. Kerb Weight
40. Gross Weight
41. Gear Box
42. Drive Type
43. Seats
44. Steering Type
45. Turning Radius
46. Front Brake Type
47. Rear Brake Type
48. Top Speed
49. Acceleration
50. Tyre Type
51. Doors
52. Cargo Volume
53. model_type_new
54. state
55. exterior_color
56. owner_type
57. Fuel Suppy System
58. Compression Ratio
59. Alloy Wheel Size
60. Ground Clearance Unladen
61. Max Power Delivered
62. Max Power At
63. Max Torque Delivered
64. Max Torque A

<h2 style="color: purple;">2.4 Identifying Numeric and Non-Numeric Features</h2>  

In [5]:
# Split the column names by dtype: numeric vs. everything else
numeric_column_names = df.select_dtypes(include='number').columns
non_numeric_column_names = df.select_dtypes(exclude='number').columns

# Group label -> list of column names, in display order
columns_dict = dict([
    ("Numeric Columns", numeric_column_names.tolist()),
    ("Non-Numeric Columns", non_numeric_column_names.tolist()),
])

# For each group, print a numbered list with the distinct-value count per column
for category in columns_dict:
    print(f"{category}:")
    for idx, col in enumerate(columns_dict[category], start=1):
        print(f"  {idx}. {col} (Unique Values: {df[col].nunique()})")
    print()  # blank line between the two groups


Numeric Columns:
  1. myear (Unique Values: 34)
  2. km (Unique Values: 23863)
  3. ip (Unique Values: 2)
  4. imgCount (Unique Values: 54)
  5. listed_price (Unique Values: 6865)
  6. discountValue (Unique Values: 30)
  7. No of Cylinder (Unique Values: 11)
  8. Valves per Cylinder (Unique Values: 7)
  9. Length (Unique Values: 411)
  10. Width (Unique Values: 276)
  11. Height (Unique Values: 300)
  12. Wheel Base (Unique Values: 219)
  13. Front Tread (Unique Values: 156)
  14. Rear Tread (Unique Values: 173)
  15. Kerb Weight (Unique Values: 539)
  16. Gross Weight (Unique Values: 288)
  17. Seats (Unique Values: 11)
  18. Turning Radius (Unique Values: 75)
  19. Top Speed (Unique Values: 190)
  20. Acceleration (Unique Values: 243)
  21. Doors (Unique Values: 5)
  22. Cargo Volume (Unique Values: 203)
  23. Compression Ratio (Unique Values: 61)
  24. Alloy Wheel Size (Unique Values: 11)
  25. Ground Clearance Unladen (Unique Values: 26)
  26. Max Power Delivered (Unique Values: 57

<h2 style="color: purple;">2.5 Displaying First 5 Rows of the Dataset</h2>  

In [6]:
# Show every column (no elision) and preview the first five rows.
pd.set_option('display.max_columns', None)
df.head(n=5)

Unnamed: 0,usedCarSkuId,loc,myear,body,transmission,fuel,km,ip,images,imgCount,threesixty,dvn,oem,model,variant,City,listed_price,discountValue,utype,carType,top_features,comfort_features,interior_features,exterior_features,safety_features,Color,Engine Type,No of Cylinder,Valves per Cylinder,Valve Configuration,Turbo Charger,Super Charger,Length,Width,Height,Wheel Base,Front Tread,Rear Tread,Kerb Weight,Gross Weight,Gear Box,Drive Type,Seats,Steering Type,Turning Radius,Front Brake Type,Rear Brake Type,Top Speed,Acceleration,Tyre Type,Doors,Cargo Volume,model_type_new,state,exterior_color,owner_type,Fuel Suppy System,Compression Ratio,Alloy Wheel Size,Ground Clearance Unladen,Max Power Delivered,Max Power At,Max Torque Delivered,Max Torque At,Bore,Stroke
0,7111bf25-97af-47f9-867b-40879190d800,gomti nagar,2016,hatchback,manual,cng,69162,0,[{'img': 'https://images10.gaadi.com/usedcar_i...,15,False,maruti wagon r lxi cng,maruti,maruti wagon r,lxi cng,lucknow,370000,0,dealer,corporate,"['power steering', 'power windows front', 'air...","['power steering', 'power windows front', 'rem...","['air conditioner', 'heater', 'digital odomete...","['adjustable head lights', 'manually adjustabl...","['centeral locking', 'child safety locks', 'da...",silver,k10b engine,3.0,4.0,dohc,False,False,3599.0,1495.0,1700.0,2400.0,1295.0,1290.0,960.0,1350.0,5 speed,fwd,5.0,power,4.6,ventilated disc,drum,137.0,15.9,tubeless,5.0,180.0,used,uttar pradesh,silver,first,,,,,58.16,6200.0,77.0,3500.0,69.0,
1,c309efc1-efaf-4f82-81ad-dcb38eb36665,borivali west,2015,hatchback,manual,cng,45864,0,[{'img': 'https://images10.gaadi.com/usedcar_i...,15,False,maruti celerio green vxi,maruti,maruti celerio,green vxi,mumbai,365000,0,dealer,corporate,"['power steering', 'power windows front', 'air...","['power steering', 'power windows front', 'pow...","['air conditioner', 'heater', 'digital odomete...","['adjustable head lights', 'manually adjustabl...","['centeral locking', 'child safety locks', 'da...",grey,k10b engine,3.0,4.0,dohc,False,False,3600.0,1600.0,1560.0,2425.0,1420.0,1410.0,915.0,1350.0,5 speed,fwd,5.0,power,4.7,ventilated disc,drum,150.0,15.05,tubeless radial,5.0,235.0,used,maharashtra,grey,first,Gasoline Port Injection,11.0,,,58.2,6000.0,78.0,3500.0,73.0,
2,7609f710-0c97-4f00-9a47-9b9284b62d3a,jasola,2015,sedan,manual,cng,81506,0,[{'img': 'https://images10.gaadi.com/usedcar_i...,15,False,honda amaze s plus i-vtec,honda,honda amaze,s plus i-vtec,new delhi,421000,0,dealer,corporate,"['power steering', 'power windows front', 'air...","['power steering', 'power windows front', 'pow...","['air conditioner', 'heater', 'adjustable stee...","['adjustable head lights', 'power adjustable e...","['centeral locking', 'power door locks', 'chil...",silver,i-vtec petrol engine,4.0,4.0,sohc,False,False,3990.0,1680.0,1505.0,2405.0,,,950.0,,5 speed,fwd,5.0,power,4.5,disc,drum,160.0,15.0,tubeless radial,4.0,400.0,used,delhi,silver,second,,,,,86.7,6000.0,109.0,4500.0,,
3,278b76e3-5539-4a5e-ae3e-353a2e3b6d7d,jasola,2013,hatchback,manual,cng,115893,0,[{'img': ''}],0,False,maruti wagon r lxi cng,maruti,maruti wagon r,lxi cng,new delhi,240000,0,dealer,corporate,"['power steering', 'power windows front', 'air...","['power steering', 'power windows front', 'low...","['air conditioner', 'heater', 'electronic mult...","['adjustable head lights', 'manually adjustabl...","['centeral locking', 'child safety locks', 'da...",silver,k series petrol engine,3.0,4.0,,False,False,3595.0,1475.0,1700.0,2400.0,1295.0,1290.0,960.0,1350.0,5 speed,fwd,5.0,power,4.6,ventilated disc,drum,,,tubeless radial,4.0,,used,delhi,silver,second,Multi-Point Fuel Injection,,13.0,,58.2,6200.0,77.0,3500.0,,
4,b1eab99b-a606-48dd-a75b-57feb8a9ad92,mumbai g.p.o.,2022,muv,manual,cng,18900,0,[{'img': 'https://images10.gaadi.com/usedcar_i...,6,False,maruti ertiga vxi cng,maruti,maruti ertiga,vxi cng,mumbai,1175000,0,dealer,partner,"['power steering', 'power windows front', 'air...","['power steering', 'power windows front', 'pow...","['air conditioner', 'heater', 'adjustable stee...","['adjustable head lights', 'power adjustable e...","['anti lock braking system', 'brake assist', '...",white,k15c,4.0,4.0,,True,True,4395.0,1735.0,1690.0,2740.0,,,1250.0,1820.0,5 speed,2wd,7.0,,5.2,disc,drum,,,tubeless radial,5.0,,used,maharashtra,white,first,,12.0,,,86.63,5500.0,121.5,4200.0,,


<h2 style="color: purple;">2.6 Checking for Missing Data</h2>  

In [7]:
# Count missing cells per column, then add the per-column counts together.
total_missing = df.isna().sum().sum()
print(f'Total number of missing values: {total_missing}')

Total number of missing values: 282038


In [8]:
# Remove the row limit so the per-column table below is rendered in full.
# NOTE: this is a global pandas option and stays in effect for later cells.
pd.set_option('display.max_rows', None)

# Missing cells per column, as an absolute count and as a percentage of all rows
missing_values = df.isna().sum()
missing_percentage = (missing_values / len(df)) * 100

# Put both figures side by side, columns with the largest share first
missing_info = pd.DataFrame(
    {'Missing Values': missing_values, 'Missing Percentage': missing_percentage}
).sort_values(by='Missing Percentage', ascending=False)

# Last expression of the cell -> rendered as a table
missing_info

Unnamed: 0,Missing Values,Missing Percentage
Stroke,37300,98.643
Ground Clearance Unladen,35661,94.309
Compression Ratio,27801,73.522
Bore,25177,66.583
Gross Weight,20890,55.246
Rear Tread,15875,41.983
Front Tread,15874,41.98
Top Speed,15352,40.6
Alloy Wheel Size,14461,38.243
Acceleration,13988,36.993


<h2 style="color: purple;">2.7 Checking for Duplicate Data</h2> 

In [9]:
# Number of rows that are exact repeats of an earlier row
duplicate_row_count = df.duplicated().sum()
print(f'There are {duplicate_row_count} duplicate rows in the dataset.')

There are 0 duplicate rows in the dataset.


<h2 style="color: purple;">2.8 Summary Statistics for Numeric Columns</h2> 

In [10]:
# Render floats with three decimals in the summary table below
pd.set_option('display.float_format', '{:.3f}'.format)

# Numeric columns only; describe() gives count/mean/std/min/quartiles/max
numeric_columns = df.select_dtypes(include=['number'])
numeric_columns.describe()

Unnamed: 0,myear,km,ip,imgCount,listed_price,discountValue,No of Cylinder,Valves per Cylinder,Length,Width,Height,Wheel Base,Front Tread,Rear Tread,Kerb Weight,Gross Weight,Seats,Turning Radius,Top Speed,Acceleration,Doors,Cargo Volume,Compression Ratio,Alloy Wheel Size,Ground Clearance Unladen,Max Power Delivered,Max Power At,Max Torque Delivered,Max Torque At,Bore,Stroke
count,37813.0,37813.0,37813.0,37813.0,37813.0,37813.0,37670.0,37585.0,37451.0,37442.0,37450.0,37257.0,21939.0,21938.0,24674.0,16923.0,37795.0,31965.0,22461.0,23825.0,37769.0,31081.0,10012.0,23352.0,2152.0,37582.0,35566.0,37582.0,35837.0,12636.0,513.0
mean,2015.57,62409.297,0.01,15.571,799986.81,73.282,3.819,3.894,4113.623,1725.594,1577.555,2545.82,1470.019,1469.291,1146.357,1709.088,5.237,5.494,172.994,13.133,4.664,363.092,12.833,15.45,179.416,102.755,5102.916,178.772,3161.846,72.414,83.495
std,3.779,58472.162,0.101,8.358,3043044.881,2805.783,0.541,0.478,399.357,128.293,115.152,155.295,90.373,100.562,318.518,441.592,0.722,49.4,26.631,3.309,0.497,130.413,5.259,1.522,25.989,42.784,1056.28,106.353,1025.049,11.907,4.654
min,1983.0,101.0,0.0,0.0,11963.0,0.0,1.0,1.0,2752.0,1312.0,1165.0,1840.0,1143.0,15.0,451.0,785.0,0.0,3.5,70.0,2.5,2.0,20.0,4.17,7.0,91.0,10.8,1620.0,4.8,160.0,2.85,71.5
25%,2013.0,31739.0,0.0,10.0,320000.0,0.0,4.0,4.0,3795.0,1680.0,1495.0,2425.0,1420.0,1420.0,935.0,1350.0,5.0,4.8,156.0,11.07,4.0,256.0,10.1,14.0,178.0,74.96,4000.0,110.0,2125.0,69.6,82.5
50%,2016.0,56726.0,0.0,15.0,525000.0,0.0,4.0,4.0,3995.0,1710.0,1530.0,2520.0,1485.0,1493.0,1066.0,1595.0,5.0,5.1,168.56,13.2,5.0,350.0,10.5,16.0,190.0,88.5,5500.0,145.0,3375.0,73.0,85.0
75%,2018.0,83310.0,0.0,21.0,855272.0,0.0,4.0,4.0,4440.0,1790.0,1642.0,2647.0,1530.0,1525.0,1230.0,1967.5,5.0,5.3,189.0,14.85,5.0,465.0,16.2,16.0,192.0,118.35,6000.0,224.0,4000.0,76.5,85.8
max,2023.0,6300000.0,1.0,74.0,550000555.0,200000.0,16.0,48.0,5982.0,2236.0,2075.0,3772.0,1705.0,1834.0,2962.0,3490.0,14.0,6250.0,340.0,44.04,6.0,2055.0,81.0,21.0,213.0,769.31,8250.0,900.0,21800.0,165.0,104.9


<h2 style="color: purple;">2.9 Summary Statistics for Categorical Columns</h2> 

In [11]:
print("Non-Numeric Columns Summary:")

# Everything that is not a numeric dtype; describe(include='all') reports
# count / unique / top / freq for these columns
non_numeric_columns = df.select_dtypes(exclude=['number'])
non_numeric_columns.describe(include='all')

Non-Numeric Columns Summary:


Unnamed: 0,usedCarSkuId,loc,body,transmission,fuel,images,threesixty,dvn,oem,model,variant,City,utype,carType,top_features,comfort_features,interior_features,exterior_features,safety_features,Color,Engine Type,Valve Configuration,Turbo Charger,Super Charger,Gear Box,Drive Type,Steering Type,Front Brake Type,Rear Brake Type,Tyre Type,model_type_new,state,exterior_color,owner_type,Fuel Suppy System
count,37813,31963,37794,37813,37813,37813,37813,37813,37813,37813,37813,37813,37813,37813,37813,37813,37813,37813,37813,37653,36945,29851,37813,37813,37342,33316,37005,37486,37487,37556,37813,37813,37690,37813,32311
unique,37813,397,11,2,5,37135,2,4128,46,382,3430,617,2,3,400,2016,527,1893,2116,747,566,5,2,2,11,6,2,6,6,4,1,33,747,6,14
top,7111bf25-97af-47f9-867b-40879190d800,pune city,hatchback,manual,petrol,[{'img': ''}],False,maruti swift vxi,maruti,honda city,vxi,new delhi,dealer,partner,"['power steering', 'power windows front', 'air...","['power steering', 'power windows front', 'pow...","['air conditioner', 'heater', 'adjustable stee...","['adjustable head lights', 'fog lights front',...","['anti lock braking system', 'centeral locking...",white,in-line engine,dohc,False,False,5 speed,fwd,power,disc,drum,tubeless radial,used,maharashtra,white,first,Multi-Point Fuel Injection
freq,1,3483,15053,28894,21773,679,30333,347,9950,1911,1233,4969,31963,32512,6974,573,2745,434,718,10059,3370,23571,20247,20247,25203,27632,36348,23334,29973,29108,37813,8628,10067,26571,14055


<h1 style="color: brown;">3. Preprocessing </h1> 

<h2 style="color: purple;">3.1 Dropping Unnecessary Columns</h2>

In [12]:
# Columns the author judged unnecessary for predicting `listed_price`.
irrelevant_columns = [
    'usedCarSkuId', 'loc', 'myear', 'ip', 'images', 'imgCount', 'threesixty', 'dvn', 'oem', 'model',
    'variant', 'City', 'state', 'top_features', 'comfort_features', 'interior_features',
    'exterior_features', 'safety_features', 'Color', 'exterior_color',
]

# Columns with more than 1000 missing values; the columns that are kept each
# have less than 5% of their values missing.
sparse_columns = [
    'Stroke', 'Ground Clearance Unladen', 'Compression Ratio', 'Bore', 'Gross Weight', 'Rear Tread', 'Front Tread',
    'Top Speed', 'Alloy Wheel Size', 'Acceleration',
    'Kerb Weight', 'Valve Configuration', 'Cargo Volume', 'Turning Radius', 'Fuel Suppy System', 'Drive Type',
    'Max Power At', 'Max Torque At',
]

# 'Engine Type' is a categorical column with hundreds of distinct values, and
# 'model_type_new' holds a single value ('used') for every row, so neither is
# usable as a feature.
uninformative_columns = ['Engine Type', 'model_type_new']

# errors='ignore' makes the cell safe to re-run: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
# Trade-off: a misspelled column name is skipped silently instead of failing.
df = df.drop(
    columns=irrelevant_columns + sparse_columns + uninformative_columns,
    errors='ignore',
)


<h2 style="color: purple;">3.2 Dropping Null Values</h2> 

In [13]:
# Discard every row that still has at least one missing value
# (axis=0 / how='any' are the pandas defaults, spelled out for clarity).
df = df.dropna(axis=0, how='any')

<h2 style="color: purple;">3.3 Displaying Frequency of Categorical Data</h2> 

In [14]:
# Sub-frame holding only the non-numeric columns of the cleaned data
non_numeric_columns = df.select_dtypes(exclude='number')

separator = "-" * 50
for col in non_numeric_columns.columns:
    # Frequency of each distinct value, most frequent first
    value_counts = non_numeric_columns[col].value_counts()

    # Numbered "<rank>. <value>: <count>" listing for this column
    print(f"\nValue counts for {col}:")
    for idx, (value, count) in enumerate(value_counts.items(), start=1):
        print(f"{idx}. {value}: {count}")
    print(separator)


Value counts for body:
1. hatchback: 14370
2. sedan: 10285
3. suv: 9128
4. muv: 1627
5. minivans: 360
6. luxury vehicles: 68
7. pickup trucks: 36
8. convertibles: 22
9. coupe: 13
10. wagon: 7
11. hybrids: 4
--------------------------------------------------

Value counts for transmission:
1. manual: 27621
2. automatic: 8299
--------------------------------------------------

Value counts for fuel:
1. petrol: 20503
2. diesel: 14196
3. cng: 996
4. lpg: 197
5. electric: 28
--------------------------------------------------

Value counts for utype:
1. dealer: 30536
2. individual: 5384
--------------------------------------------------

Value counts for carType:
1. partner: 30909
2. corporate: 4988
3. assured: 23
--------------------------------------------------

Value counts for Turbo Charger:
1. False: 19849
2. True: 16071
--------------------------------------------------

Value counts for Super Charger:
1. False: 19849
2. True: 16071
--------------------------------------------------


<h2 style="color: purple;">3.4 Preprocessing Non-Numeric Columns for Machine Learning</h2>  

In [15]:
# Encode every non-numeric column of `df` as numbers:
#   - exactly 2 distinct values -> a single 0/1 column (LabelEncoder)
#   - 3 or more distinct values -> one-hot columns (pd.get_dummies)
#   - fewer than 2              -> dropped (a constant carries no information)
#
# The column list is taken from the current `df` rather than from a variable
# left over from an earlier cell. This removes the hidden-state dependency and
# makes the cell safe to re-run: once everything is numeric the list is empty.
columns_to_encode = list(df.select_dtypes(exclude='number').columns)

# Define LabelEncoder
label_encoder = LabelEncoder()

# Iterate over the non-numeric columns
for col in columns_to_encode:
    # Get the number of unique values in the column
    unique_values_count = df[col].nunique()

    if unique_values_count == 2:
        # Apply Label Encoding for columns with 2 unique values
        df[col] = label_encoder.fit_transform(df[col])

        # Display the mapping of label encoding
        print(f"\nLabel Encoding for column '{col}':")
        mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
        print(f"  {mapping}")
    elif unique_values_count >= 3:
        # Apply One-Hot Encoding for columns with 3 or more unique values
        df_encoded = pd.get_dummies(df[col], prefix=col, dtype=int)

        # Drop the original column and replace it with the One-Hot Encoded columns
        df = df.drop(columns=[col])
        df = pd.concat([df, df_encoded], axis=1)

        print(f"\nOne-Hot Encoding applied to column '{col}':")
        print(f"  Created columns: {list(df_encoded.columns)}")
    else:
        # A constant column would otherwise stay un-encoded; if it holds
        # strings it would break the scaling step later on.
        df = df.drop(columns=[col])
        print(f"\nDropped column '{col}': it has fewer than 2 unique values.")


One-Hot Encoding applied to column 'body':
  Created columns: ['body_convertibles', 'body_coupe', 'body_hatchback', 'body_hybrids', 'body_luxury vehicles', 'body_minivans', 'body_muv', 'body_pickup trucks', 'body_sedan', 'body_suv', 'body_wagon']

Label Encoding for column 'transmission':
  {'automatic': 0, 'manual': 1}

One-Hot Encoding applied to column 'fuel':
  Created columns: ['fuel_cng', 'fuel_diesel', 'fuel_electric', 'fuel_lpg', 'fuel_petrol']

Label Encoding for column 'utype':
  {'dealer': 0, 'individual': 1}

One-Hot Encoding applied to column 'carType':
  Created columns: ['carType_assured', 'carType_corporate', 'carType_partner']

Label Encoding for column 'Turbo Charger':
  {False: 0, True: 1}

Label Encoding for column 'Super Charger':
  {False: 0, True: 1}

One-Hot Encoding applied to column 'Gear Box':
  Created columns: ['Gear Box_10 speed', 'Gear Box_4 speed', 'Gear Box_5 speed', 'Gear Box_6 speed', 'Gear Box_7 speed', 'Gear Box_8 speed', 'Gear Box_9 speed', 'Gear 

**TODO:** From this point on, consider maintaining two versions of the code — one that scales the features with `MinMaxScaler` and one that uses `StandardScaler` — and compare their results.

<h1 style="color: brown;">4. Development and Assessment of Models</h1> 

<h2 style="color: purple;">4.1 Splitting Features and Target Variable</h2>

In [16]:
# Target is the listing price; every other column of `df` is a feature.
target_column = 'listed_price'
X = df.drop(columns=[target_column])
y = df[target_column]

<h2 style="color: purple;">4.2 Splitting Data into Training and Test Sets</h2> 

In [17]:
# 70/30 train/test split; random_state is fixed so the split is reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

<h2 style="color: purple;">4.3 Scaling Features using StandardScaler</h2>

In [18]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

<h1 style="color: brown;">5. Evaluating Models and Measuring Performance</h1>  

<h2 style="color: purple;">5.1 Finding the Best Model</h2> 

In [19]:
# Candidate regressors, all with library-default hyperparameters; the tree and
# ensemble models are seeded with random_state=42.
model_list = [
    LinearRegression(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
    AdaBoostRegressor(random_state=42),
    SVR(),
    GaussianProcessRegressor(),
    KNeighborsRegressor(),
]

# One (name, R2, MAE, MSE, RMSE) tuple per model
results = []

# NOTE(review): every model is fitted on the scaled training split and scored
# on the test split, and the ranking below is then used to choose the "best"
# model. The test split therefore influences model selection; consider ranking
# on a validation split or with cross-validation instead.
for model in model_list:
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)
    r2 = r2_score(y_test, preds)
    mse = mean_squared_error(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    rmse = np.sqrt(mse)  # RMSE is derived from the MSE computed above
    results.append((model.__class__.__name__, r2, mae, mse, rmse))

# Display results, best R2 first (pandas is already imported in the first cell)
df_results = pd.DataFrame(results, columns=["Model", "R2", "MAE", "MSE", "RMSE"])
df_results = df_results.sort_values(by="R2", ascending=False)
print(df_results)

                       Model             R2           MAE  \
0           LinearRegression          0.383    283181.906   
3  GradientBoostingRegressor          0.364    235127.150   
4          AdaBoostRegressor          0.215    353363.379   
5                        SVR         -0.036    456272.999   
2      RandomForestRegressor         -4.799    173370.518   
7        KNeighborsRegressor         -6.937    275845.520   
1      DecisionTreeRegressor        -16.277    224233.215   
6   GaussianProcessRegressor -122034422.897 184793023.962   

                        MSE            RMSE  
0         1030930223352.652     1015347.341  
3         1062357625128.536     1030707.342  
4         1310722983257.023     1144868.107  
5         1730710887106.343     1315564.855  
2         9686495420234.254     3112313.516  
7        13258625384486.033     3641239.540  
1        28860562299379.090     5372202.742  
6 203850109919668633600.000 14277608690.522  


In [20]:
# df_results is sorted by R2 in descending order, so the first row is the winner.
best_model_name = df_results["Model"].iloc[0]
print(f"Best model based on R2: {best_model_name}")

Best model based on R2: LinearRegression


<h2 style="color: purple;">5.2 Manual Grid Search for Best Hyperparameters</h2>  

In [21]:
# LinearRegression, r2_score, train_test_split, numpy and itertools.product
# are all imported in the first cell, so they are not re-imported here.

# Carve a validation split (20%) out of the scaled training data, so that
# hyperparameters are not chosen on the test split.
X_subtrain, X_val, y_subtrain, y_val = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

# Hyperparameter grid for LinearRegression.
# NOTE: each list currently holds a single value, so the "search" below
# evaluates exactly one combination; add values to make it a real search.
param_grid = {
    'fit_intercept': [True],
    'positive': [False]  # Only available in newer versions of scikit-learn (>=0.24)
}

# Best validation R2 seen so far and the parameters that produced it
best_score = -np.inf
best_params = None

# Try every combination in the grid (Cartesian product of the value lists)
for combo in product(*param_grid.values()):
    params = dict(zip(param_grid.keys(), combo))
    model = LinearRegression(**params)
    model.fit(X_subtrain, y_subtrain)
    preds = model.predict(X_val)
    score = r2_score(y_val, preds)
    if score > best_score:
        best_score = score
        best_params = params

print("Best hyperparameters for LinearRegression:", best_params)

Best hyperparameters for LinearRegression: {'fit_intercept': True, 'positive': False}


<h2 style="color: purple;">5.3 Cross-Validation Evaluation</h2>   

In [22]:
# cross_validate, LinearRegression and numpy are imported in the first cell.

# Build the final model from the hyperparameters selected by the grid search
# in the previous cell (`best_params`). They are reused rather than typed in
# again by hand, so the two cells cannot drift apart.
final_model = LinearRegression(**best_params)

# Metrics to collect; scikit-learn reports error metrics negated ("neg_*")
# so that larger is always better.
scoring = {
    'r2': 'r2',
    'neg_mae': 'neg_mean_absolute_error',
    'neg_mse': 'neg_mean_squared_error'
}

# 5-fold cross-validation on the scaled training split
cv_results = cross_validate(final_model, X_train_scaled, y_train, cv=5, scoring=scoring)

# Per-fold scores; flip the sign of the negated error metrics back to positive
r2_scores = cv_results['test_r2']
mae_scores = -cv_results['test_neg_mae']
mse_scores = -cv_results['test_neg_mse']
rmse_scores = np.sqrt(mse_scores)

# Print the per-fold values and their mean for each metric
for metric_label, fold_scores in (
    ("R²", r2_scores),
    ("MAE", mae_scores),
    ("MSE", mse_scores),
    ("RMSE", rmse_scores),
):
    print(f"CV {metric_label} scores:", fold_scores)
    print(f"Mean CV {metric_label}:", np.mean(fold_scores))


CV RÂ² scores: [0.52899465 0.62090071 0.72406347 0.00864679 0.65061223]
Mean CV RÂ²: 0.5066435682820298
CV MAE scores: [287500.39491535 291651.29070771 260844.47730884 350067.02820052
 273742.71572782]
Mean CV MAE: 292761.18137204845
CV MSE scores: [4.36980802e+11 3.16180857e+11 1.90470098e+11 6.01849882e+13
 2.97880866e+11]
Mean CV MSE: 12285300157644.863
CV RMSE scores: [ 661045.23408946  562299.61536422  436428.8009229  7757898.4373765
  545784.63366916]
Mean CV RMSE: 1992691.344284448


<h2 style="color: purple;">5.4 Final Model Training and Test Set Evaluation</h2>   

In [23]:
# Refit the final model on the complete (scaled) training split, then predict
# on the scaled test split. fit() returns the estimator, so the calls chain.
y_test_preds = final_model.fit(X_train_scaled, y_train).predict(X_test_scaled)
def regression_report(y_true, y_pred):
    """Print MAE, MSE, RMSE and R² for a set of regression predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    None
        The metrics are printed with four decimal places, not returned.
    """
    print("Regression Report:")
    print(f"MAE:  {mean_absolute_error(y_true, y_pred):.4f}")
    # Compute the MSE once; the RMSE line reuses it
    mse = mean_squared_error(y_true, y_pred)
    print(f"MSE:  {mse:.4f}")
    print(f"RMSE: {np.sqrt(mse):.4f}")
    print(f"R²:   {r2_score(y_true, y_pred):.4f}")
    
# Print the test-split metrics of the final model
regression_report(y_true=y_test, y_pred=y_test_preds)

Regression Report:
MAE:  283181.9063
MSE:  1030930223352.6522
RMSE: 1015347.3412
RÂ²:   0.3828
