In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Function to preprocess data
def preprocess_data(data):
    """Clean the raw car-listing frame and one-hot encode its categoricals.

    Steps, in order:
      1. Drop the 'ID', 'Doors' and 'Levy' columns when they are present.
      2. Convert 'Engine volume' to a number, discarding the 'Turbo' marker.
      3. Convert 'Mileage' to a number using its first whitespace-delimited
         token (any trailing unit text is discarded).
      4. Drop every row that still contains a missing value.
      5. One-hot encode the categorical columns listed below.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw listings. Must contain 'Engine volume', 'Mileage' and the nine
        categorical columns that get encoded. The input frame is not
        modified; all work happens on the copy returned by ``drop``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the categorical columns replaced by their
        indicator columns. Values that cannot be parsed as numbers become
        NaN and their rows are removed by the ``dropna`` step.

    Raises
    ------
    KeyError
        If 'Engine volume', 'Mileage' or one of the categorical columns is
        missing from ``data``.
    """
    # Drop unnecessary columns (tolerate inputs that already lack them).
    df = data.drop(columns=['ID', 'Doors', 'Levy'], errors='ignore')

    # Clean and convert 'Engine volume'.
    # Casting to str first keeps this working when the column is already
    # numeric (the bare .str accessor would raise) or holds a mix of strings
    # and numbers (the bare .str accessor would turn the numbers into NaN).
    # 'Turbo' is a literal token, so no regex is needed; strip the leftover
    # whitespace so the parser sees a clean number.
    engine_text = df['Engine volume'].astype(str)
    engine_text = engine_text.str.replace('Turbo', '', regex=False).str.strip()
    df['Engine volume'] = pd.to_numeric(engine_text, errors='coerce')

    # Clean and convert 'Mileage': keep only the leading token.
    mileage_text = df['Mileage'].astype(str).str.split().str[0]
    df['Mileage'] = pd.to_numeric(mileage_text, errors='coerce')

    # Drop rows with missing values (including values that failed to parse).
    df = df.dropna()

    # One-Hot Encoding for categorical variables
    df = pd.get_dummies(df, columns=[
        'Manufacturer', 'Model', 'Category', 'Leather interior',
        'Fuel type', 'Gear box type', 'Wheel', 'Color', 'Drive wheels'
    ])

    return df

# Read the raw listings from disk and run them through the cleaning step.
data = pd.read_csv('/content/car_price_prediction.csv')
df = preprocess_data(data)

# 'Price' is the regression target; every other processed column is a feature.
x = df.drop('Price', axis=1)
y = df['Price']

# 75% of the rows go to training, the remainder to testing (seeded split).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.75, random_state=66
)


def _fit_and_score(model, features_train, target_train, features_test, target_test):
    """Fit ``model`` on the training rows and score it on the test rows.

    Returns a ``(predictions, mean_absolute_error, r2)`` tuple computed on the
    test rows.
    """
    model.fit(features_train, target_train)
    predictions = model.predict(features_test)
    mae = mean_absolute_error(target_test, predictions)
    r2 = r2_score(target_test, predictions)
    return predictions, mae, r2


# NOTE(review): the output recorded with this cell shows R² of about 0.00 for
# both models. Presumably the target or the very wide one-hot feature matrix
# needs attention (e.g. extreme 'Price' values) - confirm before relying on
# these scores.

# Model 1: plain linear regression.
lr = LinearRegression()
y_pred_lr, mae_lr, r2_lr = _fit_and_score(lr, x_train, y_train, x_test, y_test)

print("\n--- Linear Regression Results ---")
print(f"R² Score: {r2_lr:.2f}")
print(f"Mean Absolute Error: {mae_lr:.2f}")

# Model 2: random forest with a fixed tree count, depth cap and seed.
rf = RandomForestRegressor(n_estimators=200, max_depth=15, random_state=66)
y_pred_rf, mae_rf, r2_rf = _fit_and_score(rf, x_train, y_train, x_test, y_test)

print("\n--- Random Forest Results ---")
print(f"R² Score: {r2_rf:.2f}")
print(f"Mean Absolute Error: {mae_rf:.2f}")

# Report the size of every intermediate dataset.
print("\n--- Dataset Shapes ---")
print(f"Original Data Shape: {data.shape}")
print(f"Processed Data Shape: {df.shape}")
print(f"Features Shape: {x.shape}")
print(f"Target Shape: {y.shape}")
print(f"Training Set Shape: {x_train.shape}, {y_train.shape}")
print(f"Test Set Shape: {x_test.shape}, {y_test.shape}")



--- Linear Regression Results ---
R² Score: -0.00
Mean Absolute Error: 14385.73

--- Random Forest Results ---
R² Score: 0.00
Mean Absolute Error: 10652.82

--- Dataset Shapes ---
Original Data Shape: (19237, 18)
Processed Data Shape: (19237, 1706)
Features Shape: (19237, 1705)
Target Shape: (19237,)
Training Set Shape: (14427, 1705), (14427,)
Test Set Shape: (4810, 1705), (4810,)
