# Import necessary libraries

In [1]:
import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Step 1: Load and Explore the Dataset

In [2]:
# Read the raw CSV into a DataFrame; the path is relative to the notebook's working directory
file_path = 'car data.csv'
car_data = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Preview the first rows, then summarise column dtypes and non-null counts
print("\n******** First 5 rows of the dataset ********\n")
print(car_data.head())
print("\n******** Dataset Info ********\n")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the report.
car_data.info()


******** First 5 rows of the dataset ********

  Car_Name  Year  Selling_Price  Present_Price  Driven_kms Fuel_Type  \
0     ritz  2014           3.35           5.59       27000    Petrol   
1      sx4  2013           4.75           9.54       43000    Diesel   
2     ciaz  2017           7.25           9.85        6900    Petrol   
3  wagon r  2011           2.85           4.15        5200    Petrol   
4    swift  2014           4.60           6.87       42450    Diesel   

  Selling_type Transmission  Owner  
0       Dealer       Manual      0  
1       Dealer       Manual      0  
2       Dealer       Manual      0  
3       Dealer       Manual      0  
4       Dealer       Manual      0  

******** Dataset Info ********

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-

# Step 2: Data Preprocessing

In [4]:
# Derive 'Car_Age' from the 'Year' column.
# The reference year is pinned in a named constant instead of an inline literal so
# the derived ages are reproducible across runs, and so records scored later can
# compute Car_Age against the same baseline the model was trained with.
REFERENCE_YEAR = 2025
car_data['Car_Age'] = REFERENCE_YEAR - car_data['Year']

In [5]:
# Remove columns that are not used as model inputs:
# 'Year' has been replaced by the derived 'Car_Age', and 'Car_Name' is not a feature.
car_data = car_data.drop(['Year', 'Car_Name'], axis='columns')

In [6]:
# Group the model inputs by type: categorical columns are one-hot encoded
# downstream, numerical columns are passed through unchanged.
categorical_features = [
    'Fuel_Type',
    'Selling_type',
    'Transmission',
]
numerical_features = [
    'Present_Price',
    'Driven_kms',
    'Owner',
    'Car_Age',
]

In [7]:
# Build the feature matrix (categorical columns first, then numerical) and the target vector
X = car_data[[*categorical_features, *numerical_features]]
y = car_data['Selling_Price']

# Step 3: Train-Test Split

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"\nTraining set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")


Training set size: 240 samples
Testing set size: 61 samples


# Step 4: Data Transformation (Preprocessing)

In [9]:
# Build the column-wise preprocessing step:
#   - numerical columns are passed through unchanged
#   - categorical columns are one-hot encoded into a dense array
# handle_unknown='ignore' makes the encoder emit all-zero indicator columns for a
# category that was not present at fit time instead of raising during
# transform/predict (e.g. a rare category that only lands in the test split, or a
# new value in data scored later). Encoding of categories seen at fit time is unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features),
    ]
)

# Step 5: Model Training

In [10]:
# Chain the preprocessing step and a 100-tree random forest into one estimator,
# so fitting and predicting both go through the same transformations
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42, n_estimators=100)),
])

In [11]:
# Fit the whole pipeline (encoding + forest) on the training split
print("\nTraining the Random Forest Regressor...")
model.fit(X=X_train, y=y_train)


Training the Random Forest Regressor...


# Step 6: Model Evaluation

In [12]:
# Generate price predictions for the held-out rows
y_pred = model.predict(X=X_test)

In [13]:
# Score the held-out predictions with three regression metrics
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report each metric on its own line, rounded to two decimals
print("\nModel Evaluation Metrics:")
print(
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"R² Score: {r2:.2f}",
    sep="\n",
)


Model Evaluation Metrics:
Mean Absolute Error (MAE): 0.65
Mean Squared Error (MSE): 0.96
R² Score: 0.96


# Step 7: Predicting Unseen Data

In [14]:
# Example record to score (replace with real data for prediction).
# The column names must match the features the pipeline was trained on, and
# 'Car_Age' must be measured against the same reference year used when the
# training 'Car_Age' column was derived.
unseen_data = pd.DataFrame([
    {
        'Fuel_Type': 'Diesel',
        'Selling_type': 'Individual',
        'Transmission': 'Manual',
        'Present_Price': 9.5,
        'Driven_kms': 45000,
        'Owner': 0,
        'Car_Age': 5,
    }
])

# Run the single-row frame through the fitted pipeline and show the estimate
predicted_price = model.predict(unseen_data)
print("\nPredicted Price for the unseen data:")
print(predicted_price)


Predicted Price for the unseen data:
[8.0323]


# Step 8: Save the Trained Model to a Pickle File

In [15]:
# Persist the fitted pipeline (preprocessing + regressor) to disk with joblib.
# NOTE: joblib files are pickle-based; only load this artifact from a trusted source,
# since unpickling can execute arbitrary code.
model_filename = 'car_price_prediction_surajnate_model.pkl'
joblib.dump(model, model_filename)
print(f"Model saved to {model_filename}")

Model saved to car_price_prediction_surajnate_model.pkl
