In [16]:
import pandas as pd

# Step 1: Load the raw dataset (path is relative to the working directory)
df = pd.read_csv("car data.csv")

# Step 2: Dataset Structure

print("First 5 Rows:\n")
print(df.head())

print("\n\nDataset Information:\n")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()

print("\n\nMissing Values:\n")
# Per-column count of null entries
print(df.isnull().sum())

# Step 3: Data Cleaning

# 1. Remove Car_Name (not useful for prediction)
# 2. Convert the remaining categorical columns using one-hot encoding;
#    drop_first=True keeps k-1 indicator columns for a k-level category.
# Built from `df` in a single expression so re-running this step is idempotent.
df_clean = pd.get_dummies(df.drop(columns=["Car_Name"]), drop_first=True)

print("üîç Cleaned Dataset (First 5 rows):\n")
print(df_clean.head())

print("\n\nüìå Cleaned Dataset Info:\n")
# info() prints its own report and returns None; calling it bare avoids
# a trailing "None" line in the output.
df_clean.info()


# Step 4: Feature Selection

# Features: every cleaned column except the price being predicted
X = df_clean.drop(columns=["Selling_Price"])

# Target variable: the price column on its own
y = df_clean["Selling_Price"]

print("üéØ Target (y) ‚Äì First 5 values:")
print(y.head())

print("\nüß© Features (X) ‚Äì First 5 rows:")
print(X.head())

# Report the dimensions of both pieces
for caption, data in (("\nShape of X:", X), ("Shape of y:", y)):
    print(caption, data.shape)

# Step 5: Train-Test Split

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the size of each partition
for caption, part in [
    ("üìò X_train shape:", X_train),
    ("üìô X_test shape:", X_test),
    ("üìó y_train shape:", y_train),
    ("üìï y_test shape:", y_test),
]:
    print(caption, part.shape)

# Step 6: Train the Model (Random Forest Regressor)

from sklearn.ensemble import RandomForestRegressor

# Build a 200-tree forest with a fixed seed and fit it on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
model = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_train, y_train)

print("üéâ Model training completed successfully!")


# Step 7: Evaluate the Model

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# Score the held-out test split with the trained model
y_pred = model.predict(X_test)

# Error metrics; RMSE is derived from MSE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("üìä MODEL EVALUATION RESULTS")
print("----------------------------")
print(f"R¬≤ Score: {r2:.4f}")
print(f"MAE (Mean Absolute Error): {mae:.4f}")
print(f"MSE (Mean Squared Error): {mse:.4f}")
print(f"RMSE (Root Mean Squared Error): {rmse:.4f}")

# Side-by-side look at the first ten test rows
print("\nüîç Actual vs Predicted (First 10)")
for actual, pred in zip(y_test.iloc[:10], y_pred[:10]):
    print(f"Actual: {actual:.2f}  |  Predicted: {pred:.2f}")

import pickle

# Step 8: Save the trained model
filename = "/content/car_price_model.pkl" # Changed path to /content/

# Use a context manager so the file handle is flushed and closed
# deterministically instead of being left open until garbage collection.
with open(filename, "wb") as model_file:
    pickle.dump(model, model_file)


import pickle
import numpy as np

# Load model.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load files you trust (this one is written by the save step above).
with open("/content/car_price_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# Example new data - you can change these values
# Order of features:
# [Year, Present_Price, Driven_kms, Owner, Fuel_Type_Diesel, Fuel_Type_Petrol,
#  Selling_type_Individual, Transmission_Manual]

# Create a DataFrame for new_data with column names to match the training data
new_data_values = [[2017, 8.59, 32000, 0, 0, 1, 0, 1]]
new_data_df = pd.DataFrame(new_data_values, columns=X.columns)

predicted_price = model.predict(new_data_df)
# A bare expression is only displayed when it is the last statement of a cell,
# so print the prediction explicitly to make sure it is actually shown.
print(f"Predicted selling price: {predicted_price[0]:.2f}")

# Also keep a copy of the model in the current working directory;
# the context manager closes the file handle once the dump completes.
with open("car_price_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)



First 5 Rows:

  Car_Name  Year  Selling_Price  Present_Price  Driven_kms Fuel_Type  \
0     ritz  2014           3.35           5.59       27000    Petrol   
1      sx4  2013           4.75           9.54       43000    Diesel   
2     ciaz  2017           7.25           9.85        6900    Petrol   
3  wagon r  2011           2.85           4.15        5200    Petrol   
4    swift  2014           4.60           6.87       42450    Diesel   

  Selling_type Transmission  Owner  
0       Dealer       Manual      0  
1       Dealer       Manual      0  
2       Dealer       Manual      0  
3       Dealer       Manual      0  
4       Dealer       Manual      0  


Dataset Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 no