## **CAR PRICE PREDICTION USING MACHINE LEARNING**
### Task 3 - Oasis Infobyte Internship
- SANTHA LAKSHMI S

In [21]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

In [22]:
# Step 1: Load the dataset
data = pd.read_csv('/content/car data.csv')

In [32]:
# Display basic information about the dataset
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Driven_kms     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Selling_type   301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB
None


In [33]:
# Display the first few rows of the dataset
print(data.head())

  Car_Name  Year  Selling_Price  Present_Price  Driven_kms Fuel_Type  \
0     ritz  2014           3.35           5.59       27000    Petrol   
1      sx4  2013           4.75           9.54       43000    Diesel   
2     ciaz  2017           7.25           9.85        6900    Petrol   
3  wagon r  2011           2.85           4.15        5200    Petrol   
4    swift  2014           4.60           6.87       42450    Diesel   

  Selling_type Transmission  Owner  
0       Dealer       Manual      0  
1       Dealer       Manual      0  
2       Dealer       Manual      0  
3       Dealer       Manual      0  
4       Dealer       Manual      0  


In [35]:
# Check for missing values
print("Missing Values:")
print(data.isnull().sum())

Missing Values:
Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Driven_kms       0
Fuel_Type        0
Selling_type     0
Transmission     0
Owner            0
dtype: int64


In [23]:
# Step 2: Data Preprocessing
X = data.drop('Selling_Price', axis=1)
y = data['Selling_Price']

categorical_features = ['Car_Name', 'Fuel_Type', 'Selling_type', 'Transmission']
numeric_features = ['Year', 'Present_Price', 'Driven_kms', 'Owner']

In [25]:
# Step 3: Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# Step 4: Create a preprocessing and modeling pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [27]:
# Step 5: Train the model
pipeline.fit(X_train, y_train)

# Step 6: Predict and evaluate
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.743724599672131


In [28]:
# Step 7: Feature Importance
feature_importances = pipeline.named_steps['regressor'].feature_importances_
print('Feature Importances:')
for feature, importance in zip(X.columns, feature_importances):
    print(f'{feature}: {importance}')

Feature Importances:
Car_Name: 0.06117277125971736
Year: 0.8622375350049418
Present_Price: 0.03353661986074908
Driven_kms: 0.0006419115871221753
Fuel_Type: 4.25283030179711e-05
Selling_type: 4.84033448968732e-08
Transmission: 1.1268511340075464e-06
Owner: 1.7196544960530523e-07


In [29]:
# Print lengths for debugging
print(f'Length of feature importances: {len(feature_importances)}')
print(f'Length of X.columns: {len(X.columns)}')

Length of feature importances: 99
Length of X.columns: 8


In [30]:
# Print feature names for debugging
print('Feature names:', X.columns)
print('Feature importances:', feature_importances)

Feature names: Index(['Car_Name', 'Year', 'Present_Price', 'Driven_kms', 'Fuel_Type',
       'Selling_type', 'Transmission', 'Owner'],
      dtype='object')
Feature importances: [6.11727713e-02 8.62237535e-01 3.35366199e-02 6.41911587e-04
 4.25283030e-05 4.84033449e-08 1.12685113e-06 1.71965450e-07
 1.78640613e-07 2.44701968e-06 3.15708927e-07 1.12497971e-07
 1.44255746e-08 7.66357849e-08 1.58059512e-06 5.37573387e-08
 1.25557950e-06 5.11988422e-07 3.29209919e-07 1.99543571e-08
 1.65415345e-07 1.01024193e-07 4.15637716e-07 4.06307635e-08
 1.44401863e-06 5.45557373e-07 2.87827822e-08 6.67441898e-08
 8.41684835e-07 1.46409056e-07 2.28905094e-08 4.18869449e-08
 8.67250453e-09 1.29323238e-07 4.04162553e-07 2.63885252e-07
 5.39377566e-08 1.12156134e-05 3.27441181e-06 2.40636768e-07
 3.59359267e-07 1.07333643e-06 1.31875034e-06 9.45745743e-08
 3.43223843e-07 2.25907161e-07 1.12276935e-06 4.09018995e-07
 1.08069076e-06 1.85591414e-07 8.79451099e-07 5.33002047e-07
 1.92563601e-07 3.76602152e-0