<a href="https://colab.research.google.com/github/Shafrinnisha/OIBSIP_Data_Science/blob/main/3_CarPrice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import math

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor

Load Data and Data Exploration

In [None]:
# Read the car-price CSV (expected next to the notebook) into a DataFrame
df = pd.read_csv(filepath_or_buffer='CarPrice.csv')

# Preview five randomly chosen rows
df.sample(n=5)

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
13,14,0,bmw x3,gas,std,four,sedan,rwd,front,101.2,...,164,mpfi,3.31,3.19,9.0,121,4250,21,28,21105.0
114,115,0,peugeot 505s turbo diesel,diesel,turbo,four,wagon,rwd,front,114.2,...,152,idi,3.7,3.52,21.0,95,4150,25,25,17075.0
93,94,1,nissan titan,gas,std,four,wagon,fwd,front,94.5,...,97,2bbl,3.15,3.29,9.4,69,5200,31,37,7349.0
42,43,1,honda civic (auto),gas,std,two,sedan,fwd,front,96.5,...,110,2bbl,3.15,3.58,9.1,100,5500,25,31,10345.0
100,101,0,nissan nv200,gas,std,four,sedan,fwd,front,97.2,...,120,2bbl,3.33,3.47,8.5,97,5200,27,34,9549.0


In [None]:
# Display dataset info
# Prints a per-column summary (column name, non-null count, dtype) for df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

Data Cleaning

In [None]:
# Count the missing (NaN) entries in every column
missing_counts = df.isna().sum()
missing_counts

car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

In [None]:
# Count rows that are exact duplicates of an earlier row
duplicate_count = df.duplicated().sum()
duplicate_count

0

In [None]:
# Drop unnecessary columns
# car_ID is removed before modelling. Reassigning instead of using
# inplace=True, together with errors='ignore', keeps this cell safe to re-run:
# a second run is a no-op rather than a KeyError.
df = df.drop(columns=['car_ID'], errors='ignore')

Data Preprocessing

In [None]:
# Identify numerical and categorical features
# select_dtypes keys on "is this a numeric dtype" rather than "is this not
# object", so string/category/datetime columns are never mistaken for numeric
# ones. Both results are plain lists in the original column order; at this
# point numerical_features still contains every numeric column of df.
numerical_features = df.select_dtypes(include='number').columns.tolist()
categorical_features = df.select_dtypes(exclude='number').columns.tolist()

In [None]:
# Preview two random rows restricted to the numerical columns
x = df.loc[:, numerical_features]
x.sample(n=2)

Unnamed: 0,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
163,1,94.5,168.7,64.0,52.6,2169,98,3.19,3.03,9.0,70,4800,29,34,8058.0
161,0,95.7,166.3,64.4,52.8,2122,98,3.19,3.03,9.0,70,4800,28,34,8358.0


In [None]:
# Remove the target column from the feature lists
# Filtering (rather than list.remove) makes the cell idempotent: remove()
# raises ValueError when 'price' has already been taken out by an earlier run.
numerical_features = [col for col in numerical_features if col != 'price']

In [None]:
# Define the preprocessor
# Numerical columns are standardised; categorical columns are one-hot encoded,
# with categories unseen during fit ignored at transform time.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

Finding Best Model

In [None]:
# Feature matrix: every column except the target
X = df.drop(columns=['price'])

In [None]:
# Target vector: the car price
y = df.loc[:, 'price']
y.head(n=5)

0    13495.0
1    16500.0
2    16500.0
3    13950.0
4    17450.0
Name: price, dtype: float64

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

In [None]:
# Define the models
# Candidate regressors, keyed by the name used in the results table.
regressors = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42, objective='reg:squarederror'),
    'LinearRegression': LinearRegression(),
}

# Every pipeline receives its own unfitted copy of the preprocessor.
# Pipeline.fit() fits its steps in place, so sharing a single ColumnTransformer
# instance would let fitting one pipeline overwrite the fitted state that the
# other pipelines rely on.
models = {
    name: Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('regressor', regressor),
    ])
    for name, regressor in regressors.items()
}

In [None]:
# Fit every candidate pipeline and score it on the held-out test split
results = {}
for model_name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    results[model_name] = {
        'MAE': mean_absolute_error(y_test, y_pred),
        'MSE': mean_squared_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred),
    }

# One row per model, one column per metric
results_df = pd.DataFrame(results).transpose()
print(results_df)

                          MAE           MSE        R2
RandomForest      1323.837268  3.524119e+06  0.955359
GradientBoosting  1732.831822  5.578514e+06  0.929336
XGBoost           1608.597954  5.092694e+06  0.935490
LinearRegression  3700.772109  4.199681e+07  0.468018


Random Forest

In [None]:
# Final model: preprocessing followed by a seeded random-forest regressor
rf_steps = [
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
]
model = Pipeline(steps=rf_steps)

In [None]:
# Fit on the training split, then predict prices for the test split
# (Pipeline.fit returns the pipeline itself, so the calls can be chained)
y_pred = model.fit(X_train, y_train).predict(X_test)

In [None]:
# Calculate R2 score
# R2 (coefficient of determination) is a regression goodness-of-fit measure,
# not a classification accuracy, so it is reported under its own name.
# r2 is kept as a percentage, as before.
r2 = r2_score(y_test, y_pred) * 100
print(f"R2 score: {r2:.2f}%")

Accurcy: 95.53592710518895


In [None]:
# Example prediction
# A single hand-written car record; it has no 'price' entry because that is
# the value to be predicted.
example_data = dict(
    symboling=3,
    CarName='toyota corolla',
    fueltype='gas',
    aspiration='std',
    doornumber='four',
    carbody='sedan',
    drivewheel='fwd',
    enginelocation='front',
    wheelbase=88.6,
    carlength=168.8,
    carwidth=64.1,
    carheight=48.8,
    curbweight=2548,
    enginetype='dohc',
    cylindernumber='four',
    enginesize=130,
    fuelsystem='mpfi',
    boreratio=3.47,
    stroke=2.68,
    compressionratio=9.0,
    horsepower=111,
    peakrpm=5000,
    citympg=21,
    highwaympg=27,
)

In [None]:
# Wrap the record in a list so pandas builds a one-row DataFrame from it
example_records = [example_data]
example_df = pd.DataFrame(data=example_records)

In [None]:
# The pipeline applies its preprocessing step before the regressor, so the raw
# one-row frame can be passed to predict() directly
example_pred = model.predict(X=example_df)
print(f"Predicted price for the example data: ${example_pred[0]:.2f}")

Predicted price for the example data: $14088.08
