In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

> # **Car Price Prediction** 

# **Import Libraries**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# **Import Dataset**

In [None]:
# Read the car listings CSV from the Kaggle input directory and display the frame.
csv_path = '/kaggle/input/car-price-predictionused-cars/car data.csv'
df = pd.read_csv(csv_path)
df

# **Data Preprocessing**

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Preview the last rows of the dataset.
df.tail()

In [None]:
# List the column labels of the frame.
df.columns

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Show descriptive summary statistics for the frame.
display(df.describe())

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# True if at least one row is flagged as a duplicate of another row.
df.duplicated().any()

In [None]:
# Count the rows flagged as duplicates.
duplicate_mask = df.duplicated()
duplicate_values = duplicate_mask.sum()
duplicate_values

In [None]:
# Missing-value count for each column.
null_values = df.isnull().sum()
null_values

In [None]:
# Remove duplicated rows; the de-duplicated frame is bound back to `df`.
df = df.drop_duplicates()

# **Exploratory Data Analysis**

In [None]:
# Histogram grid for the frame's columns. Ending the cell with plt.show()
# renders the figure and keeps the array-of-Axes repr returned by df.hist()
# out of the cell output, consistent with the other plotting cells.
df.hist(figsize=(12, 12))
plt.show()

In [None]:
# Columns included in the correlation analysis.
numerical_columns = ['Year', 'Selling_Price', 'Present_Price', 'Driven_kms', 'Owner']

# Select those columns and compute their pairwise correlations.
numerical_df = df.loc[:, numerical_columns]
corr_matrix = numerical_df.corr()

In [None]:
# Display the correlation matrix computed in the previous cell.
corr_matrix

In [None]:
# Annotated heatmap of the pairwise correlations held in corr_matrix.
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='pink', linewidths=1, ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()

1. Selling price and present price show a strong positive correlation which means present price of cars likely influences selling price.
2. Year and Driven_kms show a negative correlation, which means newer cars (higher Year value) tend to have lower mileage and are likely to sell for more.

In [None]:
import warnings
# Ignore FutureWarning messages from this point on.
warnings.filterwarnings("ignore", category=FutureWarning)

# Pairwise relationship plots for the selected columns.
pairplot_columns = ['Present_Price', 'Year', 'Driven_kms', 'Selling_Price']
sns.pairplot(df[pairplot_columns])
plt.show()

In [None]:
# Columns for which a count plot is drawn.
columns = ['Fuel_Type', 'Transmission', 'Owner', 'Selling_type']

# One figure per column, with the x tick labels rotated by 90 degrees.
for col_name in columns:
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.countplot(x=col_name, data=df, ax=ax)
    ax.set_title(f'this is a count plot of {col_name}')
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Box plot of the Selling_Price column.
ax = sns.boxplot(df['Selling_Price'])
ax.set_title('Selling Price Distribution')
plt.show()

In [None]:
# Box plot of the Present_Price column.
ax = sns.boxplot(df['Present_Price'])
ax.set_title('Present Price Distribution')
plt.show()

# **Feature Engineering**

In [None]:
# Reference year from which each car's Year is subtracted to obtain its age.
REFERENCE_YEAR = 2024

df['Car_Age'] = REFERENCE_YEAR - df['Year']
df.head()

# **Data Transformation**

In [None]:
# Inspect the column dtypes.
df.dtypes

In [None]:
# Frequency of each distinct Fuel_Type value.
df['Fuel_Type'].value_counts()

In [None]:
# Frequency of each distinct Selling_type value.
df['Selling_type'].value_counts()

In [None]:
# Frequency of each distinct Transmission value.
df['Transmission'].value_counts()

In [None]:
# Integer code assigned to each fuel label.
ordinal_map = {'CNG': 2, 'Diesel': 1, 'Petrol': 0}

# Encode only while the column still holds the raw labels. Re-running this cell
# on the already-encoded integers would otherwise map every value to NaN and
# make the int64 cast fail.
if not pd.api.types.is_numeric_dtype(df['Fuel_Type']):
    fuel_codes = df['Fuel_Type'].map(ordinal_map)

    # Report values missing from ordinal_map explicitly instead of letting them
    # surface as a NaN-to-integer casting error.
    unmapped = df.loc[fuel_codes.isna(), 'Fuel_Type'].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped Fuel_Type values: {list(unmapped)}")

    df['Fuel_Type'] = fuel_codes.astype('int64')

In [None]:
# Replace the listed categorical columns with dummy/indicator columns.
categorical_columns = ['Selling_type', 'Transmission']
df = pd.get_dummies(df, columns=categorical_columns)

In [None]:
# Inspect the column dtypes again after the encoding steps above.
df.dtypes

In [None]:
# Indicator columns produced by the get_dummies step.
bool_columns = [
    'Selling_type_Dealer',
    'Selling_type_Individual',
    'Transmission_Automatic',
    'Transmission_Manual',
]

# Cast each indicator column to int64.
df = df.astype({name: 'int64' for name in bool_columns})

In [None]:
# Preview the first rows after the transformations above.
df.head()

# **Model Building**

In [None]:
# Target column.
y = df['Selling_Price']

# Feature matrix: every column except the car name and the target.
X = df.drop(columns=['Car_Name', 'Selling_Price'])

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [1]:
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score

In [None]:
def evaluation(y, predictions):
    """Compute regression metrics for a set of predictions.

    Args:
        y: True target values.
        predictions: Predicted values, passed together with ``y`` to each
            metric function.

    Returns:
        dict with the keys 'MAE', 'MSE', 'RMSE' (square root of the MSE)
        and 'R-squared'.
    """
    mae = mean_absolute_error(y, predictions)
    mse = mean_squared_error(y, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(y, predictions)
    return {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R-squared': r2}

## **Linear Regression Model**

In [None]:
from sklearn.linear_model import LinearRegression

model_name = "LinearRegression"

# Create the linear regression model and fit it on the training split.
lin_reg = LinearRegression().fit(X_train, y_train)

# Predict on the held-out test split.
predictions = lin_reg.predict(X_test)

# Collect 'MAE', 'MSE', 'RMSE' and 'R-squared', then tag them with the model name.
evaluation_results_l = evaluation(y_test, predictions)
evaluation_results_l["Model"] = model_name

# One-row table holding this model's results.
models = pd.DataFrame([evaluation_results_l])

print("Evaluation Results for", model_name)
print("-" * 30)
for metric_name, metric_value in evaluation_results_l.items():
    print(f"{metric_name}: {metric_value}")

## **Random Forest Model**

In [None]:
from sklearn.ensemble import RandomForestRegressor

model_name = "RandomForest"

# Create a 100-estimator random forest with a fixed random_state and fit it
# on the training split.
rand_for = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)

# Predict on the held-out test split.
predictions = rand_for.predict(X_test)

# Collect 'MAE', 'MSE', 'RMSE' and 'R-squared', then tag them with the model name.
evaluation_results_r = evaluation(y_test, predictions)
evaluation_results_r["Model"] = model_name

# Note: this rebinds `models`, so afterwards it holds only this model's row.
models = pd.DataFrame([evaluation_results_r])

print("Evaluation Results for", model_name)
print("-" * 30)
for metric_name, metric_value in evaluation_results_r.items():
    print(f"{metric_name}: {metric_value}")

## **Gradient Boosting Regressor Model**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

model_name = "GradientBoostingRegressor"

# Create a gradient boosting regressor with a fixed random_state and fit it
# on the training split.
g_boost = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)

# Predict on the held-out test split.
predictions = g_boost.predict(X_test)

# Collect 'MAE', 'MSE', 'RMSE' and 'R-squared', then tag them with the model name.
evaluation_results_g = evaluation(y_test, predictions)
evaluation_results_g["Model"] = model_name

# Note: this rebinds `models`, so afterwards it holds only this model's row.
models = pd.DataFrame([evaluation_results_g])

print("Evaluation Results for", model_name)
print("-" * 30)
for metric_name, metric_value in evaluation_results_g.items():
    print(f"{metric_name}: {metric_value}")


## **Models Comparison**

In [None]:
# R-squared on the test split for each fitted model. Each label names the
# estimator that produced the score: the third entry comes from the
# GradientBoostingRegressor results (evaluation_results_g), not from XGBoost.
model_results = [
    {"model": "LinearRegression", "r2 score": evaluation_results_l['R-squared']},
    {"model": "RandomForestRegressor", "r2 score": evaluation_results_r['R-squared']},
    {"model": "GradientBoostingRegressor", "r2 score": evaluation_results_g['R-squared']},
]

In [None]:
# Order the models by their r2 score, lowest first
sorted_results = list(model_results)
sorted_results.sort(key=lambda entry: entry["r2 score"])

# Print the ranking, one model per line
for entry in sorted_results:
    print(entry)

In [None]:
# Split model_results into two parallel lists used by the bar chart below.
model_names = [model["model"] for model in model_results]
# NOTE: despite its name, rmse_values holds the "r2 score" entries, not RMSE.
rmse_values = [model["r2 score"] for model in model_results]

In [None]:
# Bar chart of the per-model scores held in model_names / rmse_values.
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(model_names, rmse_values, color=['green', 'blue', 'orange'])
ax.set_xlabel('Models')
ax.set_ylabel('r2 score')
ax.set_title('Comparison of r2 score Values for Different Models')
plt.show()
     

The results show that the **linear regression model** performs best, as its R² score is higher than those of the other models.