In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

df = pd.read_csv("car data.csv")

df.columns = df.columns.str.strip()

print(df.info())

le_car_name = LabelEncoder()
le_fueltype = LabelEncoder()
le_sp = LabelEncoder()
le_transmission = LabelEncoder()

df['Car_Name'] = le_car_name.fit_transform(df['Car_Name'])
df['Fuel_Type'] = le_fueltype.fit_transform(df['Fuel_Type'])
df['Selling_type'] = le_sp.fit_transform(df['Selling_type'])
df['Transmission'] = le_transmission.fit_transform(df['Transmission'])

df.drop_duplicates(inplace=True)

X = df.drop('Selling_Price', axis=1)
y = df['Selling_Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
print("Linear Regression R2 Score:", r2_score(y_test, y_pred_lr) * 100)
print("Linear Regression MAE:", mean_absolute_error(y_test, y_pred_lr))
print()

rf_model = RandomForestRegressor(n_estimators=300, max_depth=None, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print("Random Forest R2 Score:", r2_score(y_test, y_pred_rf) * 100)
print("Random Forest MAE:", mean_absolute_error(y_test, y_pred_rf))
print()

xgb_model = XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
print("XGBoost R2 Score:", r2_score(y_test, y_pred_xgb) * 100)
print("XGBoost MAE:", mean_absolute_error(y_test, y_pred_xgb))



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Driven_kms     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Selling_type   301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB
None
Linear Regression R2 Score: 73.49707530639917
Linear Regression MAE: 1.5719610024166668

Random Forest R2 Score: 59.09588119760863
Random Forest MAE: 1.3296166666666658

XGBoost R2 Score: 79.72948007666973
XGBoost MAE: 1.0281714726090432


In [3]:
df

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,90,2014,3.35,5.59,27000,2,0,1,0
1,93,2013,4.75,9.54,43000,1,0,1,0
2,68,2017,7.25,9.85,6900,2,0,1,0
3,96,2011,2.85,4.15,5200,2,0,1,0
4,92,2014,4.60,6.87,42450,1,0,1,0
...,...,...,...,...,...,...,...,...,...
296,69,2016,9.50,11.60,33988,1,0,1,0
297,66,2015,4.00,5.90,60000,2,0,1,0
298,69,2009,3.35,11.00,87934,2,0,1,0
299,69,2017,11.50,12.50,9000,1,0,1,0


Sample Data Testing

In [4]:
sample_data = ['Maruti Swift',2015,5.5,50000,'Petrol','Individual','Manual',0]
new_df=pd.DataFrame(columns=X_train.columns)
new_df.loc[0]=sample_data
new_df['Car_Name'] = le_car_name.fit_transform(new_df['Car_Name'])
new_df['Fuel_Type'] = le_fueltype.fit_transform(new_df['Fuel_Type'])
new_df['Selling_type'] = le_sp.fit_transform(new_df['Selling_type'])
new_df['Transmission'] = le_transmission.fit_transform(new_df['Transmission'])
pre_lr=lr_model.predict(new_df)
pre_rf=rf_model.predict(new_df)
pre_xgb=xgb_model.predict(new_df)
print("Predicted Selling Price (Linear Regression):",pre_lr)
print("Predicted Selling Price (Random Forest):",pre_rf)
print("Predicted Selling Price (XGBoost):",pre_xgb)

Predicted Selling Price (Linear Regression): [7.58014245]
Predicted Selling Price (Random Forest): [3.58883333]
Predicted Selling Price (XGBoost): [3.7863016]
