In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Read the car listings CSV (path is relative to the notebook's folder)
# and show (rows, columns) as a quick sanity check.
csv_path = '../Data/cardekho_dataset.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)
df.shape

(15411, 14)

In [3]:
# Print column names, non-null counts and dtypes to check for missing values
# and to see which columns are numeric vs. object (string) typed.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15411 entries, 0 to 15410
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         15411 non-null  int64  
 1   car_name           15411 non-null  object 
 2   brand              15411 non-null  object 
 3   model              15411 non-null  object 
 4   vehicle_age        15411 non-null  int64  
 5   km_driven          15411 non-null  int64  
 6   seller_type        15411 non-null  object 
 7   fuel_type          15411 non-null  object 
 8   transmission_type  15411 non-null  object 
 9   mileage            15411 non-null  float64
 10  engine             15411 non-null  int64  
 11  max_power          15411 non-null  float64
 12  seats              15411 non-null  int64  
 13  selling_price      15411 non-null  int64  
dtypes: float64(2), int64(6), object(6)
memory usage: 1.6+ MB


In [4]:
# Bucket the columns by dtype, then split the numeric ones into "discrete"
# (few distinct values) and "continuous" (everything else).
# On a fresh run the numeric list still includes 'Unnamed: 0' and the
# target 'selling_price'; they are removed in a later cell.
DISCRETE_MAX_UNIQUE = 25  # numeric columns with fewer distinct values are treated as discrete

num_features = df.select_dtypes(include=[np.number]).columns
print('Numerical Features:',num_features)

cat_features = df.select_dtypes(include=['object']).columns
print('Categorical Features:',cat_features)

# nunique(dropna=False) gives the same count as len(series.unique()),
# i.e. NaN (if any) is counted as its own value.
discrete_features = [
    feature for feature in num_features
    if df[feature].nunique(dropna=False) < DISCRETE_MAX_UNIQUE
]
print('Discrete Features Count:',len(discrete_features))
print(discrete_features)

continuous_features = [feature for feature in num_features if feature not in discrete_features]
print('Continuous Features Count:',len(continuous_features))
print(continuous_features)

Numerical Features: Index(['Unnamed: 0', 'vehicle_age', 'km_driven', 'mileage', 'engine',
       'max_power', 'seats', 'selling_price'],
      dtype='object')
Index(['Unnamed: 0', 'vehicle_age', 'km_driven', 'mileage', 'engine',
       'max_power', 'seats', 'selling_price'],
      dtype='object')
Categorical Features: Index(['car_name', 'brand', 'model', 'seller_type', 'fuel_type',
       'transmission_type'],
      dtype='object')
Index(['car_name', 'brand', 'model', 'seller_type', 'fuel_type',
       'transmission_type'],
      dtype='object')
Discrete Features Count: 2
['vehicle_age', 'seats']
Continuous Features Count: 6
['Unnamed: 0', 'km_driven', 'mileage', 'engine', 'max_power', 'selling_price']


In [5]:
# Drop the leftover 'Unnamed: 0' column and the free-text 'car_name'.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError for columns that
# were already removed.
df = df.drop(columns=['Unnamed: 0', 'car_name'], errors='ignore')

In [6]:
# Separate the predictors (every column except the price) from the target.
target_column = 'selling_price'
y = df[target_column]
X = df.drop(columns=[target_column])
(X.shape, y.shape)

((15411, 11), (15411,))

In [7]:
# Number of distinct values in 'model' (NaN, if present, counted as a value).
df['model'].nunique(dropna=False)

120

In [8]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the two high-cardinality text columns.
# Each column gets its own fitted encoder: refitting one shared encoder would
# discard the first column's mapping, making it impossible to inverse-transform
# or to encode new rows consistently.
label_encoders = {}
for column in ('model', 'brand'):
    encoder = LabelEncoder()
    # Fit on the untouched strings in df (aligned to X's rows) rather than on
    # X itself, so re-running this cell does not re-encode already-encoded ints.
    X[column] = encoder.fit_transform(df.loc[X.index, column])
    label_encoders[column] = encoder

# Keep the original name bound to the last-fitted (brand) encoder, as before.
le = label_encoders['brand']

In [9]:
# Remove the non-feature numeric columns by name instead of by position.
# Positional delete(0)/delete(-1) strips one more real feature from each end
# every time the cell is re-run; a name-based drop with errors='ignore' is
# idempotent and does not depend on column order.
num_features = num_features.drop(['Unnamed: 0', 'selling_price'], errors='ignore')
num_features

Index(['vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats'], dtype='object')

In [10]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Low-cardinality text columns that get one-hot encoded.
one_hot_columns = ['seller_type', 'fuel_type', 'transmission_type']

# Single-step pipelines: standardise numeric columns; one-hot encode the
# categorical ones, dropping the first level of each.
standard_scaler = Pipeline([('scaler', StandardScaler())])
one_hot_encoder = Pipeline([('onehot', OneHotEncoder(drop='first'))])

# (name, transformer, columns) triples; any column of X not listed here
# is passed through unchanged and appended after the transformed ones.
column_steps = [
    ('num', standard_scaler, num_features),
    ('cat', one_hot_encoder, one_hot_columns),
]
preprocessor = ColumnTransformer(transformers=column_steps, remainder='passthrough')

In [11]:
# Fit the preprocessor on X and transform it in one step.
# NOTE(review): this fits on every row of X, before any train/test split, so
# the fitted scaling statistics and category levels include rows that may
# later be held out for testing.
X_processed = preprocessor.fit_transform(X)

In [12]:
# Preview the first rows of the transformed matrix. Only a cell's last
# expression is displayed, so the former bare `X_processed` line was a no-op.
pd.DataFrame(X_processed).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.983562,1.247335,-0.000276,-1.324259,-1.263352,-0.403022,1.0,0.0,0.0,0.0,0.0,1.0,1.0,18.0,7.0
1,-0.343933,-0.690016,-0.192071,-0.554718,-0.432571,-0.403022,1.0,0.0,0.0,0.0,0.0,1.0,1.0,8.0,54.0
2,1.647309,0.084924,-0.647583,-0.554718,-0.479113,-0.403022,1.0,0.0,0.0,0.0,0.0,1.0,1.0,8.0,118.0
3,0.983562,-0.360667,0.292211,-0.93661,-0.779312,-0.403022,1.0,0.0,0.0,0.0,0.0,1.0,1.0,18.0,7.0
4,-0.01206,-0.496281,0.735736,0.022918,-0.046502,-0.403022,0.0,0.0,1.0,0.0,0.0,0.0,1.0,6.0,38.0


In [13]:
from sklearn.model_selection import train_test_split

# Split the raw features first, then fit the preprocessor on the training rows
# only and merely apply it to the test rows. Fitting on the full data set
# before splitting leaks test-set statistics (scaler mean/variance, category
# levels) into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
X_train = preprocessor.fit_transform(X_train_raw)
# NOTE(review): with default settings OneHotEncoder raises on categories not
# seen during fit; if a rare level ever lands only in the test split, configure
# handle_unknown on the encoder.
X_test = preprocessor.transform(X_test_raw)
X_train.shape, X_test.shape

((12328, 15), (3083, 15))

In [14]:
from xgboost import XGBRegressor

In [15]:
# Gradient-boosted tree regressor with library-default hyperparameters.
# The seed is pinned explicitly so results stay reproducible even if
# row/column subsampling is enabled later.
xgb_reg = XGBRegressor(random_state=0)

In [16]:
# Train the regressor on the training split.
xgb_reg.fit(X=X_train, y=y_train)

In [17]:
# Evaluate on the held-out split; for scikit-learn style regressors
# score() reports the coefficient of determination (R^2).
xgb_reg.score(X=X_test, y=y_test)

0.7643781900405884