In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

In [4]:
df=pd.read_csv('/content/cars.csv')
df

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000
...,...,...,...,...,...
8123,Hyundai,110000,Petrol,First Owner,320000
8124,Hyundai,119000,Diesel,Fourth & Above Owner,135000
8125,Maruti,120000,Diesel,First Owner,382000
8126,Tata,25000,Diesel,First Owner,290000


In [5]:
X_train,X_test,y_train,y_test=train_test_split(df.drop(columns=['selling_price']),df['selling_price'],test_size=0.2,random_state=42)

In [6]:
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
6518,Tata,2560,Petrol,First Owner
6144,Honda,80000,Petrol,Second Owner
6381,Hyundai,150000,Diesel,Fourth & Above Owner
438,Maruti,120000,Diesel,Second Owner
5939,Maruti,25000,Petrol,First Owner


In [9]:
df['owner'].value_counts()

Unnamed: 0_level_0,count
owner,Unnamed: 1_level_1
First Owner,5289
Second Owner,2105
Third Owner,555
Fourth & Above Owner,174
Test Drive Car,5


In [12]:
transformer=ColumnTransformer(
    [
        ('ordinal',OrdinalEncoder(categories=[['Test Drive Car', 'Fourth & Above Owner', 'Third Owner', 'Second Owner', 'First Owner']]),['owner']),
        ('onehot',OneHotEncoder(sparse_output=False),['brand','fuel'])
    ],
    remainder='passthrough'
)

In [13]:
transformer

In [15]:
X_train_transformed=transformer.fit_transform(X_train)
X_test_transformed=transformer.transform(X_test)

In [16]:
transformer.set_output(transform='pandas')

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [19]:
transformer.get_feature_names_out()

array(['ordinal__owner', 'onehot__brand_Ambassador',
       'onehot__brand_Ashok', 'onehot__brand_Audi', 'onehot__brand_BMW',
       'onehot__brand_Chevrolet', 'onehot__brand_Daewoo',
       'onehot__brand_Datsun', 'onehot__brand_Fiat',
       'onehot__brand_Force', 'onehot__brand_Ford', 'onehot__brand_Honda',
       'onehot__brand_Hyundai', 'onehot__brand_Isuzu',
       'onehot__brand_Jaguar', 'onehot__brand_Jeep', 'onehot__brand_Kia',
       'onehot__brand_Land', 'onehot__brand_Lexus', 'onehot__brand_MG',
       'onehot__brand_Mahindra', 'onehot__brand_Maruti',
       'onehot__brand_Mercedes-Benz', 'onehot__brand_Mitsubishi',
       'onehot__brand_Nissan', 'onehot__brand_Opel',
       'onehot__brand_Peugeot', 'onehot__brand_Renault',
       'onehot__brand_Skoda', 'onehot__brand_Tata',
       'onehot__brand_Toyota', 'onehot__brand_Volkswagen',
       'onehot__brand_Volvo', 'onehot__fuel_CNG', 'onehot__fuel_Diesel',
       'onehot__fuel_LPG', 'onehot__fuel_Petrol', 'remainder__km_drive

In [20]:
transformer.n_features_in_

4

In [21]:
transformer.transformers_

[('ordinal',
  OrdinalEncoder(categories=[['Test Drive Car', 'Fourth & Above Owner',
                              'Third Owner', 'Second Owner', 'First Owner']]),
  ['owner']),
 ('onehot', OneHotEncoder(sparse_output=False), ['brand', 'fuel']),
 ('remainder',
  FunctionTransformer(accept_sparse=True, check_inverse=False,
                      feature_names_out='one-to-one'),
  [1])]

In [22]:
df = pd.read_csv('cars.csv')
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [23]:
import numpy as np

np.random.seed(42)
missing_km_indices = np.random.choice(df.index, size=int(0.05*len(df)), replace=False)
df.loc[missing_km_indices, 'km_driven'] = np.nan

# Introduce missing values in 'owner' column (1% missing values)
missing_owner_indices = np.random.choice(df.index, size=int(0.01*len(df)), replace=False)
df.loc[missing_owner_indices, 'owner'] = np.nan



In [24]:
df.isnull().sum()

Unnamed: 0,0
brand,0
km_driven,406
fuel,0
owner,81
selling_price,0


In [25]:
X_train,X_test,y_train,y_test=train_test_split(df.drop(columns='selling_price'),df['selling_price'],random_state=42,test_size=0.2)

In [26]:
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
6518,Tata,2560.0,Petrol,First Owner
6144,Honda,80000.0,Petrol,Second Owner
6381,Hyundai,150000.0,Diesel,Fourth & Above Owner
438,Maruti,120000.0,Diesel,Second Owner
5939,Maruti,25000.0,Petrol,First Owner


In [29]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest,chi2

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [30]:
X_train.head(1)

Unnamed: 0,brand,km_driven,fuel,owner
6518,Tata,2560.0,Petrol,First Owner


In [31]:
tf1=ColumnTransformer(
    [
        ('imput_km_driven',SimpleImputer(strategy='mean'),[1]),
        ('imput_owner',SimpleImputer(strategy='most_frequent'),[3])
    ],
    remainder='passthrough'
)

In [32]:
X_train.head(1)

Unnamed: 0,brand,km_driven,fuel,owner
6518,Tata,2560.0,Petrol,First Owner


In [33]:
trf2=ColumnTransformer(
    [
        ('ordinal',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1),[3]),
        ('Onehot',OneHotEncoder(handle_unknown='ignore',sparse_output=False),[0,2])
    ],
    remainder='passthrough'
)

In [34]:
# Scaling
trf3=ColumnTransformer(
    [
        ('scale',MinMaxScaler(),slice(0,38))
    ]
)

In [35]:
# Feature Selection
trf4=SelectKBest(score_func=chi2,k=10)

In [36]:
# Train model
trf5=RandomForestRegressor()

In [37]:
from sklearn.pipeline import Pipeline
pipe=Pipeline(
    [
        ('imputer',tf1),
        ('encoder',trf2),
        ('scaler',trf3),
        ('fselector',trf4),
        ('model',trf5)
    ]
)

In [38]:
pipe.fit(X_train,y_train)

In [39]:
pipe.feature_names_in_

array(['brand', 'km_driven', 'fuel', 'owner'], dtype=object)

In [40]:
pipe.named_steps

{'imputer': ColumnTransformer(remainder='passthrough',
                   transformers=[('imput_km_driven', SimpleImputer(), [1]),
                                 ('imput_owner',
                                  SimpleImputer(strategy='most_frequent'),
                                  [3])]),
 'encoder': ColumnTransformer(remainder='passthrough',
                   transformers=[('ordinal',
                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                 unknown_value=-1),
                                  [3]),
                                 ('Onehot',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [0, 2])]),
 'scaler': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 38, None))]),
 'fselector': SelectKBest(score_func=<function chi2 at 0x78cf941d93a0>),
 'model': Ran

In [41]:
# Predict
pipe.predict(np.array(['Maruti',100000.0,'Diesel','First Owner']).reshape(1,4))



array([631107.85033222])

In [42]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()

np.float64(-639113244101.0538)

HyperParameter Tuning

In [43]:
# gridsearchcv
params = {
    'model__max_depth':[1,2,3,4,5,None]
}

In [44]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

In [45]:
grid.best_score_

np.float64(-639088313514.5734)

In [46]:
grid.best_params_

{'model__max_depth': None}

In [47]:
# export
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))