In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Location of the training CSV. The path looks like a Colab Google Drive mount;
# change this single constant when running the notebook elsewhere.
DATA_PATH = "/content/drive/MyDrive/train-data.csv"
data = pd.read_csv(DATA_PATH)

# Apply seaborn's default plot theme (`sns.set` is only an alias of `set_theme`).
sns.set_theme()

# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas / scikit-learn warnings that may point at real problems.
warnings.filterwarnings("ignore")

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [None]:
# Checking for the shape of the data: (number of rows, number of columns)
data.shape

(6019, 14)

In [None]:
# Understand the data
# NOTE(review): this repeats the `data.head()` preview from the earlier cell;
# one of the two cells can be removed.
data.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [None]:
# Check null values: count the missing entries in each column
data.isna().sum()

Unnamed: 0              0
Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  42
New_Price            5195
Price                   0
dtype: int64

In [None]:
# Take actions on null values
# `New_Price` is missing in 5195 of the 6019 rows, so the whole column is dropped;
# afterwards the few rows that still contain a missing value are removed.
# Written as one chained assignment (no `inplace=True`) and with
# `errors='ignore'` so the cell can be re-run without a KeyError once
# `New_Price` is already gone.
data = data.drop(columns=['New_Price'], errors='ignore').dropna()

In [None]:
# Divide the data into X & y
# Columns are selected by name instead of by position, so the split does not
# silently pick different columns if the CSV layout changes.
FEATURE_COLUMNS = [
    'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type', 'Transmission',
    'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats',
]
TARGET_COLUMN = 'Price'

# `.copy()` gives X its own data: the column assignments in later cells then do
# not write into a slice of `data` (which raises pandas' SettingWithCopyWarning).
X = data[FEATURE_COLUMNS].copy()
y = data[TARGET_COLUMN]

In [None]:
# Sanity-check the split: first three target values, then the first feature rows.
print(y.head(3))
X.head()

0     1.75
1    12.50
2     4.50
Name: Price, dtype: float64


Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats
0,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0
1,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0
2,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0
3,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0
4,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0


<h1>Extract Numeric Values from Mileage, Engine & Power Columns</h1>

In [None]:
from typing import Optional


def extract_numeric_value(value: str) -> Optional[float]:
    """Pull the numeric part out of a value such as ``"19.67 kmpl"`` or ``"998 CC"``.

    Letters, whitespace and ``/`` are discarded; whatever characters remain
    are parsed with ``float``.

    Args:
        value: Raw cell value. It is converted with ``str`` first, so
            non-string inputs are accepted as well.

    Returns:
        The parsed number, or ``None`` when the remaining characters do not
        form a valid float (for example ``"null bhp"``, which leaves an
        empty string).
    """
    kept = "".join(
        ch for ch in str(value)
        if not (ch.isalpha() or ch.isspace() or ch == '/')
    )
    try:
        return float(kept)
    except ValueError:
        # Only a failed parse means "no numeric value". Anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return None

In [None]:
# Strip the unit text from `Mileage` and keep the number as a float.
# NOTE(review): the sample rows contain both "km/kg" and "kmpl" values; the unit is
# discarded without any conversion, so both land on one numeric scale — confirm intended.
X['Mileage'] = X['Mileage'].apply(extract_numeric_value)

In [None]:
# Strip the unit text (e.g. "CC") from `Engine` and keep the number as a float.
X['Engine'] = X['Engine'].apply(extract_numeric_value)

In [None]:
# Strip the unit text (e.g. "bhp") from `Power` and keep the number as a float.
# Entries the helper cannot parse come back as None and show up as missing values.
X['Power'] = X['Power'].apply(extract_numeric_value)

In [None]:
# Re-check missing values: entries that could not be parsed as numbers are now null.
X.isna().sum()

Location               0
Year                   0
Kilometers_Driven      0
Fuel_Type              0
Transmission           0
Owner_Type             0
Mileage                0
Engine                 0
Power                103
Seats                  0
dtype: int64

In [None]:
# (rows, feature columns) remaining after the null handling above.
X.shape

(5975, 10)

In [None]:
# filling the null values in `power` coln
# NOTE(review): the median is computed over all rows before the train/test split,
# so test rows influence the fill value. `SimpleImputer` is imported in a later
# cell but never used — presumably this step was meant to live inside the
# pipeline; TODO confirm.
X['Power'] = X['Power'].fillna(X['Power'].median())
X.isna().sum()

Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
dtype: int64

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse, r2_score

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Confirm the sizes of the four pieces produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4780, 10), (1195, 10), (4780,), (1195,))

In [None]:
# Create ColumnTransformer to encode and scale values of data:
# categorical columns are one-hot encoded, numeric columns are standardised.
categorical_features = ['Location', 'Fuel_Type', 'Transmission', 'Owner_Type']
numeric_features = ['Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats']

one_hot = OneHotEncoder(drop="first", sparse_output=True, handle_unknown="ignore")
scaler = StandardScaler()

# Despite its name, `clf1` is the preprocessing step, not a classifier.
clf1 = ColumnTransformer(
    transformers=[
        ('encode', one_hot, categorical_features),
        ('scaling', scaler, numeric_features),
    ],
    remainder="passthrough",
)

In [None]:
from sklearn.linear_model import LinearRegression

# Use LinearRegression (default settings) as the baseline model
clf2 = LinearRegression()

In [None]:
# Chain the preprocessing step and the linear model so that fitting and
# predicting always run both stages in order.
pipe = Pipeline([
    ('ColumnTransformer', clf1),
    ('Model', clf2)
])

In [None]:
# Fit on the training split only. Fitting on the full X, y would let the model
# see the test rows, which makes every score computed on X_test optimistic.
pipe.fit(X_train, y_train)

In [None]:
# Show the pipeline's steps by name to verify how it was assembled.
pipe.named_steps

{'ColumnTransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('encode',
                                  OneHotEncoder(drop='first',
                                                handle_unknown='ignore'),
                                  ['Location', 'Fuel_Type', 'Transmission',
                                   'Owner_Type']),
                                 ('scaling', StandardScaler(),
                                  ['Year', 'Kilometers_Driven', 'Mileage',
                                   'Engine', 'Power', 'Seats'])]),
 'Model': LinearRegression()}

In [None]:
# Predict prices for the held-out rows with the linear-regression pipeline.
y_pred = pipe.predict(X_test)

In [None]:
# Report R^2 and mean squared error of the linear model on the test split
# (`mse` is sklearn's mean_squared_error, imported under that alias).
print(f"The r2_score by LinearRegression Model is {r2_score(y_test, y_pred)}")
print(f"The Mean Squared Error by LinearRegression Model is {mse(y_test, y_pred)}")

The r2_score by LinearRegression Model is 0.6415590180270132
The Mean Squared Error by LinearRegression Model is 52.180657998160285


In [None]:
# Measuring goodness of fit. For a regressor, `score` returns the coefficient of
# determination R^2 — not a classification accuracy — so the labels say R^2.
# The value is scaled to percent before rounding to avoid float artefacts
# such as 72.53999999999999 in the printed output.
print("Training set R^2 (%):", round(pipe.score(X_train, y_train) * 100, 2))
print("Testing set R^2 (%):", round(pipe.score(X_test, y_test) * 100, 2))

Training set Accuracy: 72.54
Testing set Accuracy: 64.16


<h1>Time to call my Best Friend Random Forest 🌲</h1>

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees; the fixed random_state makes the fit reproducible.
clf3 = RandomForestRegressor(n_estimators=100, random_state=42)

In [None]:
from sklearn.base import clone

# Same preprocessing definition as the linear pipeline, but as an unfitted copy.
# Pipeline fits the step objects it is given in place (it does not clone them),
# so passing the very same `clf1` instance would refit the transformer used by
# `pipe` whenever `pipe2` is fitted.
pipe2 = Pipeline([
    ('clf1', clone(clf1)),
    ('clf3', clf3)
])

In [None]:
# Fit preprocessing + random forest on the training split only.
pipe2.fit(X_train, y_train)

In [None]:
# Predict prices for the held-out rows with the random-forest pipeline.
# Note: this overwrites `y_pred` from the linear-regression cell.
y_pred = pipe2.predict(X_test)

In [None]:
# Report R^2 and mean squared error of the random forest on the test split.
print(f"The r2_score by Random Forest Regressor Model is {r2_score(y_test, y_pred)}")
print(f"The Mean Squared Error by Random Forest Regressor Model is {mse(y_test, y_pred)}")

The r2_score by Random Forest Regressor Model is 0.8403879723043517
The Mean Squared Error by Random Forest Regressor Model is 23.235793473546472


In [None]:
# Measuring goodness of fit. For a regressor, `score` returns the coefficient of
# determination R^2 — not a classification accuracy — so the labels say R^2.
# The value is scaled to percent before rounding to avoid float artefacts
# in the printed output.
print("Training set R^2 (%):", round(pipe2.score(X_train, y_train) * 100, 2))
print("Testing set R^2 (%):", round(pipe2.score(X_test, y_test) * 100, 2))

Training set Accuracy: 98.64
Testing set Accuracy: 84.04
