### Scikit-Learn for Regression Problem


In [42]:
car_sales_dataset = '/home/hp/Documents/College/Coding/Machine Learning/zero_to_mastery_course/csv/car-sales-extended-missing-data.csv'

In [43]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

In [44]:
car_sales = pd.read_csv(car_sales_dataset)

In [45]:
car_sales.head(5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [46]:
car_sales.isnull().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [47]:
car_sales.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [48]:
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [49]:
X = car_sales.drop("Price", axis=1)
y = car_sales["Price"]

In [50]:
car_sales.dropna(subset=["Price"],inplace=True)

In [52]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [54]:
categorical_features = ["Make", "Colour"]
categorical_transformer  = Pipeline(steps=
                                        [("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
                                        ("onehot", OneHotEncoder(handle_unknown="ignore"))])


door_features = ["Doors"]
door_transformer = Pipeline(steps=
                                [("imputer", SimpleImputer(strategy="constant", fill_value=4))])

numeric_features = ["Odometer (KM)"]
numeric_transformer = Pipeline(steps=
                                    [("imputer", SimpleImputer(strategy="mean"))])


preprocessor = ColumnTransformer(transformers=[("cat", categorical_transformer, categorical_features),
                                                ("door", door_transformer, door_features),
                                                ("num", numeric_transformer,numeric_features)])

# model = Pipeline(steps=[("preprocessor", preprocessor),
#                         ("model", RandomForestRegressor())])

# #splitting the data
# X= car_sales.drop("Price", axis=1)
# y = car_sales["Price"]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# model.fit(X_train, y_train)
# model.score(X_test, y_test)


In [56]:
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor

In [60]:
regression_models = {"Ridge": Ridge(),
                    "SVR_linear" : SVR(kernel='linear'),
                    "SVR_rbf" : SVR(kernel='rbf'),
                    "RandomForestRegressor" : RandomForestRegressor()}

regression_results = {}

In [57]:
X = car_sales.drop("Price", axis=1)
y = car_sales["Price"]
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.2)

In [61]:
for model_name, model in regression_models.items():
    
    # Create a model Pipeline with a preprocessor step and model step
    model_pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                                      ("model", model)])
    
    # Fit the model Pipeline to the car sales training data
    print(f"Fitting {model_name}...")
    model_pipeline.fit(X_train, y_train)
    
    # Score the model Pipeline on the test data appending the model_name to the 
    # results dictionary
    print(f"Scoring {model_name}...")
    regression_results[model_name] = model_pipeline.score(X_test, y_test)

Fitting Ridge...
Scoring Ridge...
Fitting SVR_linear...
Scoring SVR_linear...
Fitting SVR_rbf...
Scoring SVR_rbf...
Fitting RandomForestRegressor...
Scoring RandomForestRegressor...


In [62]:
regression_results

{'Ridge': 0.3885016309254252,
 'SVR_linear': 0.051905191305613,
 'SVR_rbf': -0.04700137906920698,
 'RandomForestRegressor': 0.3929186731761023}

In [68]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

ridge_pipeline = Pipeline(steps=[
                                ("preprocessor", preprocessor),
                                ("model", Ridge())
])

ridge_pipeline.fit(X_train, y_train)
car_y_preds = ridge_pipeline.predict(X_test)
car_y_preds[:50]

array([16974.35311117, 21134.43548333, 20036.0414533 , 19276.48072668,
       19180.01722869, 11953.10724129, 14917.49375996, 11432.07197269,
       10308.55755896, 12135.90958989, 10458.08905859, 16201.16268497,
       12849.97392439, 16063.12920109, 28586.78790135, 12173.16089175,
       11715.07342795, 11301.53786597, 17332.68869883, 11717.44568932,
       21906.64736607, 12251.69175193, 18917.13153059,  9782.58922255,
       11825.79094804, 17931.86632312, 24790.18526275, 17934.93636212,
       12392.42229641, 18883.46472902, 10313.71485614, 17633.63904623,
       15375.71151934, 19571.97958558, 10834.80082829, 13191.54519028,
       11370.9743835 , 18961.49831794,  9531.87730632, 14418.54071799,
       18849.67363573, 21123.52851654, 19427.6539997 , 13198.59702152,
       13711.65416284, 16791.02088354, 18338.37544803, 16723.94669624,
       14418.54071799, 20966.8508232 ])

In [69]:
mse = mean_squared_error(y_true=y_test, y_pred=car_y_preds)
mse

55896225.55405629

In [71]:
mae = mean_absolute_error(y_true=y_test, y_pred=car_y_preds)
mae

6294.461667942582

In [72]:
r2 = r2_score(y_true=y_test, y_pred = car_y_preds)
r2

0.3885016309254252