## Exploratory Data Analysis (EDA)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Load the car sales dataset from a CSV file into a pandas DataFrame.
# The path is relative, so it is resolved against the notebook's working directory.
car_sales = pd.read_csv('data/car_sales.csv')

In [None]:
# Confirm the type of the object returned by pd.read_csv
type(car_sales)

In [None]:
# Preview the first few rows of the dataset
car_sales.head()

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
car_sales.info()

In [None]:
# Descriptive statistics for the dataset's columns
car_sales.describe()

In [None]:
# Data type of each column
car_sales.dtypes

In [None]:
# Count the missing values in each column
car_sales.isna().sum()

In [None]:
# Drop rows whose 'Price' is missing: 'Price' is the regression target below,
# and a row without a label can neither train nor score a model.
# Reassign the result instead of using inplace=True so the transformation is
# explicit and the cell reads as "new frame from old frame".
car_sales = car_sales.dropna(subset=['Price'])

In [None]:
# Histogram of 'Price' with a KDE overlay, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=car_sales, x='Price', kde=True, ax=ax)
ax.set_title('Price Distribution')
plt.show()

## Data Preprocessing

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [None]:
# Columns routed through the categorical transformer
categorical_features = ["Make", "Colour"]

# Fill missing entries with the literal label "missing", then one-hot encode.
# handle_unknown="ignore" lets transform() accept categories not seen in fit().
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(fill_value="missing", strategy="constant")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [None]:
# Column routed through the numerical transformer
numerical_features = ["Odometer (KM)"]

# Replace missing values with the column median
numerical_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median"))]
)

In [None]:
# The "Doors" column gets its own transformer
door_feature = ['Doors']

# Missing door counts are filled with the constant value 4
door_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(fill_value=4, strategy='constant')),
    ]
)

In [None]:
# Combine the three transformers, each applied to its own column group.
# NOTE(review): columns not listed here are handled by ColumnTransformer's
# `remainder` default -- confirm that is the intended behaviour.
preprocessor = ColumnTransformer(
    [
        ("cat", categorical_transformer, categorical_features),
        ("num", numerical_transformer, numerical_features),
        ("door", door_transformer, door_feature),
    ]
)

## Model Building + Evaluation (Default Metrics)

In [None]:
# Target is the 'Price' column; the features are every other column
y = car_sales['Price']
X = car_sales.drop(columns=['Price'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the train/test feature and target splits
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# Import Ridge from sklearn's linear_model module
from sklearn.linear_model import Ridge

# Import SVR from sklearn's svm module
from sklearn.svm import SVR

# Import RandomForestRegressor from sklearn's ensemble module
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Build one "preprocessor + regressor" pipeline per candidate model, keyed by
# model name ({"model_name": pipeline}). There are four candidates in total;
# SVR appears twice, once with a "linear" kernel and once with an "rbf" kernel.
from sklearn.base import clone

regressors = {
    'Ridge': Ridge(),
    'SVR_linear': SVR(kernel='linear'),
    'SVR_rbf': SVR(kernel='rbf'),
    'RandomForestRegressor': RandomForestRegressor(),
}

# Pipeline does not copy the steps it is given, so handing the same
# `preprocessor` object to every pipeline would make them all share (and
# repeatedly refit) a single transformer. clone() gives each pipeline its own
# unfitted copy with identical settings.
regression_models = {
    model_name: Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('regressor', regressor),
    ])
    for model_name, regressor in regressors.items()
}

# Filled in by the fitting loop with {model_name: test-set score}
results = {}

In [None]:
# Seed NumPy's global random state before any model is fitted
np.random.seed(42)

# Fit every pipeline on the training split and record its default
# score on the test split under the model's name
for model_name, model in regression_models.items():
    results[model_name] = model.fit(X_train, y_train).score(X_test, y_test)

results

## Evaluating Ridge Regression Model With Other Metrics

In [None]:
# Import mean_absolute_error from sklearn's metrics module
###
from sklearn.metrics import mean_absolute_error

# Import mean_squared_error from sklearn's metrics module
###
from sklearn.metrics import mean_squared_error

# Import r2_score from sklearn's metrics module
###
from sklearn.metrics import r2_score

In [None]:
# Baseline Ridge pipeline with default hyperparameters, fitted on the
# training split (fit() returns the pipeline itself)
ridge_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", Ridge()),
    ]
).fit(X_train, y_train)

# Predict on the test split and show the first 50 predictions
y_preds = ridge_pipeline.predict(X_test)
y_preds[:50]

In [None]:
# Collect the three regression metrics for the Ridge test-set predictions
regression_metrics = dict(
    MSE=mean_squared_error(y_test, y_preds),
    MAE=mean_absolute_error(y_test, y_preds),
    R2S=r2_score(y_test, y_preds),
)

# Report each metric on its own line as "<name> : <value>"
for metric_name, metric_value in regression_metrics.items():
    print(f'{metric_name} : {metric_value}')

In [None]:
# Mean squared error of the Ridge predictions on the test split
mse = mean_squared_error(y_true=y_test, y_pred=y_preds)
mse

In [None]:
# Mean absolute error of the Ridge predictions on the test split
mae = mean_absolute_error(y_true=y_test, y_pred=y_preds)
mae

In [None]:
# R^2 score of the Ridge predictions on the test split
r2s = r2_score(y_true=y_test, y_pred=y_preds)
r2s

## Hyperparameter Tuning

### Ridge Tuning

In [None]:
# Search space for the Ridge hyperparameters
from scipy.stats import uniform

# uniform(loc, scale) spans [loc, loc + scale], so alpha is drawn
# continuously from 0.01 to 10.01. The "regressor__" prefix addresses the
# pipeline step named 'regressor'.
ridge_param_dist = dict(
    regressor__alpha=uniform(loc=0.01, scale=10),
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search over the Ridge alpha: 100 sampled candidates, each
# evaluated with 5-fold cross-validation, run on a single worker with a
# fixed random_state so the sampled candidates are repeatable
ridge_rf = RandomizedSearchCV(
    ridge_pipeline,
    ridge_param_dist,
    n_iter=100,
    cv=5,
    random_state=42,
    n_jobs=1,
    verbose=True,
)

ridge_rf.fit(X_train, y_train)

In [None]:
# Hyperparameter setting selected by the randomized search
ridge_rf.best_params_

In [None]:
# Score the fitted search object on the held-out test split
ridge_rf.score(X_test, y_test)

In [None]:
# Rebuild the Ridge pipeline with the alpha selected by the randomized search.
# The value is read from `ridge_rf.best_params_` rather than pasted from an
# earlier run's output, so this cell stays in sync if the data, the search
# space, or the library version changes the selected alpha.
best_alpha = ridge_rf.best_params_['regressor__alpha']

ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=best_alpha))
])

# Refit on the training split and report the score on the test split
ridge_pipeline.fit(X_train, y_train)
ridge_pipeline.score(X_test, y_test)