In [14]:
import joblib
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, LabelEncoder, FunctionTransformer

from preprocessing  import convert_dtype, MultiColumnLabelEncoder
# Read the semicolon-delimited CO2 emissions CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='co2_emissions.csv', delimiter=';')

# Reference copy only (commented out): the live convert_dtype is imported from the preprocessing module above.
# def convert_dtype(X):
#     X_copy = X.copy()
#     X_copy['cylinders'] = X_copy['cylinders'].astype('category')
#     return X_copy

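# Reference copy only (commented out): the live MultiColumnLabelEncoder is imported from the preprocessing module above.
# NOTE(review): this copy calls LabelEncoder.fit_transform inside transform(), so the encoders are re-fitted on
# every call and the mapping seen during training is not reused at predict time. Confirm whether the imported
# version behaves the same way.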
# class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
#     def __init__(self, columns=None):
#         self.columns = columns

#     def fit(self, X, y=None):
#         return self

#     def transform(self, X):
#         output = X.copy()
#         if self.columns is not None:
#             for col in self.columns:
#                 le = LabelEncoder()
#                 output[col] = le.fit_transform(output[col])
#         return output

#     def fit_transform(self, X, y=None):
#         return self.fit(X, y).transform(X)

# Column groups routed to each branch of the preprocessing stage.
categorical_cols = ['make', 'model', 'vehicle_class', 'transmission', 'fuel_type']
cleaning_cols = ['fuel_consumption_comb(mpg)', 'cylinders']

# Inner ColumnTransformer: hands the categorical columns to MultiColumnLabelEncoder.
# The encoder gets its own copy of the list so it never shares state with the selector.
label_encoding_step = (
    'label_encoder',
    MultiColumnLabelEncoder(columns=list(categorical_cols)),
    categorical_cols,
)
feature_transform = ColumnTransformer(transformers=[label_encoding_step])

# Wrap convert_dtype in a FunctionTransformer so it can act as a transformer step;
# validate=False passes the input through to convert_dtype without array conversion.
data_cleaning = FunctionTransformer(convert_dtype, validate=False)

# Outer ColumnTransformer combining both branches. Columns that are not listed
# here are dropped, because ColumnTransformer's default is remainder='drop'.
preprocessor = ColumnTransformer(
    transformers=[
        ('data_cleaning', data_cleaning, cleaning_cols),
        ('feature_transform', feature_transform, categorical_cols),
    ]
)

# Assemble the end-to-end estimator: preprocessing followed by a linear regression.
lin_reg_pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', LinearRegression()),
    ]
)

# Ask scikit-learn to render estimators as a diagram when they are displayed.
set_config(display='diagram')

# Separate the target column from the predictors.
y = df['co2_emissions']
X = df.drop(columns='co2_emissions')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessing steps and the regression on the training portion only.
lin_reg_pipe.fit(X_train, y_train)

# Evaluating the model on the held-out test set.
# Predict once and derive every metric from the same predictions. For a
# regressor, Pipeline.score returns the same R^2 that r2_score computes, so
# calling it separately would only transform and predict X_test a second time.
y_pred = lin_reg_pipe.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of the MSE, in the same units as the target
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)  # a fraction, not a percentage
r2 = r2_score(y_test, y_pred)

# Kept under its original name so any later cell that reads test_score still works.
test_score = r2
print(f'R2 score on test data: {test_score:.2f}')

print('\n\nmse ',mse)
print('rmse ',rmse)
print('mae ',mae)
print('mape ',mape)
print('r2: ',r2)
print('\n\n')

# Last expression in the cell: displays the fitted pipeline diagram.
lin_reg_pipe

R2 score on test data: 0.90


mse  353.872535118043
rmse  18.81150007623111
mae  12.868001253738488
mape  0.05158698998830435
r2:  0.8971189339802288





In [15]:
# Example: score one hand-written vehicle record with the fitted pipeline.
# NOTE(review): if the imported MultiColumnLabelEncoder re-fits inside transform()
# (as the commented-out reference copy in this notebook does), a single-row frame
# encodes every categorical value as 0 regardless of its content. Verify.
sample_vehicle = {
    'make': 'ACURA',
    'model': 'ILX',
    'vehicle_class': 'COMPACT',
    'engine_size': 2.0,
    'cylinders': 4,
    'transmission': 'AS',
    'fuel_type': 'Z',
    'fuel_consumption_city': 9.9,
    'fuel_consumption_hwy': 6.7,
    'fuel_consumption_comb(l/100km)': 8.5,
    'fuel_consumption_comb(mpg)': 33,
}

# A one-element list of records yields a single-row DataFrame with the keys as columns.
new_data = pd.DataFrame([sample_vehicle])
prediction = lin_reg_pipe.predict(new_data)
prediction

array([189.51577178])

In [16]:
# Persist the fitted pipeline to disk so it can be reloaded without retraining.
# joblib is imported with the other dependencies at the top of the notebook.
joblib.dump(lin_reg_pipe, 'co2_emit_pred_model.pkl')

['co2_emit_pred_model.pkl']

In [17]:
# Reload the persisted pipeline and display it as the cell output.
# joblib.load unpickles its input, which can run arbitrary code: only load trusted files.
joblib.load(filename='co2_emit_pred_model.pkl')

In [18]:
# Reload the persisted pipeline and keep a handle to it so it can be used for
# predictions, instead of discarding the loaded object.
# joblib is imported with the other dependencies at the top of the notebook.
# joblib.load unpickles its input, which can run arbitrary code: only load trusted files.
loaded_pipe = joblib.load('co2_emit_pred_model.pkl')
loaded_pipe