In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def rmse(y, y_hat):
    """Return the root-mean-squared error between targets ``y`` and predictions ``y_hat``."""
    mse = mean_squared_error(y, y_hat)
    return np.sqrt(mse)


class CompanyNameExtracter(BaseEstimator, TransformerMixin):
    """Transformer that reduces the ``name`` column to a normalised company name.

    The company name is taken to be the first space-separated word of the
    lower-cased, stripped value; a small alias table folds known misspellings
    and long forms onto a single spelling.
    """

    # Alias -> canonical spelling. Kept at class level so the table is built
    # once instead of once per row during ``apply``.
    _NAME_ALIASES = {
        "maxda": "mazda",
        "toyouta": "toyota",
        "vokswagen": "vw",
        "volkswagen": "vw",
    }

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``name`` replaced by the extracted company name.

        ``X`` must be a DataFrame containing a ``name`` column of strings; the
        input frame itself is not modified.
        """
        X = X.copy()
        X.loc[:, "name"] = X["name"].apply(self.process_string)
        return X

    @staticmethod
    def process_string(value):
        """Map one raw ``name`` string to its canonical company name.

        The value is lower-cased and stripped, everything from the first space
        onwards is discarded, and the remaining word is looked up in the alias
        table (falling back to the word itself when no alias exists).
        """
        first_word = value.lower().strip().partition(" ")[0]
        return CompanyNameExtracter._NAME_ALIASES.get(first_word, first_word)

def get_data(dataset_name):
    """Load the named seaborn example dataset via ``sns.load_dataset`` and return it."""
    dataset = sns.load_dataset(dataset_name)
    return dataset

# Load the "mpg" dataset through the get_data helper and preview its first rows.
mpg = get_data("mpg")
mpg.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [6]:
# Separate the predictors from the regression target, then preview the predictors.
X = mpg.drop(columns=["mpg"])  # features
y = mpg.loc[:, "mpg"]  # target
X.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [8]:
# Preview the first few target (mpg) values.
y.head()

0    18.0
1    15.0
2    18.0
3    16.0
4    17.0
Name: mpg, dtype: float64

In [9]:
#Sampling
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size=0.05,
                                                   random_state=145)
X_train.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
372,4,151.0,90.0,2735,18.0,82,usa,pontiac phoenix
68,8,350.0,155.0,4502,13.5,72,usa,buick lesabre custom
32,4,98.0,,2046,19.0,71,usa,ford pinto
195,4,85.0,52.0,2035,22.2,76,usa,chevrolet chevette
216,4,98.0,68.0,2045,18.5,77,japan,honda accord cvcc


In [10]:
# Preview the training targets; their index labels match the rows shown by X_train.head().
y_train.head()

372    27.0
68     13.0
32     25.0
195    29.0
216    31.5
Name: mpg, dtype: float64

In [13]:
# Column groups, one per kind of preprocessing.
num_features = ["displacement", "horsepower", "weight", "acceleration"]
ord_features = ["name"]
nominal_features = ["origin"]
pass_through_cols = ["cylinders"]
drop_cols = ["model_year"]

# Numeric columns: fill missing values with SimpleImputer's default strategy
# (the column mean), then standardise.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer()),
    ("std scaler", StandardScaler()),
])

# "name": reduce to the company name, integer-encode it, then standardise.
# A company that was not present when the encoder was fitted (e.g. one that only
# occurs in the held-out split) is encoded as -1 instead of raising a ValueError
# at transform time. Requires scikit-learn >= 0.24.
ordinal_pipeline = Pipeline([
    ("extract company name", CompanyNameExtracter()),
    ("ordinal encoder", OrdinalEncoder(handle_unknown="use_encoded_value",
                                       unknown_value=-1)),
    ("std scaling", StandardScaler()),
])

# "origin": one indicator column per category.
nominal_pipeline = Pipeline([("one hot encoding", OneHotEncoder())])

# Route each column group through its preprocessing branch. The transformed
# matrix lays the branch outputs out in the order the branches are listed here.
pipeline = ColumnTransformer(
    transformers=[
        # displacement, horsepower, weight, acceleration -> imputed + scaled
        ("numerical pipeline", numerical_pipeline, num_features),
        # name -> company name -> ordinal code -> scaled
        ("ordinal pipeline", ordinal_pipeline, ord_features),
        # origin -> one-hot indicator columns
        ("nominal pipeline", nominal_pipeline, nominal_features),
        # cylinders is forwarded unchanged
        ("passing columns", "passthrough", pass_through_cols),
        # model_year is excluded from the output
        ("drop columns", "drop", drop_cols),
    ]
)

# Labels for the transformed matrix, in the order the ColumnTransformer emits
# its branches: 4 numeric, encoded name, 3 one-hot origin columns, cylinders.
output_cols = ["displacement", "horsepower", "weight", "acceleration",
               "name", "europe", "japan", "usa", "cylinders"]

# Fit the preprocessing on the training split only, then reuse it for the test split.
# The original row labels are carried over so the transformed frames stay aligned
# with y_train / y_test in any index-based pandas operation (the transformer
# returns a bare array, which would otherwise get a fresh 0..n-1 index).
X_train_tr = pd.DataFrame(pipeline.fit_transform(X_train),
                          columns=output_cols,
                          index=X_train.index)

X_test_tr = pd.DataFrame(pipeline.transform(X_test),
                         columns=output_cols,
                         index=X_test.index)

In [14]:
# Baseline: ordinary least squares on the transformed training features,
# then show the fitted intercept and per-feature coefficients.
model = LinearRegression().fit(X_train_tr, y_train)
print(model.intercept_, model.coef_)

27.484008268080387 [ 1.44872475 -2.13916249 -4.41988656  0.10485884  0.3768301  -0.26666083
  1.31518724 -1.04852641 -0.63055427]


In [16]:
def generate_models_report(models, X_train_tr, y_train, X_test_tr, y_test):
    """Fit every model and print a fixed-width comparison table.

    For each estimator the table shows the RMSE on the train and test sets
    ("Train Error" / "Test Error") and the R^2 score multiplied by 100
    ("Train Accuracy" / "Test Accuracy").

    Parameters
    ----------
    models : iterable of estimators exposing ``fit`` and ``predict``.
        Each one is fitted in place on the training data.
    X_train_tr, y_train : training features and targets.
    X_test_tr, y_test : held-out features and targets.

    Returns
    -------
    None. The report is written to stdout.
    """
    width = 116
    name_width = 30  # width of the first field in the format strings below
    title = "|{:^30}|{:^20}|{:^20}|{:^20}|{:^20}|"
    row = "|{:^30}|{:^20.2f}|{:^20.2f}|{:^20.2f}|{:^20.2f}|"

    print("_"*width)
    print(title.format("Model Name", "Train Error", "Test Error", "Train Accuracy", "Test Accuracy"))
    print("_"*width)

    for model in models:
        model.fit(X_train_tr, y_train)
        y_hat_train = model.predict(X_train_tr)
        y_hat_test = model.predict(X_test_tr)

        train_error = rmse(y_train, y_hat_train)
        test_error = rmse(y_test, y_hat_test)
        train_accuracy = r2_score(y_train, y_hat_train)
        test_accuracy = r2_score(y_test, y_hat_test)

        # The estimator's string form is used as its label. When it does not fit
        # the name column (too long or multi-line, as can happen for estimators
        # with explicit parameters) fall back to the bare class name so the
        # table stays aligned.
        model_name = str(model)
        if len(model_name) > name_width or "\n" in model_name:
            model_name = type(model).__name__

        print(row.format(model_name, train_error, test_error, train_accuracy*100, test_accuracy*100))
        print("-"*width)



from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Candidate regressors compared in the report below.
models = [
    LinearRegression(),
    SGDRegressor(),
    SVR(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
]

# SGDRegressor, DecisionTreeRegressor and RandomForestRegressor are stochastic.
# Left at random_state=None they draw from NumPy's global RNG, so seeding it here
# makes the report reproducible across runs without changing the estimators'
# string forms, which the report uses as row labels.
np.random.seed(145)

generate_models_report(models, X_train_tr, y_train, X_test_tr, y_test)


____________________________________________________________________________________________________________________
|          Model Name          |    Train Error     |     Test Error     |   Train Accuracy   |   Test Accuracy    |
____________________________________________________________________________________________________________________
|      LinearRegression()      |        4.17        |        3.45        |       71.90        |       66.07        |
--------------------------------------------------------------------------------------------------------------------
|        SGDRegressor()        |        4.61        |        3.30        |       65.77        |       69.01        |
--------------------------------------------------------------------------------------------------------------------
|            SVR()             |        4.18        |        3.53        |       71.79        |       64.51        |
----------------------------------------------------------------