In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [24]:
%store -r data_2

In [25]:
#Split data into features and target

X = data_2.drop('mpg', axis=1)  
Y = data_2['mpg']

print(X.shape)
print(Y.shape)


(392, 6)
(392,)


In [26]:
#train and test split

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=10)

In [27]:
x_train.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year
151,4,79.0,67.0,2000,16.0,74
383,4,91.0,67.0,1965,15.0,82
70,8,400.0,190.0,4422,12.5,72
122,4,121.0,110.0,2660,14.0,73
212,8,350.0,180.0,4380,12.1,76


In [28]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((313, 6), (79, 6), (313,), (79,))

In [29]:
#Develop and train the linear regession model

linear_model = LinearRegression()

linear_model.fit(x_train,y_train)

In [30]:
# Check acciracy on test data

linear_model.score(x_test,y_test)

0.7850623739534026

In [31]:
# Shuffle split of train and test data to get more idea about model accuracy

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=10)

cross_val = cross_val_score(LinearRegression(), X, Y, cv=cv)

cross_val

array([0.78506237, 0.81749252, 0.8270676 , 0.8013235 , 0.73315396])

In [32]:
# Hyperparameter tuning using GridSearchCV to find best model with best parameters


def find_best_model(x,y):
    algos = {
        'linear_regression' :{
            'model': LinearRegression(),
            'params': {
                
            }
        },
        'lasso':{
            'model': Lasso(),
            'params':{
                'alpha': [0.001, 0.01, 0.1, 1, 2, 5],
                'selection':['random', 'cyclic']
            }
        },
        'Ridge':{
            'model': Ridge(),
            'params':{
                'alpha': [0.001, 0.01, 0.1, 1, 2, 5],
                'solver' : ['auto']
            }
        },
        'ElasticNet':{
            'model' : ElasticNet(),
            'params':{
                'alpha': [0.001, 0.01, 0.1, 1, 2, 5],
                'selection':['random', 'cyclic']
            }
        },
        'XGBRegressor':{
            'model': XGBRegressor(),
            'params': {
                'booster':['gbtree', 'dart'],
                'eta': [0.01, 0.05, 0.1],
                'max_depth': [4,5,6,7]
            }
        },
        'DecisionTree':{
            'model' : DecisionTreeRegressor(),
            'params':{
                'criterion': ['mse', 'friedman_mse'],
                'splitter': ['best', 'random']
            }
        },
        'RandomForest':{
            'model' : RandomForestRegressor(),
            'params':{
                'n_estimators':[10, 50, 100],
                'criterion': ['squared_error', 'absolute_error', 'poisson']
            }
        }
        
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(StandardScaler().fit_transform(x),y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })
        
    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

best_model = find_best_model(X,Y)

In [37]:
best_model

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.804469,{}
1,lasso,0.804928,"{'alpha': 0.1, 'selection': 'cyclic'}"
2,Ridge,0.804774,"{'alpha': 2, 'solver': 'auto'}"
3,ElasticNet,0.804701,"{'alpha': 0.01, 'selection': 'cyclic'}"
4,XGBRegressor,0.842192,"{'booster': 'gbtree', 'eta': 0.05, 'max_depth'..."
5,DecisionTree,0.757648,"{'criterion': 'friedman_mse', 'splitter': 'best'}"
6,RandomForest,0.855551,"{'criterion': 'poisson', 'n_estimators': 50}"


In [33]:
#From hyperparameter tuning we got to know RandomForest is the best model for this problem

model = RandomForestRegressor()

model.fit(x_train, y_train)

In [34]:
# Validate the accuracy

model.score(x_test, y_test)

0.863770105865083

In [35]:
#Validate the model working

model.predict([[8, 318.0, 150.0, 3436, 11.0, 70]])

array([17.26])

In [36]:
#Save model into pickle file format to deploy it to cloud

import pickle

with open('Fuel_efficiency_prediction', 'wb') as f:
    pickle.dump(model, f)