In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('claus_mpg.csv')

In [3]:
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
...,...,...,...,...,...,...,...,...,...
387,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
388,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
389,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
390,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model_year    392 non-null    int64  
 7   origin        392 non-null    object 
 8   name          392 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 27.7+ KB


In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
lable = LabelEncoder()

In [7]:
for i in df.columns:
    if df[i].dtype == object:
        df[i] = lable.fit_transform(df[i])

In [8]:
for i in df.columns:
    p_75 = float(df.describe()[i].loc['75%'])
    p_25 = float(df.describe()[i].loc['25%'])
    IQR = float(df.describe()[i].loc['75%']) - float(df.describe()[i].loc['25%'])
    df[i] = df[i].apply(lambda x : np.mean(df[i]) if x < p_25 - IQR * 1.5 or x > p_75 + IQR * 1.5 else x)

In [9]:
X = df.drop('mpg', axis=1)
y = df['mpg']

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
scaler = StandardScaler()

In [16]:
X_train = scaler.fit_transform(X_train)

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [18]:
model = RandomForestRegressor()

In [25]:
hyper_params =  {
    'n_estimators': [100, 150],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'min_weight_fraction_leaf': [0.0, 0.1],
    'max_leaf_nodes': [None, 10],
    # 'min_impurity_decrease': [0.0, 0.1],
    # 'bootstrap': [True, False],
    # 'oob_score': [False, True],
    # 'n_jobs': [None, -1],
    # 'random_state': [None, 42],
    # 'ccp_alpha': [0.0, 0.1],
    # 'max_samples': [None, 0.5]
}

In [26]:
full_model = GridSearchCV(model, hyper_params, cv=3, verbose=2)

In [27]:
full_model.fit(X_train, y_train)

Fitting 3 folds for each of 64 candidates, totalling 192 fits
[CV] END max_depth=None, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100; total time=   0.2s
[CV] END max_depth=None, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=150; total time=   0.3s
[CV] END max_depth=None, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=150; total time=   0.2s
[CV] END max_depth=None, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=150; total time=   0.2s
[CV] END max_d

In [28]:
y_pred = full_model.predict(X_test)



In [29]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [30]:
mean_absolute_error(y_pred, y_test)

6.245305084745764

In [32]:
np.sqrt(mean_squared_error(y_pred, y_test))

7.95575343032122

In [34]:
r2_score(y_test, y_pred)

-0.19637494147433165