In [101]:
from ucimlrepo import fetch_ucirepo 
import numpy as np
  
# fetch dataset 
auto_mpg = fetch_ucirepo(id=9) 
  
# data (as pandas dataframes) 
X = auto_mpg.data.features 
y = auto_mpg.data.targets 

# metadata 
print(auto_mpg.metadata) 
  
# variable information 
print(auto_mpg.variables) 
y

{'uci_id': 9, 'name': 'Auto MPG', 'repository_url': 'https://archive.ics.uci.edu/dataset/9/auto+mpg', 'data_url': 'https://archive.ics.uci.edu/static/public/9/data.csv', 'abstract': 'Revised from CMU StatLib library, data concerns city-cycle fuel consumption', 'area': 'Other', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 398, 'num_features': 7, 'feature_types': ['Real', 'Categorical', 'Integer'], 'demographics': [], 'target_col': ['mpg'], 'index_col': ['car_name'], 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1993, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5859H', 'creators': ['R. Quinlan'], 'intro_paper': None, 'additional_info': {'summary': 'This dataset is a slightly modified version of the dataset provided in the StatLib library.  In line with the use by Ross Quinlan (1993) in predicting the attribute "mpg", 8 of the original instances were removed because they had unknown values for th

Unnamed: 0,mpg
0,18.0
1,15.0
2,18.0
3,16.0
4,17.0
...,...
393,27.0
394,44.0
395,32.0
396,28.0


In [81]:
y.isnull().sum()

mpg    0
dtype: int64

In [82]:
from sklearn.impute import SimpleImputer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score

In [83]:
imp = SimpleImputer(strategy='median').set_output(transform='pandas')
impute_data = imp.fit_transform(X)


In [62]:
impute_data.corr()

Unnamed: 0,displacement,cylinders,horsepower,weight,acceleration,model_year,origin
displacement,1.0,0.950721,0.895778,0.932824,-0.543684,-0.370164,-0.609409
cylinders,0.950721,1.0,0.841284,0.896017,-0.505419,-0.348746,-0.562543
horsepower,0.895778,0.841284,1.0,0.862442,-0.68659,-0.413733,-0.452096
weight,0.932824,0.896017,0.862442,1.0,-0.417457,-0.306564,-0.581024
acceleration,-0.543684,-0.505419,-0.68659,-0.417457,1.0,0.288137,0.205873
model_year,-0.370164,-0.348746,-0.413733,-0.306564,0.288137,1.0,0.180662
origin,-0.609409,-0.562543,-0.452096,-0.581024,0.205873,0.180662,1.0


In [105]:
X_train, X_test, Y_train, Y_test = train_test_split(impute_data, y, test_size=0.3, random_state=24)

#Applying ln(y+1) on  Y_train

y_tr_ln = np.log1p(Y_train)
print(y)
print(y_tr_ln)

      mpg
0    18.0
1    15.0
2    18.0
3    16.0
4    17.0
..    ...
393  27.0
394  44.0
395  32.0
396  28.0
397  31.0

[398 rows x 1 columns]
          mpg
90   2.564949
54   3.583519
6    2.708050
146  3.367296
125  3.044522
..        ...
129  3.465736
145  3.496508
343  3.691376
192  3.135494
387  3.663562

[278 rows x 1 columns]


In [94]:
a = [0.01,0.1,0.5,1,2.5, 3,5]
r = [0.001, 0.3, 0.5, 0.7 , 0.9]

In [96]:
scores= []

for ele in a:
    for l1 in r:
        e1= ElasticNet(alpha=ele, l1_ratio=l1)
        e1.fit(X_train,y_tr_ln)
        y_pred = e1.predict(X_test)
        y_pred = np.expm1(y_pred)
        scores.append([ele, l1 , r2_score(Y_test, y_pred)])

In [98]:
df = pd.DataFrame(data = scores, columns= ['A', 'L1', 'r2'])
df.sort_values('r2', ascending=False).iloc[0]

A     0.010000
L1    0.900000
r2    0.805634
Name: 4, dtype: float64

# Iterative Imputer

In [70]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [78]:
it_imp = IterativeImputer(random_state=24)
X_imputed = it_imp.fit_transform(X)
X_train,X_test,Y_train,Y_test = train_test_split(X_imputed,y,test_size=0.3,random_state=24)
X_imputed

array([[307. ,   8. , 130. , ...,  12. ,  70. ,   1. ],
       [350. ,   8. , 165. , ...,  11.5,  70. ,   1. ],
       [318. ,   8. , 150. , ...,  11. ,  70. ,   1. ],
       ...,
       [135. ,   4. ,  84. , ...,  11.6,  82. ,   1. ],
       [120. ,   4. ,  79. , ...,  18.6,  82. ,   1. ],
       [119. ,   4. ,  82. , ...,  19.4,  82. ,   1. ]])

In [74]:
alpha = [0.01,0.1,0.5,1,2.5,3,5]
ratios = [0.001,0.3,0.5,0.7,0.9]
scores = []
for a in alpha:
    for r in ratios:
        e1 = ElasticNet(alpha = a, l1_ratio=r)
        e1.fit(X_train,Y_train)
        y_pred = e1.predict(X_test)
        scores.append([a,r,r2_score(Y_test,y_pred)])

In [76]:
df = pd.DataFrame(scores, columns = ['Alpha','Ratios','r2_score'])
df.sort_values('r2_score',ascending=False).iloc[0]

Alpha       0.100000
Ratios      0.900000
r2_score    0.762388
Name: 9, dtype: float64

In [123]:
x = np.log10(100)
x

2.0