In [1]:
#predict MPG- Miles Per Gallon

import pandas as pd


# URL to the Auto MPG dataset
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

# Column names
column_names = [
    'mpg', 'cylinders', 'displacement', 'horsepower',
    'weight', 'acceleration', 'model_year', 'origin', 'car_name'
]

# Read the dataset
df = pd.read_csv(
    url,
    sep=r"\s+",          # replaces delim_whitespace=True
    names=column_names,
    na_values='?',
    comment='\t'
)

# Display first few rows
df.head()


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [2]:
df.shape

(398, 9)

In [3]:
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
car_name        0
dtype: int64

In [4]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [5]:
df["horsepower"]=df["horsepower"].fillna(df["horsepower"].mean())

In [6]:
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
car_name        0
dtype: int64

In [7]:
df.drop(columns=["car_name"], inplace=True)

In [8]:
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1
394,44.0,4,97.0,52.0,2130.0,24.6,82,2
395,32.0,4,135.0,84.0,2295.0,11.6,82,1
396,28.0,4,120.0,79.0,2625.0,18.6,82,1


In [9]:
from sklearn.linear_model import LinearRegression

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X = df[["cylinders", "displacement", "horsepower", "weight", "acceleration"]]

In [12]:
y=df[["mpg"]]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
lm=LinearRegression()

In [15]:
lm.fit(X_train,y_train)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [16]:
lm.predict([[8,307,130,3504,12]])



array([[18.60297563]])

In [17]:
from sklearn.metrics import r2_score

In [18]:
lm.coef_

array([[-0.19400655, -0.00637554, -0.04148099, -0.00522081, -0.0350301 ]])

In [19]:
lm.intercept_

array([46.21893137])

In [20]:
lm.predict(X_test)

array([[32.68335883],
       [27.04465307],
       [25.22065961],
       [15.38491509],
       [13.78879513],
       [29.63245052],
       [25.2019609 ],
       [ 7.83735583],
       [16.58426446],
       [23.84094649],
       [13.9011507 ],
       [31.29915928],
       [31.03425886],
       [15.74929268],
       [27.57244167],
       [ 8.48650311],
       [27.19188422],
       [21.96079878],
       [12.22942694],
       [31.14049598],
       [27.77891102],
       [24.51831641],
       [23.61395352],
       [29.98501828],
       [18.60297563],
       [30.68539821],
       [28.71670928],
       [25.96436731],
       [20.10036315],
       [ 9.13816146],
       [25.87043176],
       [30.83071353],
       [16.27962864],
       [28.34580063],
       [31.08303306],
       [ 9.90861832],
       [26.23195066],
       [18.36391973],
       [12.42616348],
       [29.21220345],
       [24.63353117],
       [32.09507883],
       [24.20501834],
       [ 8.86880621],
       [27.06289506],
       [29

In [21]:
y_pred=lm.predict(X_test)

In [22]:
r2 = r2_score(y_test, y_pred)

In [23]:
r2

0.7272038917065151