In [3]:
# Import the required libraries
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# URL of the Auto MPG dataset on the UCI repository
# (check the repository for the current location if the download fails)
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

# Column names, in file order, based on the dataset description
column_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']

# Read the dataset from the URL.
# Fields are separated by runs of whitespace and missing values are written as '?'.
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True
# (pandas >= 2.2 emits a FutureWarning for the old keyword); parsing is unchanged.
# NOTE(review): comment='\t' is kept as-is; the displayed output shows car_name is
# still parsed, so it may be redundant here -- confirm before removing.
df = pd.read_csv(url, sep=r'\s+', names=column_names, na_values='?', comment='\t')

# Display the first few rows of the dataframe
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [5]:
# Inspect column dtypes and non-null counts to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car_name      398 non-null    object 
dtypes: float64(5), int64(3), object(1)
memory usage: 28.1+ KB


In [7]:
# Check the shape of the data as (number of rows, number of columns)
df.shape

(398, 9)

In [9]:
# Remove the unnecessary 'car_name' column (free text, not used as a model feature).
# errors='ignore' keeps this cell idempotent: because it reassigns df, re-running it
# would otherwise raise KeyError once the column is already gone.
df = df.drop(columns='car_name', errors='ignore')

In [11]:
# Show the data after the column drop
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [13]:
# Discard every row that has a missing value in at least one column
df = df.dropna(axis=0, how='any')

In [15]:
# Separate the label (target) column; a list selector keeps y a one-column DataFrame
y = df.loc[:, ['mpg']]

In [17]:
# Show the first rows of the target column
y.head()

Unnamed: 0,mpg
0,18.0
1,15.0
2,18.0
3,16.0
4,17.0


In [19]:
# Feature matrix: every remaining column except the target 'mpg'
X = df.drop(columns=['mpg'])

In [21]:
# Show the first rows of the feature columns
X.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,8,307.0,130.0,3504.0,12.0,70,1
1,8,350.0,165.0,3693.0,11.5,70,1
2,8,318.0,150.0,3436.0,11.0,70,1
3,8,304.0,150.0,3433.0,12.0,70,1
4,8,302.0,140.0,3449.0,10.5,70,1


In [23]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Define the model: a linear regression with default settings
lm=LinearRegression()

In [27]:
# Train the model on the training data
lm.fit(X_train,y_train)

In [29]:
# Predict mpg for a single car.
# The model was fitted on a DataFrame, so the sample is wrapped in a DataFrame that
# carries the training column names. Passing a bare nested list makes scikit-learn
# warn that X has no valid feature names and silently relies on the value order.
# Values follow the feature column order: cylinders, displacement, horsepower,
# weight, acceleration, model_year, origin.
lm.predict(
    pd.DataFrame(
        [[8, 350.0, 165.0, 3693.0, 11.5, 70, 1]],
        columns=X_train.columns,
    )
)



array([[13.61842946]])

In [33]:
# Run the trained model on the held-out test data
y_pred=lm.predict(X_test)

In [35]:
# Evaluate the model with the R^2 score (coefficient of determination) on the test set.
# Note: this is a regression goodness-of-fit measure, not a classification accuracy.
r2_score(y_test, y_pred)

0.7901500386760351