# Business Problem/Problem Statement
Develop a predictive model to estimate a car's fuel efficiency (mpg) based on various engine and car features, enhancing the decision-making process for car manufacturers and consumers.

# objective
The objective is to create an accurate regression model to predict the miles per gallon (mpg) of a car using input features such as engine displacement, horsepower, and weight.

# constraints
The model must be interpretable and easily integrable into existing systems while maintaining robust performance despite potential data quality issues like missing or inconsistent data.

# import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df=pd.read_excel(r"D:\skills certificates\mtcars.xlsx")

In [3]:
df.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [4]:
# or it will give rows and columns
df.shape

(32, 11)

In [5]:
df.dtypes

mpg     float64
cyl       int64
disp    float64
hp        int64
drat    float64
wt      float64
qsec    float64
vs        int64
am        int64
gear      int64
carb      int64
dtype: object

In [6]:
df.describe

<bound method NDFrame.describe of      mpg  cyl    disp   hp  drat     wt   qsec  vs  am  gear  carb
0   21.0    6   160.0  110  3.90  2.620  16.46   0   1     4     4
1   21.0    6   160.0  110  3.90  2.875  17.02   0   1     4     4
2   22.8    4   108.0   93  3.85  2.320  18.61   1   1     4     1
3   21.4    6   258.0  110  3.08  3.215  19.44   1   0     3     1
4   18.7    8   360.0  175  3.15  3.440  17.02   0   0     3     2
5   18.1    6   225.0  105  2.76  3.460  20.22   1   0     3     1
6   14.3    8   360.0  245  3.21  3.570  15.84   0   0     3     4
7   24.4    4   146.7   62  3.69  3.190  20.00   1   0     4     2
8   22.8    4   140.8   95  3.92  3.150  22.90   1   0     4     2
9   19.2    6   167.6  123  3.92  3.440  18.30   1   0     4     4
10  17.8    6   167.6  123  3.92  3.440  18.90   1   0     4     4
11  16.4    8   275.8  180  3.07  4.070  17.40   0   0     3     3
12  17.3    8   275.8  180  3.07  3.730  17.60   0   0     3     3
13  15.2    8   275.8  180  

In [7]:
# it will give count and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mpg     32 non-null     float64
 1   cyl     32 non-null     int64  
 2   disp    32 non-null     float64
 3   hp      32 non-null     int64  
 4   drat    32 non-null     float64
 5   wt      32 non-null     float64
 6   qsec    32 non-null     float64
 7   vs      32 non-null     int64  
 8   am      32 non-null     int64  
 9   gear    32 non-null     int64  
 10  carb    32 non-null     int64  
dtypes: float64(5), int64(6)
memory usage: 2.9 KB


In [8]:
# or two no the null values 
df.isnull().sum()

mpg     0
cyl     0
disp    0
hp      0
drat    0
wt      0
qsec    0
vs      0
am      0
gear    0
carb    0
dtype: int64

# data preprocessing

In [9]:
df.duplicated().sum()

0

In [10]:
df.duplicated().sum()

0

In [11]:
df.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [12]:
# no categorical values all are numerical values 

# model building

In [13]:
x=df.drop('mpg',axis=1)

In [14]:
y=df['mpg']

In [15]:
x

Unnamed: 0,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,8,360.0,175,3.15,3.44,17.02,0,0,3,2
5,6,225.0,105,2.76,3.46,20.22,1,0,3,1
6,8,360.0,245,3.21,3.57,15.84,0,0,3,4
7,4,146.7,62,3.69,3.19,20.0,1,0,4,2
8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
9,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [16]:
y

0     21.0
1     21.0
2     22.8
3     21.4
4     18.7
5     18.1
6     14.3
7     24.4
8     22.8
9     19.2
10    17.8
11    16.4
12    17.3
13    15.2
14    10.4
15    10.4
16    14.7
17    32.4
18    30.4
19    33.9
20    21.5
21    15.5
22    15.2
23    13.3
24    19.2
25    27.3
26    26.0
27    30.4
28    15.8
29    19.7
30    15.0
31    21.4
Name: mpg, dtype: float64

# spliting into the train and test

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)


In [19]:
x_train.shape

(22, 10)

In [20]:
x_test.shape

(10, 10)

In [21]:
y_train.shape

(22,)

In [22]:
y_test.shape

(10,)

# algorithm testing for best accuracy
- knn
- svm
- linear regression
- decision tree


# using knn- k-nearest neighbour

In [23]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error,r2_score

In [24]:
knn=KNeighborsRegressor()
knn.fit(x_train,y_train)
y_pred=knn.predict(x_test)
print(np.sqrt(mean_squared_error(y_test,y_pred)))

2.626944993714181


# using linear regression

In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score

In [26]:
lr=LinearRegression()
lr.fit(x_train,y_train)
y_pred=lr.predict(x_test)
print(np.sqrt(mean_squared_error(y_test,y_pred)))

2.363465135799337


# using Decision tree

In [27]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error,r2_score

In [28]:
dt=DecisionTreeRegressor()
dt.fit(x_train,y_train)
y_pred=dt.predict(x_test)
print(np.sqrt(mean_squared_error(y_test,y_pred)))

3.379497003993345


# using svr -support vector regressor

In [29]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error,r2_score

In [30]:
S=SVR()
S.fit(x_train,y_train)
y_pred=S.predict(x_test)
print(np.sqrt(mean_squared_error(y_test,y_pred)))

4.259466519587429


In [31]:
df.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [32]:
lr.predict([[7,250.0,150,3.95,2.634,17.86,1,0,3,3]])



array([19.8088954])

In [33]:
lr.coef_

array([-2.04550216e-01, -5.09491771e-04, -8.15718473e-03,  1.05761109e+00,
       -2.97892143e+00,  1.16182594e+00, -1.28281195e+00,  1.40719873e+00,
        1.08550451e+00, -2.83015954e-01])

In [34]:
lr.intercept_

4.385747758400633

In [35]:
# pattern of mtcars by using linear regression cyl	disp	hp	drat	wt	qsec	vs	am	gear	carb
cyl=7
disp=250.0
hp=150
drat=3.95
wt=2.634
qsec=17.86
vs=1
am=0
gear=3
carb=3
mpg= -2.04550216e-01 * cyl -5.09491771e-04 *disp -8.15718473e-03 *hp + 1.05761109e+00*drat - 2.97892143e+00 * wt + 1.16182594e+00 * qsec - 1.28281195e+00 * vs + 1.40719873e+00 * am + 1.08550451e+00 * gear - 2.83015954e-01 * carb + 4.385747758400633

In [36]:
print(mpg)

19.80889535943063


# pickel

In [38]:
import pickle

In [40]:
pickle.dump(lr,open("lr.pkl","wb"))

In [41]:
model=pickle.load(open("lr.pkl","rb"))

In [42]:
model.predict([[7,250.0,150,3.95,2.634,17.86,1,0,3,3]])



array([19.8088954])