In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Ethans/Datasets/auto-mpg.csv')

In [3]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


**Dataset Description**

The dataset appears to be related to automobiles and contains information about different car models. Here's a breakdown of each column in the dataset:

mpg: Miles per gallon, representing the fuel efficiency of the car measured in how many miles it can travel on one gallon of fuel.

cylinders: The number of cylinders in the engine of the car, indicating its power and performance characteristics.

displacement: The displacement of the engine, typically measured in cubic inches. It represents the total volume of all cylinders in the engine.

horsepower: The horsepower rating of the car's engine, which is a measure of its power output.

weight: The weight of the car, typically measured in pounds. It represents the mass of the vehicle.

acceleration: The time it takes for the car to accelerate from 0 to 60 miles per hour (mph), measured in seconds.

model year: The year when the car model was manufactured.

origin: The origin of the car, usually represented as a numeric code. It could indicate the country or region where the car was manufactured.

car name: The name or identifier of the car model.

Each row in the dataset represents a specific car model and provides values for each of these attributes. The dataset seems to capture various characteristics of cars, such as their fuel efficiency, power, weight, and origin. It can be used for various analyses, such as studying the relationship between fuel efficiency and other factors or predicting the fuel efficiency of a car based on its attributes.

In [4]:
df.info() # horsepower is object

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [5]:
# We want to print the non numeric values present in the horsepower column
df[df['horsepower'].str.isnumeric() == False]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
32,25.0,4,98.0,?,2046,19.0,71,1,ford pinto
126,21.0,6,200.0,?,2875,17.0,74,1,ford maverick
330,40.9,4,85.0,?,1835,17.3,80,2,renault lecar deluxe
336,23.6,4,140.0,?,2905,14.3,80,1,ford mustang cobra
354,34.5,4,100.0,?,2320,15.8,81,2,renault 18i
374,23.0,4,151.0,?,3035,20.5,82,1,amc concord dl


In [6]:
# Replace the '?' with the nan values since it is easier to handle nan values with the inbuilt fns in pandas
df['horsepower'] = df['horsepower'].replace('?',np.nan)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [8]:
# Drop the car name column
df = df.drop('car name',axis = 1)

In [22]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130,3504,12.0,70,1
1,15.0,8,350.0,165,3693,11.5,70,1
2,18.0,8,318.0,150,3436,11.0,70,1
3,16.0,8,304.0,150,3433,12.0,70,1
4,17.0,8,302.0,140,3449,10.5,70,1


In [9]:
# Now we are replacing the nan values with the median value
df = df.apply(lambda x: x.fillna(x.median()),axis = 0)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
dtypes: float64(3), int64(4), object(1)
memory usage: 25.0+ KB


In [11]:
# Now we are changing the datatype of the horsepower column from object datatype to integer
df['horsepower'] = df['horsepower'].astype('int64')

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    int64  
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
dtypes: float64(3), int64(5)
memory usage: 25.0 KB


In [13]:
# mpg is the target variable (mileage)
X = df.drop('mpg',axis = 1)
y = df[['mpg']]

In [14]:
# Feature Scaling
from sklearn import preprocessing
# scale all the columns of the df. This will produce a numpy array
X_scaled = preprocessing.scale(X)
X_scaled = pd.DataFrame(X_scaled,columns = X.columns)

y_scaled = preprocessing.scale(y)
y_scaled = pd.DataFrame(y_scaled,columns = y.columns)

In [15]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.30, random_state=101)

In [20]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,accuracy_score

Linear Regression Model

In [35]:
reg_model = LinearRegression()
reg_model.fit(X_train,y_train)

In [36]:
y_pred = reg_model.predict(X_test)

In [37]:
MAE = mean_absolute_error(y_test,y_pred)
MSE = mean_squared_error(y_test,y_pred)
RMSE = np.sqrt(MSE)
R2 = r2_score(y_test,y_pred)
print("Mean absolute error: ",MAE)
print("Mean squared error: ",MSE)
print("Root Mean squared error: ",RMSE)
print("R2 Score: ",R2)

Mean absolute error:  0.3241909173175946
Mean squared error:  0.1863837702221468
Root Mean squared error:  0.4317218667407835
R2 Score:  0.8187816967291803


In [38]:
print("Intercept: ",reg_model.intercept_[0])
print("Coefficient: ",reg_model.coef_[0])

Intercept:  0.006262146400070239
Coefficient:  [-0.1252293   0.28728837 -0.06096338 -0.73443404  0.0162432   0.37407761
  0.1283999 ]


Regularised Ridge Model

In [39]:
ridge = Ridge(alpha = 0.4) # Coeeficients are prevented to become too big by this alpha value
ridge.fit(X_train,y_train)
for index,col in enumerate(X_train.columns):
  print("Ridge model coefficients for {} is {}".format(col,ridge.coef_[0][index]))

Ridge model coefficients for cylinders is -0.11990142948250304
Ridge model coefficients for displacement is 0.26722520396423227
Ridge model coefficients for horsepower is -0.06431014354712593
Ridge model coefficients for weight is -0.7189679098115279
Ridge model coefficients for acceleration is 0.012623896728626743
Ridge model coefficients for model year is 0.3727995898918599
Ridge model coefficients for origin is 0.12749152686870804


In [40]:
y_pred = ridge.predict(X_test)

In [41]:
MAE = mean_absolute_error(y_test,y_pred)
MSE = mean_squared_error(y_test,y_pred)
RMSE = np.sqrt(MSE)
R2 = r2_score(y_test,y_pred)
print("Mean absolute error: ",MAE)
print("Mean squared error: ",MSE)
print("Root Mean squared error: ",RMSE)
print("R2 Score: ",R2)

Mean absolute error:  0.324172533202639
Mean squared error:  0.18620780320111327
Root Mean squared error:  0.43151802187291466
R2 Score:  0.8189527869745665


Regularised Lasso Model

In [42]:
lasso = Lasso(alpha = 0.1)
lasso.fit(X_train,y_train)
for index,col in enumerate(X_train.columns):
  print("Lasso model coefficients for {} is {}".format(col,lasso.coef_[index]))

Lasso model coefficients for cylinders is -0.0
Lasso model coefficients for displacement is -0.0
Lasso model coefficients for horsepower is -0.0015666442195922045
Lasso model coefficients for weight is -0.6111201488946614
Lasso model coefficients for acceleration is 0.0
Lasso model coefficients for model year is 0.29736103699988103
Lasso model coefficients for origin is 0.039693706968084844


In [43]:
y_pred = lasso.predict(X_test)

In [44]:
MAE = mean_absolute_error(y_test,y_pred)
MSE = mean_squared_error(y_test,y_pred)
RMSE = np.sqrt(MSE)
R2 = r2_score(y_test,y_pred)
print("Mean absolute error: ",MAE)
print("Mean squared error: ",MSE)
print("Root Mean squared error: ",RMSE)
print("R2 Score: ",R2)

Mean absolute error:  0.32751811258182145
Mean squared error:  0.2067144317399362
Root Mean squared error:  0.45465858810753396
R2 Score:  0.7990144821254843


Many of the coefficients have become 0 so we can drop of those dimensions from the model.It has taken only 5 dimensions to build the model. Lasso is also used for feature selection.

In [45]:
print(reg_model.score(X_train,y_train))
print(reg_model.score(X_test,y_test))

0.8205538417382892
0.8187816967291803


In [46]:
print(ridge.score(X_train,y_train))
print(ridge.score(X_test,y_test))

0.8205282611631505
0.8189527869745665


**Accuracy of linear and ridge are more or less same because both coefficients values are similar**

In [47]:
print(lasso.score(X_train,y_train))
print(lasso.score(X_test,y_test))

0.7984891304625561
0.7990144821254843


**Here we can see large values for coefficients. Since dimensions are more with less number of data points this model is clearly indicates the overfit**

Regularised Ridge Model

In [48]:
ridge = Ridge(alpha = 0.3)
ridge.fit(X_train,y_train)
print("Ridge Model: ",ridge.coef_)

Ridge Model:  [[-0.12116623  0.27206948 -0.06348698 -0.72273663  0.01350291  0.37311454
   0.12770919]]


In [49]:
y_pred = ridge.predict(X_test)

In [50]:
MAE = mean_absolute_error(y_test,y_pred)
MSE = mean_squared_error(y_test,y_pred)
RMSE = np.sqrt(MSE)
R2 = r2_score(y_test,y_pred)
print("Mean absolute error: ",MAE)
print("Mean squared error: ",MSE)
print("Root Mean squared error: ",RMSE)
print("R2 Score: ",R2)

Mean absolute error:  0.3241779112560453
Mean squared error:  0.18624315840911088
Root Mean squared error:  0.43155898601362813
R2 Score:  0.8189184116059522


**We can see coefficients are reduced. It will reduce close to zero but not exactly equal to zero. The 0 value shown up above is rounded up value**

In [51]:
print(ridge.score(X_train,y_train))
print(ridge.score(X_train,y_train))

0.8205391743818977
0.8205391743818977


**Regularised Lasso Model**

In [52]:
lasso = Lasso(alpha = 0.01)
lasso.fit(X_train,y_train)
print("Lasso model coefficient: ",lasso.coef_)

Lasso model coefficient:  [-0.          0.         -0.01484911 -0.63689305  0.          0.36487876
  0.09814977]


In [53]:
y_pred = lasso.predict(X_test)

In [54]:
MAE = mean_absolute_error(y_test,y_pred)
MSE = mean_squared_error(y_test,y_pred)
RMSE = np.sqrt(MSE)
R2 = r2_score(y_test,y_pred)
print("Mean absolute error: ",MAE)
print("Mean squared error: ",MSE)
print("Root Mean squared error: ",RMSE)
print("R2 Score: ",R2)

Mean absolute error:  0.3247079370344667
Mean squared error:  0.18852758960304447
Root Mean squared error:  0.43419763887318
R2 Score:  0.8166972914708129


**Lasso making many coeficients to zero. As mentioned above these are not used for model building**

In [55]:
print(lasso.score(X_train,y_train))
print(lasso.score(X_test,y_test))

0.8168853065346521
0.8166972914708129
