## Imports
** Import pandas, numpy, matplotlib,and seaborn. Then set %matplotlib inline 
(You'll import sklearn as you need it.)**

In [77]:
# Numerical libraries
import numpy as np   

# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score

# to handle data in form of rows and columns 
import pandas as pd    

# importing ploting libraries
import matplotlib.pyplot as plt   

#importing seaborn for statistical plots
import seaborn as sns

## Get the Data

We'll work with the Ecommerce Customers csv file from the company. It has Customer info, suchas Email, Address, and their color Avatar. Then it also has numerical value columns:

* Avg. Session Length: Average session of in-store style advice sessions.
* Time on App: Average time spent on App in minutes
* Time on Website: Average time spent on Website in minutes
* Length of Membership: How many years the customer has been a member. 

** Read in the Ecommerce Customers csv file as a DataFrame called customers.**

In [78]:
mpg_df = pd.read_csv("auto_mpg.csv")

**Check the head of customers, and check out its info() and describe() methods.**

In [79]:
mpg_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [80]:
mpg_df.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,3.0


In [81]:
mpg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [82]:
mpg_df[mpg_df['horsepower'].str.isnumeric()==False]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
32,25.0,4,98.0,?,2046,19.0,71,1,ford pinto
126,21.0,6,200.0,?,2875,17.0,74,1,ford maverick
330,40.9,4,85.0,?,1835,17.3,80,2,renault lecar deluxe
336,23.6,4,140.0,?,2905,14.3,80,1,ford mustang cobra
354,34.5,4,100.0,?,2320,15.8,81,2,renault 18i
374,23.0,4,151.0,?,3035,20.5,82,1,amc concord dl


In [83]:
#We can see there are six values which has ?We can replace with nan values
mpg_df=mpg_df.replace('?',np.nan)

In [84]:
mpg_df[mpg_df['horsepower'].str.isnumeric()==False]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name


In [85]:
mpg_df=mpg_df.drop('car name',axis=1)

In [86]:
mpg_df_cleaned = mpg_df.dropna()
mpg_df_cleaned.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130,3504,12.0,70,1
1,15.0,8,350.0,165,3693,11.5,70,1
2,18.0,8,318.0,150,3436,11.0,70,1
3,16.0,8,304.0,150,3433,12.0,70,1
4,17.0,8,302.0,140,3449,10.5,70,1


In [87]:
mpg_df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    object 
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model year    392 non-null    int64  
 7   origin        392 non-null    int64  
dtypes: float64(3), int64(4), object(1)
memory usage: 27.6+ KB


In [88]:
mpg_df_cleaned.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130,3504,12.0,70,1
1,15.0,8,350.0,165,3693,11.5,70,1
2,18.0,8,318.0,150,3436,11.0,70,1
3,16.0,8,304.0,150,3433,12.0,70,1
4,17.0,8,302.0,140,3449,10.5,70,1


In [89]:
#'mpg' is dependent variable so drop it . Copying rest of the columns to X
X = mpg_df_cleaned.drop('mpg', axis=1)

#Copying the 'mpg' column alone into the y dataframe. This is the dependent variable
y = mpg_df_cleaned[['mpg']]

In [90]:
#Normalize numeric variables - Scales all numeric features so they are comparable (mean = 0, std = 1).
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [91]:
X_scaled = scaler.fit_transform(X)

In [92]:
X_scaled

array([[ 1.48394702,  1.07728956,  0.66413273, ..., -1.285258  ,
        -1.62531533, -0.71664105],
       [ 1.48394702,  1.48873169,  1.57459447, ..., -1.46672362,
        -1.62531533, -0.71664105],
       [ 1.48394702,  1.1825422 ,  1.18439658, ..., -1.64818924,
        -1.62531533, -0.71664105],
       ...,
       [-0.86401356, -0.56847897, -0.53247413, ..., -1.4304305 ,
         1.63640964, -0.71664105],
       [-0.86401356, -0.7120053 , -0.66254009, ...,  1.11008813,
         1.63640964, -0.71664105],
       [-0.86401356, -0.72157372, -0.58450051, ...,  1.40043312,
         1.63640964, -0.71664105]])

In [93]:
y

Unnamed: 0,mpg
0,18.0
1,15.0
2,18.0
3,16.0
4,17.0
...,...
393,27.0
394,44.0
395,32.0
396,28.0


In [94]:
X.columns

Index(['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
       'model year', 'origin'],
      dtype='object')

In [95]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=1)

## Simple Linear Model

In [96]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)

In [97]:
#Get results (coefficients and intercept)
print(f"Intercept: {model.intercept_}")
print(f"Coefficients: {model.coef_}")

Intercept: [23.37330962]
Coefficients: [[-0.35090461  2.05244639 -0.06212273 -6.17819931  0.7766379   2.73277799
   1.11092335]]


## Regularized Ridge Model

In [98]:
ridge = Ridge(alpha=.3) #coefficients are prevented to become too big by this alpha value
ridge.fit(X_train,y_train)
print(f"Intercept: {ridge.intercept_}")
print(f"Coefficients: {ridge.coef_}")

Intercept: [23.37110136]
Coefficients: [-0.32716744  1.94288992 -0.0970747  -6.07655493  0.75382603  2.72294205
  1.10582562]


## Regularized LASSO Model

In [99]:
lasso = Lasso(alpha=0.1)
lasso.fit(X_train,y_train)
print(f"Intercept: {lasso.intercept_}")
print(f"Coefficients: {lasso.coef_}")

Intercept: [23.33668766]
Coefficients: [ 0.          0.         -0.         -4.88114538  0.47086563  2.57204279
  0.82428699]


## Comparing the scores

In [100]:
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

0.8173705034988943
0.8222890437898112


In [101]:
print(ridge.score(X_train, y_train))
print(ridge.score(X_test, y_test))

0.8173538912305489
0.8227310521458298


In [102]:
print(lasso.score(X_train, y_train))
print(lasso.score(X_test, y_test))

0.8126983398241658
0.8237776647772523


In [103]:
##Make predictions
y_pred_linear = model.predict(X_test)
y_pred_ridge = ridge.predict(X_test)
y_pred_lasso = lasso.predict(X_test)

In [104]:
#calculate R squared
r2_linear=r2_score(y_test,y_pred_linear)
r2_ridge=r2_score(y_test,y_pred_ridge)
r2_lasso=r2_score(y_test,y_pred_lasso)

In [105]:
r2_linear

0.8222890437898112

In [106]:
r2_ridge

0.8227310521458298

In [107]:
r2_lasso

0.8237776647772523

In [108]:
## Hence R squared is same as model.score or ridge.score or lasso.score