In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
df = pd.read_csv('ToyotaCorolla - MLR.csv')
df

Unnamed: 0,Price,Age_08_04,KM,Fuel_Type,HP,Automatic,cc,Doors,Cylinders,Gears,Weight
0,13500,23,46986,Diesel,90,0,2000,3,4,5,1165
1,13750,23,72937,Diesel,90,0,2000,3,4,5,1165
2,13950,24,41711,Diesel,90,0,2000,3,4,5,1165
3,14950,26,48000,Diesel,90,0,2000,3,4,5,1165
4,13750,30,38500,Diesel,90,0,2000,3,4,5,1170
...,...,...,...,...,...,...,...,...,...,...,...
1431,7500,69,20544,Petrol,86,0,1300,3,4,5,1025
1432,10845,72,19000,Petrol,86,0,1300,3,4,5,1015
1433,8500,71,17016,Petrol,86,0,1300,3,4,5,1015
1434,7250,70,16916,Petrol,86,0,1300,3,4,5,1015


In [5]:
df.head()

Unnamed: 0,Price,Age_08_04,KM,Fuel_Type,HP,Automatic,cc,Doors,Cylinders,Gears,Weight
0,13500,23,46986,Diesel,90,0,2000,3,4,5,1165
1,13750,23,72937,Diesel,90,0,2000,3,4,5,1165
2,13950,24,41711,Diesel,90,0,2000,3,4,5,1165
3,14950,26,48000,Diesel,90,0,2000,3,4,5,1165
4,13750,30,38500,Diesel,90,0,2000,3,4,5,1170


In [7]:
df.size

15796

In [9]:
df.shape

(1436, 11)

In [11]:
df.ndim

2

In [13]:
df.describe(include='all')

Unnamed: 0,Price,Age_08_04,KM,Fuel_Type,HP,Automatic,cc,Doors,Cylinders,Gears,Weight
count,1436.0,1436.0,1436.0,1436,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0
unique,,,,3,,,,,,,
top,,,,Petrol,,,,,,,
freq,,,,1264,,,,,,,
mean,10730.824513,55.947075,68533.259749,,101.502089,0.05571,1576.85585,4.033426,4.0,5.026462,1072.45961
std,3626.964585,18.599988,37506.448872,,14.98108,0.229441,424.38677,0.952677,0.0,0.18851,52.64112
min,4350.0,1.0,1.0,,69.0,0.0,1300.0,2.0,4.0,3.0,1000.0
25%,8450.0,44.0,43000.0,,90.0,0.0,1400.0,3.0,4.0,5.0,1040.0
50%,9900.0,61.0,63389.5,,110.0,0.0,1600.0,4.0,4.0,5.0,1070.0
75%,11950.0,70.0,87020.75,,110.0,0.0,1600.0,5.0,4.0,5.0,1085.0


# Step 1: EDA

In [16]:
df.isnull().sum()

Price        0
Age_08_04    0
KM           0
Fuel_Type    0
HP           0
Automatic    0
cc           0
Doors        0
Cylinders    0
Gears        0
Weight       0
dtype: int64

In [18]:
df.duplicated().sum()

1

In [20]:
df.drop_duplicates(inplace=True)

In [22]:
df.duplicated().sum()

0

In [24]:
#Convert categorical variables to dummy/indicator variables
df = pd.get_dummies(df, columns=['Fuel_Type'], drop_first=True)

In [26]:
df

Unnamed: 0,Price,Age_08_04,KM,HP,Automatic,cc,Doors,Cylinders,Gears,Weight,Fuel_Type_Diesel,Fuel_Type_Petrol
0,13500,23,46986,90,0,2000,3,4,5,1165,True,False
1,13750,23,72937,90,0,2000,3,4,5,1165,True,False
2,13950,24,41711,90,0,2000,3,4,5,1165,True,False
3,14950,26,48000,90,0,2000,3,4,5,1165,True,False
4,13750,30,38500,90,0,2000,3,4,5,1170,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1431,7500,69,20544,86,0,1300,3,4,5,1025,False,True
1432,10845,72,19000,86,0,1300,3,4,5,1015,False,True
1433,8500,71,17016,86,0,1300,3,4,5,1015,False,True
1434,7250,70,16916,86,0,1300,3,4,5,1015,False,True


# Step 2: Splitting the dataset into training and testing sets

In [29]:
target=df[['Price']]
feature=df.drop('Price',axis=1)

In [31]:
x_train,x_test,y_train,y_test=train_test_split(feature,target,train_size=0.2,random_state=42)

In [33]:
print(x_train)
print(x_test)
print(y_train)
print(y_test)

      Age_08_04      KM   HP  Automatic    cc  Doors  Cylinders  Gears  \
659          66  112000  110          0  1600      3          4      6   
1173         79   95000   86          0  1300      5          4      5   
1020         58   34000  110          0  1600      5          4      5   
1202         73   88186   72          0  2000      3          4      5   
960          66   50806   86          0  1300      3          4      5   
...         ...     ...  ...        ...   ...    ...        ...    ...   
1096         75  125400  110          0  1600      3          4      5   
1131         75  107000  110          0  1600      3          4      5   
1295         80   71500  110          0  1600      4          4      5   
861          65   64630  110          0  1600      5          4      5   
1127         75  109540  110          0  1600      5          4      5   

      Weight  Fuel_Type_Diesel  Fuel_Type_Petrol  
659     1065             False              True  
1173    1

# Step 3: Building multiple linear regression models

In [36]:
models = []

In [38]:
#Model 1: Ordinary Least Squares (OLS) Regression
lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)
models.append(('OLS', lin_reg))

In [40]:
# Model 2: Lasso Regression
model_lasso = Lasso(alpha=0.1)
model_lasso.fit(x_train, y_train)
models.append(('Lasso', model_lasso))

In [42]:
# Model 3: Ridge Regression
model_ridge = Ridge(alpha=1.0)
model_ridge.fit(x_train, y_train)
models.append(('Ridge', model_ridge))

# Step 4:Evaluating the performance of the models

In [45]:
for name, model in models:
    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name} - Mean Squared Error: {mse}, R^2 Score: {r2}")

OLS - Mean Squared Error: 4946773.798224466, R^2 Score: 0.6210380569786085
Lasso - Mean Squared Error: 4928333.080693121, R^2 Score: 0.6224507615880037
Ridge - Mean Squared Error: 3789203.4891925557, R^2 Score: 0.7097170852479135


# Step 5: Applying Lasso and Ridge methods on the model

In [48]:
# Lasso model
lasso=Lasso(alpha=1)

In [50]:
lasso.fit(x_train,y_train)

In [52]:
lasso_coefficient=lasso.coef_

In [54]:
y_predict=lasso.predict(x_test)

In [56]:
r2_score(y_test,y_predict)

0.6349712037344809

In [58]:
print("Lasso Regression Coefficients:")
for feature, coef in zip(feature.columns, lasso_coefficient):
    print(f"{feature}: {coef}")

Lasso Regression Coefficients:
Age_08_04: -133.17471287851316
KM: -0.014504606132660415
HP: 70.39730143086189
Automatic: 1449.65003752114
cc: -3.9240910832214637
Doors: 150.1196999249192
Cylinders: 0.0
Gears: 1009.3102555776131
Weight: 10.950323664352318
Fuel_Type_Diesel: 3143.776389624027
Fuel_Type_Petrol: 847.0187707507896


In [60]:
# Ridge model
ridge=Ridge(alpha=1)

In [62]:
ridge.fit(x_train,y_train)

In [64]:
ridge_coefficient=ridge.coef_

In [66]:
# to check accuracy
y_predict1=ridge.predict(x_test)

In [68]:
r2_score(y_test,y_predict1)

0.7097170852479135

# Interview Questions

# 1. Normalization & Standardization
 Normalization scales the features to a range between 0 and 1, whereas standardization scales the features so that they have mean 0 and standard deviation 1. # Both techniques help in improving the performance and convergence of machine learning algorithms, especially those based on gradient descent.
 
# 2. Techniques to address multicollinearity
 - Remove one of the correlated variables.
 - Combine the correlated variables into a single feature.
 - Use dimensionality reduction techniques like Principal Component Analysis (PCA).
 - Regularization techniques like Lasso and Ridge regression.

 # Assumptions made during the analysis:
  - The dataset is assumed to be representative of the population.
  - There are no significant outliers in the dataset.
  - The relationship between the independent variables and the dependent variable is linear.

    # Implications of assumptions:
 Violation of these assumptions may lead to biased estimates and incorrect inferences. It's important to validate these assumptions before interpreting the results.