In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
# read data
df = pd.read_csv('/content/car.csv')
df.head(2)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


In [4]:
df.drop('name', axis=1, inplace=True)
df.head(2)

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,2007,135000,50000,Petrol,Individual,Manual,First Owner


In [5]:
y = df['selling_price']
y

0        60000
1       135000
2       600000
3       250000
4       450000
         ...  
4335    409999
4336    409999
4337    110000
4338    865000
4339    225000
Name: selling_price, Length: 4340, dtype: int64

In [6]:
#One-Hot Encoding,
X = df[['fuel', 'seller_type', 'transmission', 'owner']]
X = pd.get_dummies(data=X, drop_first=True)
X = df[['year','km_driven']].join(X)
X.head()

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007,70000,0,0,0,1,1,0,1,0,0,0,0
1,2007,50000,0,0,0,1,1,0,1,0,0,0,0
2,2012,100000,1,0,0,0,1,0,1,0,0,0,0
3,2017,46000,0,0,0,1,1,0,1,0,0,0,0
4,2014,141000,1,0,0,0,1,0,1,0,1,0,0


In [7]:
reg = LinearRegression().fit(X,y)
print(reg.intercept_)
print(reg.coef_)

-69709014.61437906
[ 3.52569023e+04 -9.59124882e-01  2.86332040e+05 -6.05933534e+05
  4.70046387e+04 -4.24521649e+03 -6.63832691e+04  1.67541023e+05
 -8.70336594e+05 -1.45440549e+03 -4.09321739e+04  1.68678588e+05
 -3.99275885e+04]


## Make a prediction for a car with the following characteristics: 
Diesel, \
seller_type: Individual, \
owner: Second_owner, \
year; 2007, \
km_driven: 100000, \
transmission: manual \

In [8]:
pd.Series(X.columns)

0                             year
1                        km_driven
2                      fuel_Diesel
3                    fuel_Electric
4                         fuel_LPG
5                      fuel_Petrol
6           seller_type_Individual
7     seller_type_Trustmark Dealer
8              transmission_Manual
9       owner_Fourth & Above Owner
10              owner_Second Owner
11            owner_Test Drive Car
12               owner_Third Owner
dtype: object

In [9]:
reg.predict([[1,0,0,0,1,0,1,0,1,0,0,2007,100000]])



array([-3.7247844e+09])

## **Statsmodels** - generating a report about the model

In [10]:
import statsmodels.api as sm

In [11]:
y = df['selling_price']
X = df[['fuel', 'seller_type', 'transmission', 'owner']]
# One-Hot Encoding,
X = pd.get_dummies(data=X, drop_first=True)
X = X.join(df[['km_driven']])

In [12]:
results = sm.OLS(y,X).fit()
results.summary()  

0,1,2,3
Dep. Variable:,selling_price,R-squared (uncentered):,0.64
Model:,OLS,Adj. R-squared (uncentered):,0.639
Method:,Least Squares,F-statistic:,641.1
Date:,"Wed, 03 May 2023",Prob (F-statistic):,0.0
Time:,12:52:15,Log-Likelihood:,-62751.0
No. Observations:,4340,AIC:,125500.0
Df Residuals:,4328,BIC:,125600.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
fuel_Diesel,1.574e+06,2.47e+04,63.680,0.000,1.53e+06,1.62e+06
fuel_Electric,5.511e+05,4.61e+05,1.194,0.232,-3.53e+05,1.46e+06
fuel_LPG,1.227e+06,9.96e+04,12.320,0.000,1.03e+06,1.42e+06
fuel_Petrol,1.219e+06,2.42e+04,50.339,0.000,1.17e+06,1.27e+06
seller_type_Individual,-2.82e+04,1.78e+04,-1.589,0.112,-6.3e+04,6597.202
seller_type_Trustmark Dealer,2.479e+05,4.8e+04,5.164,0.000,1.54e+05,3.42e+05
transmission_Manual,-7.878e+05,2.29e+04,-34.338,0.000,-8.33e+05,-7.43e+05
owner_Fourth & Above Owner,-1.682e+05,5.27e+04,-3.191,0.001,-2.72e+05,-6.49e+04
owner_Second Owner,-1.253e+05,1.72e+04,-7.277,0.000,-1.59e+05,-9.15e+04

0,1,2,3
Omnibus:,4208.135,Durbin-Watson:,1.921
Prob(Omnibus):,0.0,Jarque-Bera (JB):,350158.429
Skew:,4.49,Prob(JB):,0.0
Kurtosis:,46.078,Cond. No.,5340000.0


### Splitting data to train and test dataset