## Linear Regression

### Predicting price of pre-owned cars

In [12]:
# Importing necessary packages

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [13]:
# Load the sampled cars dataset; the raw frame stays untouched and all
# cleaning below is done on a separate copy.
cars_data = pd.read_csv(filepath_or_buffer="cars_sampled.csv")
cars = cars_data.copy(deep=True)  # deep copy: edits to `cars` never reach `cars_data`

In [14]:
print(cars.shape) # (rows, columns) of the working copy

(50001, 19)


In [15]:
# Basic info: column names, non-null counts and dtypes.
# Columns whose non-null count is below the row count contain missing values.
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50001 entries, 0 to 50000
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   dateCrawled          50001 non-null  object
 1   name                 50001 non-null  object
 2   seller               50001 non-null  object
 3   offerType            50001 non-null  object
 4   price                50001 non-null  int64 
 5   abtest               50001 non-null  object
 6   vehicleType          44813 non-null  object
 7   yearOfRegistration   50001 non-null  int64 
 8   gearbox              47177 non-null  object
 9   powerPS              50001 non-null  int64 
 10  model                47243 non-null  object
 11  kilometer            50001 non-null  int64 
 12  monthOfRegistration  50001 non-null  int64 
 13  fuelType             45498 non-null  object
 14  brand                50001 non-null  object
 15  notRepairedDamage    40285 non-null  object
 16  date

In [16]:
# Setting options for legible output display
def _three_decimals(value):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % value


pd.set_option('display.float_format', _three_decimals)

# Allow up to 500 columns / rows to be shown before pandas truncates the display
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, 500)

# Summary statistics of the numeric columns
cars.describe()

Unnamed: 0,price,yearOfRegistration,powerPS,kilometer,monthOfRegistration,postalCode
count,50001.0,50001.0,50001.0,50001.0,50001.0,50001.0
mean,6559.865,2005.544,116.496,125613.688,5.744,50775.217
std,85818.47,122.992,230.568,40205.234,3.711,25743.702
min,0.0,1000.0,0.0,5000.0,0.0,1067.0
25%,1150.0,1999.0,69.0,125000.0,3.0,30559.0
50%,2950.0,2003.0,105.0,150000.0,6.0,49504.0
75%,7190.0,2008.0,150.0,150000.0,9.0,71404.0
max,12345678.0,9999.0,19312.0,150000.0,12.0,99998.0


In [17]:
# Preview the first five rows of the data before any cleaning
cars.head()

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,postalCode,lastSeen
0,30/03/2016 13:51,Zu_verkaufen,private,offer,4450,test,limousine,2003,manual,150,3er,150000,3,diesel,bmw,,30/03/2016 00:00,20257,07/04/2016 04:44
1,07/03/2016 09:54,Volvo_XC90_2.4D_Summum,private,offer,13299,control,suv,2005,manual,163,xc_reihe,150000,6,diesel,volvo,no,07/03/2016 00:00,88045,26/03/2016 13:17
2,01/04/2016 00:57,Volkswagen_Touran,private,offer,3200,test,bus,2003,manual,101,touran,150000,11,diesel,volkswagen,,31/03/2016 00:00,27449,01/04/2016 08:40
3,19/03/2016 17:50,Seat_Ibiza_1.4_16V_Reference,private,offer,4500,control,small car,2006,manual,86,ibiza,60000,12,petrol,seat,no,19/03/2016 00:00,34537,07/04/2016 04:44
4,16/03/2016 14:51,Volvo_XC90_D5_Aut._RDesign_R_Design_AWD_GSHD_S...,private,offer,18750,test,suv,2008,automatic,185,xc_reihe,150000,11,diesel,volvo,no,16/03/2016 00:00,55270,01/04/2016 23:18


In [18]:
# Dropping unwanted columns: the listing name, the crawl / creation / last-seen
# timestamps and the postal code are not used by the models below.
col = [
    'name',
    'dateCrawled',
    'dateCreated',
    'postalCode',
    'lastSeen',
]
# `columns=` already identifies the axis, so no separate `axis` argument is needed
cars = cars.drop(columns=col)

In [19]:
# Count the missing entries in each remaining column
missing_per_column = cars.isna().sum()

print("Column-wise missing values\n")
print(missing_per_column)

Column-wise missing values

seller                    0
offerType                 0
price                     0
abtest                    0
vehicleType            5188
yearOfRegistration        0
gearbox                2824
powerPS                   0
model                  2758
kilometer                 0
monthOfRegistration       0
fuelType               4503
brand                     0
notRepairedDamage      9716
dtype: int64


### Data Cleaning

In [20]:
# Working range of data: keep only rows with plausible values.
# The summary statistics above show obvious junk (registration year 1000 / 9999,
# price 0 / 12345678, power 0 / 19312), so each variable is limited to a range.
#   - Series.between() is inclusive on both ends, i.e. equivalent to >= low and <= high.
#   - .copy() makes the result an independent frame, so the column assignments
#     done in later cells do not trigger pandas' SettingWithCopyWarning.
cars = cars[
    cars['yearOfRegistration'].between(1950, 2016)
    & cars['price'].between(100, 150000)
    & cars['powerPS'].between(10, 500)
].copy()

In [21]:
cars.shape # (rows, columns) remaining after the working-range filter

(41785, 14)

In [22]:
# Preview the first five rows of the filtered data
cars.head()

Unnamed: 0,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage
0,private,offer,4450,test,limousine,2003,manual,150,3er,150000,3,diesel,bmw,
1,private,offer,13299,control,suv,2005,manual,163,xc_reihe,150000,6,diesel,volvo,no
2,private,offer,3200,test,bus,2003,manual,101,touran,150000,11,diesel,volkswagen,
3,private,offer,4500,control,small car,2006,manual,86,ibiza,60000,12,petrol,seat,no
4,private,offer,18750,test,suv,2008,automatic,185,xc_reihe,150000,11,diesel,volvo,no


In [23]:
# Further simplification - variable reduction:
# combine yearOfRegistration and monthOfRegistration into a single Age feature.

# Express the registration month as a fraction of a year. This is kept in a
# local variable instead of overwriting the column, so re-running the cell
# always produces the same Age (the in-place `/= 12` divided again on every run).
month_fraction = cars['monthOfRegistration'] / 12

# Creating new variable Age (in years, relative to 2016):
# whole years since the registration year plus the month fraction, rounded to 2 decimals.
# NOTE(review): adding the month fraction makes a car registered late in a year
# look older than one registered early in the same year; subtracting may be what
# was intended. Also, monthOfRegistration has a minimum of 0 in the summary
# above - presumably "unknown"; confirm. The formula is left unchanged because
# the statistics below and the models depend on it.
cars['Age'] = ((2016 - cars['yearOfRegistration']) + month_fraction).round(2)

cars['Age'].describe()

count   41785.000
mean       13.314
std         6.742
min         0.000
25%         8.750
50%        13.080
75%        17.330
max        65.750
Name: Age, dtype: float64

In [24]:
# Dropping yearOfRegistration and monthOfRegistration
# Reassigning (rather than inplace=True) avoids mutating a frame that may be a
# slice of another one; errors='ignore' skips columns that are already gone,
# so the cell can be re-run without raising a KeyError.
cars = cars.drop(columns=['yearOfRegistration', 'monthOfRegistration'], errors='ignore')

In [31]:
# Share of each fuel type among the cars (the diesel share is one of the rows).
# normalize=True converts the counts into proportions of the total.
pd.crosstab(index=cars['fuelType'], columns='count', normalize=True)

col_0,count
fuelType,Unnamed: 1_level_1
cng,0.002
diesel,0.323
electro,0.0
hybrid,0.001
lpg,0.017
other,0.0
petrol,0.657


### We are going to build a Linear Regression model on data obtained by omitting rows with any missing value

In [27]:
# Omitting missing values: a row is removed as soon as any of its columns is missing
cars_omit = cars.dropna(axis='index', how='any')

### Linear Regression

  - Model - price ~ powerPS + kilometer + Age

In [None]:
# Separating input and output features for the first model.
# filter() returns DataFrames, so y1 is a one-column frame rather than a Series.
predictor_names = ['powerPS', 'kilometer', 'Age']

x1 = cars_omit.filter(items=predictor_names, axis='columns')
y1 = cars_omit.filter(items=['price'], axis='columns')

In [None]:
# Splitting data into test and train:
# 30% of the rows are held out for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    x1,
    y1,
    test_size=0.3,
    random_state=3,
)

# Shapes of the four parts, in the order they were returned
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Create the linear regression estimator
model = LinearRegression()

# Estimate its parameters from the training split only
model.fit(X=X_train, y=y_train)

In [None]:
# Predicted prices for the held-out test rows
Y_pred = model.predict(X=X_test)

In [None]:
# Score the test-set predictions of the first model:
# mean squared error and the coefficient of determination (R-squared)
mse = mean_squared_error(y_true=y_test, y_pred=Y_pred)
r2 = r2_score(y_true=y_test, y_pred=Y_pred)

# Print the evaluation metrics
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

In [None]:
# Fitted parameters of the current `model`: the intercept and the coefficients
print(f"Intercept: {model.intercept_}", f"Coefficients: {model.coef_}", sep="\n")

In [None]:
# Inputs for the second model: the three numeric predictors plus four categorical ones.
# get_dummies() one-hot encodes the categorical columns; drop_first=True leaves out
# the first level of each category.
x2 = pd.get_dummies(
    cars_omit.filter(
        items=['powerPS', 'kilometer', 'Age', 'vehicleType', 'gearbox', 'fuelType', 'notRepairedDamage'],
        axis='columns',
    ),
    drop_first=True,
)

# Target: price, kept as a one-column DataFrame
y2 = cars_omit.filter(items=['price'], axis='columns')

In [None]:
# Inspect the column names produced by the dummy encoding
x2.columns

In [None]:
# Splitting data into test and train (30% held out, fixed seed) for the dummy-encoded features
X2_train, X2_test, y2_train, y2_test = train_test_split(
    x2,
    y2,
    test_size=0.3,
    random_state=3,
)

# Shapes of the four parts, in the order they were returned
print(*(part.shape for part in (X2_train, X2_test, y2_train, y2_test)))

In [None]:
# Create a fresh estimator for the second model (this rebinds the name `model`)
model = LinearRegression()

# Estimate its parameters from the second training split
model.fit(X=X2_train, y=y2_train)

In [None]:
# Predicted prices for the held-out rows of the second split
Y2_pred = model.predict(X=X2_test)

In [None]:
# Evaluate the second model on its own hold-out data.
# The targets and predictions must be y2_test / Y2_pred; scoring y_test / Y_pred
# here would simply repeat the first model's metrics.

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y2_test, Y2_pred)

# Calculate R-squared
r2 = r2_score(y2_test, Y2_pred)

# Print the evaluation metrics
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")