## Pack Import

In [1]:
import pandas as pd

## Data Import

In [2]:
df = pd.read_csv('car_prices.csv')

In [3]:
len(df)

558837

In [3]:
df.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg566472,ca,5.0,16639.0,white,black,kia motors america inc,20500.0,21500.0,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
1,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg561319,ca,5.0,9393.0,white,beige,kia motors america inc,20800.0,21500.0,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,wba3c1c51ek116351,ca,45.0,1331.0,gray,black,financial services remarketing (lease),31900.0,30000.0,Thu Jan 15 2015 04:30:00 GMT-0800 (PST)
3,2015,Volvo,S60,T5,Sedan,automatic,yv1612tb4f1310987,ca,41.0,14282.0,white,black,volvo na rep/world omni,27500.0,27750.0,Thu Jan 29 2015 04:30:00 GMT-0800 (PST)
4,2014,BMW,6 Series Gran Coupe,650i,Sedan,automatic,wba6b2c57ed129731,ca,43.0,2641.0,gray,black,financial services remarketing (lease),66000.0,67000.0,Thu Dec 18 2014 12:30:00 GMT-0800 (PST)


## Data Cleaning

#### Drop irrelevant data of price prediction

In [4]:
#Drop car vin number, dealer, going market price, and sale date.
df = df.drop(columns = ['vin', 'seller', 'mmr', 'saledate'])

In [5]:
df.isnull().sum()

year                0
make            10301
model           10399
trim            10651
body            13195
transmission    65352
state               0
condition       11820
odometer           94
color             749
interior          749
sellingprice       12
dtype: int64

In [7]:
df = df.dropna()

In [9]:
max(df['condition'])

49.0

In [7]:
df.isnull().sum()

year            0
make            0
model           0
trim            0
body            0
transmission    0
state           0
condition       0
odometer        0
color           0
interior        0
sellingprice    0
dtype: int64

## Linear Regression Model

#### Package Import

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Lasso, Ridge

#### Simple Linear Regression

In [20]:
dfLin = df.copy()

X = dfLin[['year', 'odometer', 'condition']]
y = dfLin['sellingprice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

LRmod = LinearRegression()
LRmod.fit(X_train, y_train)

y_pred = LRmod.predict(X_test)

mseLRmod = mean_squared_error(y_test, y_pred)
r2LRmod = r2_score(y_test, y_pred)
print("MSE: ", mseLRmod)
print("R^2: ", r2LRmod)

MSE:  55328068.27041069
R^2:  0.39415350375946356


#### Simple Linear Reg with Lasso and Ridge

In [21]:
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)
print("Lasso MSE: ", mse_lasso)
print("Lasso R^2: ", r2_lasso)

ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)
print("Ridge MSE: ", mse_ridge)
print("Ridge R^2: ", r2_ridge)

Lasso MSE:  55328067.85787751
Lasso R^2:  0.3941535082767329
Ridge MSE:  55328068.26192318
Ridge R^2:  0.39415350385240244


#### Linear Regression with One-Hot Encoding On All Features

In [18]:
dfLin = df.copy()

dfLin = pd.get_dummies(
    dfLin, 
    columns = ['make', 'model', 'trim', 'body', 'transmission', 'state', 'color', 'interior']
)

X = dfLin.drop('sellingprice', axis=1)
y = dfLin['sellingprice']

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2)

LRmod = LinearRegression()
LRmod.fit(X_train, y_train)

y_pred = LRmod.predict(X_test)

mseLRmod = mean_squared_error(y_test, y_pred)
r2LRmod = r2_score(y_test, y_pred)
print("MSE: ", mseLRmod)
print("R^2: ", r2LRmod)

MSE:  4.390523369385321e+34
R^2:  -4.796338115317337e+26


## Decision Tree

#### Package Import

In [11]:
from sklearn.tree import DecisionTreeRegressor

In [12]:
dfDT = df.copy()

dfDT = pd.get_dummies(
    dfDT, 
    columns=['make', 'model', 'trim', 'body', 'transmission', 'state', 'color', 'interior']
)

X = dfDT.drop('sellingprice', axis=1)
y = dfDT['sellingprice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

DTmod = DecisionTreeRegressor(random_state = 4)
DTmod.fit(X_train, y_train)

y_pred = DTmod.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("MSE: ", mse)

MSE:  7703670.8518287195


## Random Forest

#### Package Import

In [13]:
from sklearn.ensemble import RandomForestRegressor

In [16]:
dfRF = df.copy()

dfRF = pd.get_dummies(
    dfRF, 
    columns=['make', 'model', 'trim', 'body', 'transmission', 'state', 'color', 'interior']
)

X = dfRF.drop('sellingprice', axis=1)
y = dfRF['sellingprice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

RFmod = RandomForestRegressor(n_estimators=20, random_state=3)
RFmod.fit(X_train, y_train)

y_pred = RFmod.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print("MSE: ", mse)

MSE:  4902754.196132467
